In [None]:
! pip install tldextract

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict,Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from tldextract import extract
import re

In [None]:
#nltk libraries
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the NLTK stop-word corpus that stopwords.words() reads below.
nltk.download('stopwords')
# English stop words as a set; used further down to filter the most-common-word plots.
stop=set(stopwords.words('english'))

In [None]:
# Load the train and test CSVs; both files are decoded as ISO-8859-1.
train_df = pd.read_csv('../input/zsdataset/train.csv', encoding = 'ISO-8859-1')
test_df = pd.read_csv('../input/zsdataset/test.csv', encoding = 'ISO-8859-1')

Take a look at the number of rows and columns in each dataset

In [None]:
# Report (rows, columns) for each split.
print('The shape of training dataset is = {}'.format(train_df.shape))
print('The shape of testing dataset is = {}'.format(test_df.shape))

Exploring the training and testing datasets

In [None]:
# Column names, dtypes and non-null counts of the training dataset.
train_df.info()

In [None]:
# Column names, dtypes and non-null counts of the testing dataset.
test_df.info()

Drop the duplicate values in training dataset

In [None]:
# Remove exact duplicate rows, then show the resulting (rows, columns).
train_df = train_df.drop_duplicates()
train_df.shape

Check the last column of the test dataframe, 'Unnamed: 9': per the test dataset info above, it has only 1 non-null value, so it adds no information and there is no harm in dropping it. The 'Index' column in test_df just holds the row number, so for now we will remove it as well.

In [None]:
# Peek at the nearly-empty column before removing it. `filter` (unlike
# `test_df[[...]]`) does not raise when the column is already gone, so the
# cell can be executed more than once.
print(test_df.filter(items=['Unnamed: 9']).head())
# Reassign instead of using `inplace=True`, and ignore columns that were
# already dropped, so a second execution is a no-op rather than a KeyError.
test_df = test_df.drop(columns=['Index', 'Unnamed: 9'], errors='ignore')
test_df.head()

In [None]:
# Re-check the test dataset's columns after the drop above.
test_df.info()

Now let's look at NULL and NaN values. Before that, we combine the train and test datasets; to do so, we add a column named 'Patient_Tag' to the test dataset and fill it with -1.

In [None]:
# Work on a copy so test_df itself does not gain a label column.
test_df_1 = test_df.copy()
# -1 is a placeholder label marking the rows that come from the test set.
test_df_1['Patient_Tag'] = -1
# Stack train and test rows into one frame for the missing-value analysis.
df = pd.concat([train_df, test_df_1])

In [None]:
# Missing-value count per column, smallest first.
null = df.isnull().sum().sort_values(ascending =True)
# Percentage of rows missing per column, largest first.
percentage_missing = ((df.isnull().sum()/df.shape[0])*100).sort_values(ascending= False)
percentage_missing

In [None]:
# Put counts and percentages side by side in one table, one row per column name.
missing_data = pd.concat([null,percentage_missing],axis = 1,keys = ['Total missing', 'Percent missing'])
missing_data

Let's check a heatmap of the NULL values. Also note that one value of TRANS_CONV_TEXT is missing.

In [None]:
# Draw df.isnull() as a heatmap; row tick labels are suppressed.
sns.heatmap(df.isnull(), yticklabels=False)

Checkout the Patient Tag

In [None]:
# Number of training rows per Patient_Tag value.
train_df.Patient_Tag.value_counts()

In [None]:
# Count of TRANS_CONV_TEXT entries per Patient_Tag, largest class first.
class_df = train_df.groupby('Patient_Tag').count()['TRANS_CONV_TEXT'].reset_index().sort_values(by = 'TRANS_CONV_TEXT', ascending = False)
# Render the table with a colour gradient.
class_df.style.background_gradient(cmap = 'winter')

In [None]:
# Slice sizes and slice labels both come from the per-class counts in class_df.
percent_class=class_df.TRANS_CONV_TEXT
labels= class_df.Patient_Tag

# One colour per slice.
colors = ['#25C38B','#F92725']

# With autopct set, plt.pie returns three lists; only the first (the wedges) is kept.
pie,_,_ = plt.pie(percent_class,radius = 1.0,labels=labels,colors=colors,autopct="%.1f%%")
# Set the wedge width and edge colour on every wedge.
plt.setp(pie, width=0.6, edgecolor='grey') 
plt.show()

Let's explore the text data

First we will clean all the text data

In [None]:
def clean_text(x):
    """Lower-case `x` and collapse every run of characters other than a-z or
    an apostrophe into a single space.

    Args:
        x: the text to normalise (a str).

    Returns:
        The normalised string.
    """
    return re.sub(r"[^a-z']+", ' ', x.lower())
# Missing texts become empty strings before the string cast. `astype(str)`
# alone would turn NaN into the literal word "nan", which would then be
# counted as a real token in the length and word-frequency statistics.
train_df['TRANS_CONV_TEXT'] = (
    train_df['TRANS_CONV_TEXT'].fillna('').astype(str).apply(clean_text)
)
test_df['TRANS_CONV_TEXT'] = (
    test_df['TRANS_CONV_TEXT'].fillna('').astype(str).apply(clean_text)
)

In [None]:
# NOTE(review): `df` was concatenated before clean_text was applied to train_df/test_df,
# so this displays the raw text; look at train_df['TRANS_CONV_TEXT'] to see the cleaned column.
df['TRANS_CONV_TEXT']

In [None]:
# Two side-by-side histograms of text length in characters, one per class.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))

# Character count of each text tagged 0.
text_len=train_df[train_df['Patient_Tag']== 0]['TRANS_CONV_TEXT'].str.len()
ax1.hist(text_len,color='#25C38B')
ax1.set_title('Patient Tag 0')

# Character count of each text tagged 1 (text_len is reused for the second panel).
text_len=train_df[train_df['Patient_Tag']== 1]['TRANS_CONV_TEXT'].str.len()
ax2.hist(text_len,color='#F92725')
ax2.set_title('Patient Tag 1')

fig.suptitle('Characters in text')
plt.show()

The histograms show that texts with Patient Tag 0 are up to about 16,000 characters long, whereas texts with Patient Tag 1 are up to about 12,000 characters long.

Let's check number of words in Text

In [None]:
# Separate copy with the text duplicated into a 'text' column; create_corpus reads this frame.
train_df_1 = train_df.copy()
train_df_1['text'] = train_df_1.TRANS_CONV_TEXT
train_df_1["text"] = train_df_1["text"].astype(str)

# Two side-by-side histograms of text length in words, one per class.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))

# Whitespace-split each text tagged 0 and count the tokens.
text_len=train_df_1[train_df_1['Patient_Tag']==0]['text'].str.split().map(lambda x: len(x))
ax1.hist(text_len,color='#17C37B')
ax1.set_title('Patient Tag 0')

# Same for texts tagged 1.
text_len=train_df_1[train_df_1['Patient_Tag']==1]['text'].str.split().map(lambda x: len(x))
ax2.hist(text_len,color='#F92969')
ax2.set_title('Patient Tag 1')

fig.suptitle('Words in Text')
plt.show()

Let's check some most common words in text

In [None]:
def create_corpus(target):
    """Return every whitespace-separated token of the texts in `train_df_1`
    whose Patient_Tag equals `target`, as one flat list in row order.

    Args:
        target: the Patient_Tag value to select.

    Returns:
        list of str tokens (duplicates kept).
    """
    token_lists = train_df_1.loc[train_df_1['Patient_Tag'] == target, 'text'].str.split()
    return [token for tokens in token_lists for token in tokens]

In [None]:
# Inspect the stop-word set; these words are skipped when the most common words are plotted below.
np.array(stop)

In [None]:
# Token frequencies for Patient_Tag 0, most frequent first.
corpus=create_corpus(0)
counter=Counter(corpus)
most=counter.most_common()
x=[]
y=[]
# Only the 50 most frequent tokens are examined, and stop words among them are
# skipped, so the chart can show fewer than 50 bars.
for word,count in most[:50]:
    if (word.lower() not in stop) :
        x.append(word)
        y.append(count)
# Horizontal bars: counts on the x-axis, words on the y-axis.
sns.barplot(x=y,y=x)

In [None]:
# Token frequencies for Patient_Tag 1, most frequent first.
corpus=create_corpus(1)
counter=Counter(corpus)
most=counter.most_common()
x=[]
y=[]
# Only the 50 most frequent tokens are examined, and stop words among them are
# skipped, so the chart can show fewer than 50 bars.
for word,count in most[:50]:
    if (word.lower() not in stop) :
        x.append(word)
        y.append(count)
# Horizontal bars: counts on the x-axis, words on the y-axis.
sns.barplot(x=y,y=x)

It seems it's all about heart, blood, failure and risks. : -p

Let's make some word cloud

In [None]:
# Separate copy with the text duplicated into a string-typed 'text' column.
train_df_2 = train_df.copy()
train_df_2['text'] = train_df_2.TRANS_CONV_TEXT.astype(str)

# One sub-frame per class.
train_df_2_0 = train_df_2[train_df_2["Patient_Tag"]==0]
train_df_2_1 = train_df_2[train_df_2["Patient_Tag"]==1]

# Stop words shipped with the wordcloud package.
# NOTE(review): this rebinds `stopwords`, which the nltk import cell bound to
# the nltk.corpus module; re-running that earlier cell after this one fails.
# The name is kept because later cells may rely on it — consider renaming.
stopwords = set(STOPWORDS)


def _joined_lowercase_text(texts):
    """Return all `texts` lower-cased, whitespace-normalised and space-joined.

    Each text contributes its tokens followed by one space, exactly like the
    former per-row loop, but built with a single join instead of repeated
    string concatenation.
    """
    return " ".join(" ".join(str(val).lower().split()) for val in texts) + " "


def _build_wordcloud(words, colormap):
    """Generate an 800x800 white-background WordCloud of `words` in `colormap`."""
    return WordCloud(width = 800, height = 800,
                     background_color ='white',
                     colormap=colormap,
                     stopwords = stopwords,
                     min_font_size = 10).generate(words)


def _show_wordcloud(axis, cloud, title):
    """Draw `cloud` on `axis` without axes decorations and with `title`."""
    axis.imshow(cloud)
    axis.axis('off')
    axis.set_title(title, fontsize=35)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[20, 10])

# Left panel: Patient_Tag 0.
comment_words = _joined_lowercase_text(train_df_2_0.text)
wordcloud1 = _build_wordcloud(comment_words, "Greens")
_show_wordcloud(ax1, wordcloud1, 'Patient_Tag 0')

# Right panel: Patient_Tag 1.
comment_words = _joined_lowercase_text(train_df_2_1.text)
wordcloud2 = _build_wordcloud(comment_words, "Reds")
_show_wordcloud(ax2, wordcloud2, 'Patient_Tag 1')


Now let's explore, sources

In [None]:
# Snapshot of train_df taken before the Source labels are normalised below.
train_df_3 = train_df.copy()
# Raw Source labels and their frequencies in the training set.
train_df_3['Source'].value_counts()

In [None]:
# Raw Source labels and their frequencies in the test set.
test_df['Source'].value_counts()

Normalise the source labels so that 'Facebook' and 'FACEBOOK' are treated as the same value in both the train and test datasets

In [None]:
def classes_def(x):
    """Map a raw Source value to its canonical label.

    Matching is case-insensitive, so values that are already canonical
    ('Forums', 'Blog', 'Youtube') map to themselves. The former exact
    upper-case match sent them to the fallback, which meant that running the
    normalisation cell a second time relabelled every row as 'Facebook'.

    Args:
        x: raw Source value; normally a str, possibly NaN.

    Returns:
        One of 'Facebook', 'Forums', 'Blog', 'Youtube'. Anything that is not
        recognised (including non-string values such as NaN) falls back to
        'Facebook', as before.
    """
    canonical = {
        'FACEBOOK': 'Facebook',
        'FORUMS': 'Forums',
        'BLOG': 'Blog',
        'YOUTUBE': 'Youtube',
    }
    # Non-strings cannot be upper-cased and were never matched by the original
    # comparisons either, so they go straight to the fallback.
    if not isinstance(x, str):
        return 'Facebook'
    return canonical.get(x.upper(), 'Facebook')
# Normalise the Source labels of both splits with classes_def.
train_df['Source'] = train_df['Source'].apply(classes_def)
test_df['Source'] = test_df['Source'].apply(classes_def)

In [None]:
# Frequencies of the normalised Source labels in the training set.
source_data = train_df['Source'].value_counts()
source_data

In [None]:
fig, ax = plt.subplots(1,1, figsize=(7, 6))
# Rows per Source label in the training set.
source_data = train_df['Source'].value_counts()
ax.bar(source_data.index, source_data, width = 0.35, edgecolor = 'white',linewidth=0.3,color = '#A1D539')
# Write each count slightly above its bar.
for i in source_data.index:
    ax.annotate(f"{source_data[i]}",
                xy=(i, source_data[i] + 10),
                va='center', ha='center', fontweight='heavy',
                # 'roman' is not a font family matplotlib knows (it only
                # triggers a findfont fallback warning); use 'serif', the
                # family the host bar charts in this notebook already use.
                fontfamily='serif',
                color='#F92725')

# Hide the top and right borders of the plot.
for s in ['top','right']:
    ax.spines[s].set_visible(False)

Now look at the hosts

As we saw earlier, there are null values in Host but none in Link. So wherever Host is null we copy the link into it and then extract the host name from it. We do this in both the train and test datasets.

In [None]:
# Where Host is missing, fall back to the row's Link (the host name is
# extracted from it afterwards). The result is assigned back to the column:
# calling fillna(..., inplace=True) on `frame["Host"]` operates on an
# intermediate Series, which recent pandas versions warn about and which
# leaves the DataFrame unchanged under Copy-on-Write.
train_df["Host"] = train_df["Host"].fillna(train_df["Link"])
test_df["Host"] = test_df["Host"].fillna(test_df["Link"])

In [None]:
# Extracts the host name from a link or host string.
def extract_url(x):
    """Return only the domain part of a URL or host name.

    tldextract splits a host such as ``abc.hostname.com`` into subdomain
    (``abc``), domain (``hostname``) and suffix (``com``); this returns the
    domain part.

    Args:
        x: URL or host string.

    Returns:
        The domain label as a str (an empty string when tldextract finds none).
    """
    # Read the field by name rather than unpacking into exactly three values:
    # newer tldextract releases changed ExtractResult so that three-way
    # unpacking no longer works, and the install cell does not pin a version.
    return extract(x).domain

In [None]:
# Reduce every training Host value (host name or full link) to its domain label.
train_df['Host']=train_df['Host'].apply(lambda x:extract_url(x))
# Rows per host, most frequent first.
train_df['Host'].value_counts().sort_values(ascending= False)

We see that one entry, with 65 rows, is an empty string; if we check the data, these rows come from YouTube. So we will manually set their host name to 'youtube'.

In [None]:
# Reduce every test Host value (host name or full link) to its domain label.
test_df['Host']=test_df['Host'].apply(lambda x:extract_url(x))
# Rows per host, most frequent first.
test_df['Host'].value_counts().sort_values(ascending= False)

In [None]:
# Training rows whose extracted host is an empty string get the host name 'youtube'.
# NOTE(review): only train_df is patched here; test_df is left as-is — confirm that is intended.
train_df.loc[train_df.Host == '' , 'Host'] = 'youtube'

In [None]:
# Ten most frequent hosts in the training set.
host = train_df['Host'].value_counts().nlargest(n=10)
fig, ax = plt.subplots(1,1, figsize=(15, 4))

ax.bar(host.index, host, width = 0.5, edgecolor = 'white',linewidth=0.3)
# Write each count slightly above its bar.
for i in host.index:
    ax.annotate(f"{host[i]}", 
                   xy=(i, host[i] + 5),
                   va = 'center', ha='center',fontweight='heavy', fontfamily='serif',
                   color='#0B0B0B')
    
# Hide all four borders of the plot.
for s in ['top','left','right','bottom']:
    ax.spines[s].set_visible(False)

In [None]:
# Ten most frequent hosts in the test set.
host_test = test_df['Host'].value_counts().nlargest(n=10)
fig, ax = plt.subplots(1,1, figsize=(15, 5))

# Bar heights must come from the test counts. The previous version passed the
# training Series `host` here, so test host names were drawn with training
# frequencies and the annotations did not match the bars.
ax.bar(host_test.index, host_test, width = 0.5, edgecolor = 'white',linewidth=0.3, color = '#EAA1F4')
# Write each count slightly above its bar.
for i in host_test.index:
    ax.annotate(f"{host_test[i]}",
                xy=(i, host_test[i] + 10),
                va='center', ha='center', fontweight='heavy', fontfamily='serif',
                color='#0B0B0B')

# Hide all four borders of the plot.
for s in ['top','left','right','bottom']:
    ax.spines[s].set_visible(False)

Also, there is no point in keeping the links in the data, as we already keep the host name in the dataset. Keeping the URL would not add any information.

In [None]:
# Link is no longer needed once Host has been derived from it.
train_df = train_df.drop(['Link'], axis=1)
test_df = test_df.drop(['Link'],axis = 1)

Now look at date and time

The Eastern Time Zone (ET) is an area 5 hours behind Greenwich Mean Time (GMT-5) during the winter months (referred to as Eastern Standard Time or EST) and 4 hours behind Greenwich Mean Time (GMT-4) during the summer months (referred to as Eastern Daylight Time or EDT).

In [None]:
# View the three date/time columns of the training set side by side.
date_time = train_df[['Date(ET)','Time(ET)','time(GMT)']]
date_time

As can be seen, ET is 5 hours behind GMT, so the Time(ET) and time(GMT) columns describe the same moment. There is no point in keeping both columns, so we will remove time(GMT). Another reason to remove time(GMT) is that it has 252 missing values.

In [None]:
# Drop the GMT time column from both splits; Time(ET) is kept.
train_df = train_df.drop(['time(GMT)'], axis=1)
test_df = test_df.drop(['time(GMT)'],axis = 1)

In [None]:
# Parse the training dates into datetime values; no explicit format is given, so pandas infers it.
# NOTE(review): only train_df is converted here; test_df['Date(ET)'] keeps its original dtype — confirm that is intended.
train_df['Date(ET)'] = pd.to_datetime(train_df['Date(ET)'])

In [None]:
# Parse the time strings and keep only the time-of-day part.
# errors='coerce' turns values that cannot be parsed into missing values instead of raising.
train_df['Time(ET)'] = pd.to_datetime(train_df['Time(ET)'],errors='coerce').dt.time
test_df['Time(ET)'] = pd.to_datetime(test_df['Time(ET)'],errors='coerce').dt.time

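Because of errors='coerce', values that are not valid times (such as 0.87 or 0.47) are converted to NaT. The plan is to put the median time in their place; for the exploratory plots below, rows with missing values are simply dropped.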

In [None]:
# Number of test rows whose time is missing after the conversion.
test_df['Time(ET)'].isna().sum()

In [None]:
# Number of training rows whose time is missing after the conversion.
train_df['Time(ET)'].isna().sum()

In [None]:
# Temporary frames that keep only rows without any missing value (this also
# removes the rows whose time could not be parsed). `.copy()` makes them
# independent of train_df/test_df, so adding the 'hour' column afterwards
# does not raise SettingWithCopyWarning.
temp_train_df = train_df.dropna().copy()
temp_test_df = test_df.dropna().copy()

Finally, we can add one more feature derived from the date and time.

In [None]:
# Time(ET) already holds datetime.time objects (produced by the `.dt.time`
# conversion earlier), so the hour is read straight from each value. This
# avoids re-parsing the times through pd.to_datetime and then overwriting a
# datetime column with integers via `.loc`, which newer pandas versions
# flag as an incompatible-dtype assignment.
temp_train_df = temp_train_df.assign(
    hour=temp_train_df['Time(ET)'].map(lambda t: t.hour)
)

In [None]:
# Example frame with hours 1..24 binned into six 4-hour sessions via pd.cut.
# NOTE(review): `prods`, `b` and `l` are not referenced by any later code visible in this
# part of the notebook; the labels actually applied to the data come from f() below.
prods = pd.DataFrame({'hour':range(1, 25)})
b = [0,4,8,12,16,20,24]
l = ['Late Night', 'Early Morning','Morning','Noon','Eve','Night']
prods['session'] = pd.cut(prods['hour'], bins=b, labels=l, include_lowest=True)
def f(x):
    """Map an hour of the day to a part-of-day label.

    Ranges (upper bound inclusive):
        0..6   -> 'Night'
        7..12  -> 'Morning'
        13..18 -> 'Noon'
        19..24 -> 'Evening'

    Hour 0 (midnight) is included in 'Night'. Previously it matched no
    branch and returned None, so those rows were missing from the
    part-of-day counts.

    Args:
        x: hour as a number.

    Returns:
        The label as a str, or None when `x` is outside 0..24.
    """
    if 6 < x <= 12:
        return 'Morning'
    if 12 < x <= 18:
        return 'Noon'
    if 18 < x <= 24:
        return 'Evening'
    if 0 <= x <= 6:
        return 'Night'
    return None
# Replace the numeric hour with its part-of-day label.
temp_train_df['hour'] = temp_train_df['hour'].apply(f)
temp_train_df

In [None]:
fig, ax = plt.subplots(1,1, figsize=(7, 6))
# Rows per part-of-day label.
hourly_data = temp_train_df['hour'].value_counts()
ax.bar(hourly_data.index, hourly_data, width = 0.35, edgecolor = 'white',linewidth=0.3,color = '#C1B539')
# Write each count slightly above its bar.
for i in hourly_data.index:
    ax.annotate(f"{hourly_data[i]}",
                xy=(i, hourly_data[i] + 10),
                va='center', ha='center', fontweight='heavy',
                # 'roman' is not a font family matplotlib knows (it only
                # triggers a findfont fallback warning); use 'serif', the
                # family the host bar charts in this notebook already use.
                fontfamily='serif',
                color='#F58624')

# Hide the top and right borders of the plot.
for s in ['top','right']:
    ax.spines[s].set_visible(False)

# Model Generation

First we will build a model with the text data only; afterwards we can add more features to it