# EDA and Sentiment Analysis

In [None]:
# Importing the Required Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob
import nltk.corpus as stopwords 
from collections import Counter
import warnings; warnings.simplefilter('ignore')
import nltk
import re
import string
from string import punctuation 
from nltk.stem import SnowballStemmer 
from nltk.corpus import stopwords 

from sklearn.preprocessing import LabelEncoder   
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# English stop-word list from the NLTK stopwords corpus, stored as a set.
stop_words = set(stopwords.words('english'))
# Extended punctuation: ASCII punctuation plus typographic quotes and assorted symbols.
# It is built from `string.punctuation` (not from the `punctuation` name itself) so that
# re-running this cell does not keep appending the extras to an already-extended string.
# The backslash is written as `\\` because `\×` is not a valid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on recent Python versions; the resulting
# characters are unchanged.
# NOTE(review): the trailing 'o' is a letter, not a symbol - confirm it is intended.
punctuation = string.punctuation + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&o'

In [None]:
# UCI ML drug dataset: read the raw train and test splits from the working directory
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("drugsComTrain_raw.csv", "drugsComTest_raw.csv")
)

# Report the dimensions (rows, columns) of each split
print("The shape of the train set given is : ", df_train.shape)
print("The shape of the test set given is : ", df_test.shape)

# Preview the first 5 rows of the train set
df_train.head()

In [None]:
# Data types
# Prints the dtype pandas inferred for every column of the train split.
print(df_train.dtypes)  # data types of each column in the train set

In [None]:
# Merge the train and test splits into one frame for the exploratory analysis.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each
# split keeps its own row labels from read_csv, so the same label appears twice and
# label-based lookups (.loc) on the merged frame become ambiguous.
merge = [df_train, df_test]
df_data = pd.concat(merge, ignore_index=True)

print(df_data.shape)

# Preview the first 10 rows of the merged data
df_data.head(10)

In [None]:
# Summary statistics of the merged data (describe() covers the numeric columns by default)
df_data.describe()

In [None]:
# Number of missing values in each column
null_counts = df_data.isnull().sum(axis=0)
print("Null values in the dataset : ", null_counts)

In [None]:
# Summary statistics again; the only cell between this and the earlier describe() prints null counts.
df_data.describe()

In [None]:
# Share of rows whose 'condition' value is missing

size = len(df_data)  # total number of rows

print("Total Size of the dataset : ", size)

total_na = df_data['condition'].isnull().sum()  # rows without a recorded condition
print("Null values", total_na)

print("Percentage of null values : ", total_na / size * 100)

In [None]:
# Remove every row that contains at least one missing value.
# Rationale given by the analysis: the rows with nulls are far fewer than 5% of the data
# points and the data is not a time series, so they can be dropped safely.
df_data = df_data.dropna(axis=0, how='any')
# Report the shape that remains after the drop
print("The shape of the data after dropping the null values is : ", df_data.shape)

In [None]:
# Normalise the column names to lowercase so they are easier to reference
df_data.columns = [col_name.lower() for col_name in df_data.columns]

In [None]:
# Display the column names of the merged data
df_data.columns

In [None]:
# Order the rows by their unique ID and renumber the index from 0 (old index discarded)
df_data = df_data.sort_values(by='uniqueid').reset_index(drop=True)
df_data.head(10)

In [None]:
# Number of distinct conditions in the dataset, followed by the first 10 of them
conditions = df_data['condition']
print(conditions.nunique(), "\n")
print("some of the conditions are: \n", conditions.unique()[:10], "\n")

In [None]:
# The 10 drugs that received a rating of 1 most often
is_rated_one = df_data['rating'] == 1
df_data.loc[is_rated_one, 'drugname'].value_counts().head(10)

In [None]:
# Horizontal bar chart of the 10 drugs with the most reviews whose useful count is 0
zero_useful = df_data.loc[df_data['usefulcount'] == 0, 'drugname']
zero_useful.value_counts().head(10).plot(kind='barh', figsize=(10, 5), color='green')
plt.title('Top 10 drugs with no useful count')
plt.xlabel('Count')
plt.ylabel('Drug Name')
plt.show()

In [None]:
# Minimum rating in the dataset.
# Series.min() is vectorised and skips missing values, whereas the builtin min()
# walks the column element by element in Python.
min_rating = df_data['rating'].min()
print(min_rating)

In [None]:
# Parse the 'date' column into pandas datetime values
df_data['date'] = df_data['date'].pipe(pd.to_datetime)

# Exploratory Data Analysis 

In [None]:
# Bar chart of the 20 drugs that collected the most 10/10 ratings

# Plot appearance
sns.set(font_scale=1.2, style='darkgrid')
plt.rcParams['figure.figsize'] = [15, 8]

# drug name -> number of 10/10 ratings, most frequent first
is_top_rated = df_data['rating'] == 10
rating = dict(df_data.loc[is_top_rated, "drugname"].value_counts())
drugname = [*rating.keys()]
drug_rating = [*rating.values()]

sns_rating = sns.barplot(x=drugname[:20], y=drug_rating[:20])

sns_rating.set(title='Top 20 drugs with 10/10 rating', ylabel='Number of Ratings', xlabel="Drug Names")
# Turn the x-axis tick labels vertical so the drug names stay readable
plt.setp(sns_rating.get_xticklabels(), rotation=90);

In [None]:
# Bar chart of the 20 drugs that collected the most 1/10 ratings

# Plot appearance
sns.set(font_scale=1.5, style="whitegrid")
plt.rcParams['figure.figsize'] = (20, 10)

# drug name -> number of 1/10 ratings, most frequent first
lowest_rated = df_data.loc[df_data['rating'] == 1, 'drugname']
rating = dict(lowest_rated.value_counts())

drugname = list(rating)
rating_counts = list(rating.values())

sns_rating = sns.barplot(x=drugname[:20], y=rating_counts[:20], palette="Reds_d")
sns_rating.set_xticklabels(sns_rating.get_xticklabels(), rotation=90)
sns_rating.set_title("Top 20 drugs with the 1/10 rating")
sns_rating.set_ylabel("Number of ratings")
sns_rating.set_xlabel("Drug name")

In [None]:
# Distribution of the ratings: a countplot (left panel) and a histogram (right panel)
plt.rcParams['figure.figsize'] = (15, 9)  # figure size
sns.set(style='darkgrid', font_scale=1.3)  # plot style and font size
fig, ax = plt.subplots(1, 2)  # one row, two panels

# Left panel: number of reviews per rating, ordered from 10 down to 1.
# The ratings are passed through the `x` keyword: recent seaborn releases accept only
# `data` positionally, so a positional Series is no longer read as the x variable.
sns_1 = sns.countplot(x=df_data['rating'], palette='magma', order=list(range(10, 0, -1)), ax=ax[0])
sns_1.set_title('Countplot of the ratings', fontsize=20)
sns_1.set_xlabel('Ratings', fontsize=16)

# Right panel: histogram of the ratings, one unit-wide bin centred on each integer 1..10.
# It is drawn with matplotlib directly on ax[1]; the figure-level sns.displot cannot
# draw into an existing axes, which left this panel empty before.
ax[1].hist(df_data['rating'], bins=np.arange(0.5, 11.0, 1.0))
ax[1].set_title('Distribution of the ratings', fontsize=20)
ax[1].set_xlabel('Ratings', fontsize=16);


In [None]:
# Word cloud built from every review that was rated 10

is_rated_ten = df_data['rating'] == 10
df_rate_ten = df_data.loc[is_rated_ten, 'review']  # text of the reviews rated 10
k = ' '.join(df_rate_ten)  # all of those reviews joined into one string
wordcloud = WordCloud(
    width=1000,
    height=500,
    background_color='white',
).generate(k)
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation="bilinear")  # render the cloud as an image
plt.axis("off");  # hide the axes

In [None]:
# Word cloud built from every review that was rated 1
is_rated_one = df_data['rating'] == 1
df_rate_one = df_data.loc[is_rated_one, 'review']  # text of the reviews rated 1
k1 = ' '.join(df_rate_one)  # all of those reviews joined into one string
wordcloud = WordCloud(width=800, height=800).generate(k1)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation="bilinear")  # render the cloud as an image
plt.axis("off");  # hide the axes

In [None]:
# Bar chart of the mean rating per year.
# The reviews are grouped by the year of their date and the ratings of each group are averaged.

mean_rating = dict(df_data.groupby(df_data['date'].dt.year)['rating'].mean())  # year -> mean rating
plt.rcParams['figure.figsize'] = (15, 5)
sns.set(font_scale=1.2, style='darkgrid')
# The returned Axes gets its own name. Assigning it to `sns` would rebind the seaborn
# module alias to an Axes object and break every later `sns.` call in the notebook.
sns_mean = sns.barplot(x=list(mean_rating.keys()), y=list(mean_rating.values()), color='orange')
plt.xlabel('Year', fontsize=15)
plt.ylabel('Mean Rating', fontsize=15)
plt.title('Mean Rating of the Drugs per Year', fontsize=18)

In [None]:
# Bar chart of the 10 conditions that appear most often in the reviews

# value_counts() returns the counts in descending order, so its first 10 entries
# are the 10 most frequent conditions
top_conditions = df_data['condition'].value_counts().head(10)
cond = dict(top_conditions)  # condition -> number of reviews
plt.figure(figsize=(10, 5))
plt.bar(cond.keys(), cond.values(), color='green')
plt.xticks(rotation=90)
plt.xlabel('Conditions')
plt.ylabel('Count')
plt.title('Top 10 conditions of the people are suffering from')
plt.show()


In [None]:
# The 10 most frequently reviewed drugs for the top condition, Birth Control
import seaborn as sns  # guarantees that `sns` is bound to the seaborn module in this cell

is_birth_control = df_data['condition'] == 'Birth Control'
df = df_data.loc[is_birth_control, 'drugname'].value_counts().head(10)  # drug name -> review count
sns.set(font_scale=1.5, style='whitegrid')
sns_ = sns.barplot(x=df.index, y=df.values, palette='summer')
sns_.set_xlabel('Drug Names')
sns_.set_title("Top 10 Drugs used for Birth Control")
# Turn the x-axis tick labels vertical so the drug names stay readable
plt.setp(sns_.get_xticklabels(), rotation=90);

In [None]:
# %pip install --upgrade seaborn  # to upgrade seaborn to the latest version
# %pip install seaborn==0.11.0    # to install a specific version of seaborn

In [None]:
# # Distribution of the useful count 
# import seaborn as sns 
# sns.set(style="whitegrid", font_scale=1.5) 
# plt.rcParams['figure.figsize'] = [20, 10]
# sns.distplot(df_data['usefulcount'].dropna())
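# Library error when running this cell (possibly because sns.distplot is deprecated in newer seaborn releases - TODO confirm)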

In [None]:
# Display the column names of the merged data
df_data.columns

In [None]:
# Bar chart of how many reviews were written in each year
df_bar = df_data['date'].dt.year.value_counts().sort_index()  # year -> review count, in year order
sns_ = sns.barplot(x=df_bar.index, y=df_bar.values, color='mediumaquamarine')
sns_.set_title('Number of reviews per year', fontsize=15)
sns_.set_xlabel('Year', fontsize=15);

In [None]:
# Correlation heatmap of the numeric columns
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (12, 6)  # figure size

# Keep only the float64/int64 columns and compute their pairwise correlations
numeric_data = df_data.select_dtypes(include=['float64', 'int64'])
corr = numeric_data.corr()

# Diverging colour map centred on 0, with the colour scale fixed to [-1, 1]
heat_palette = sns.diverging_palette(20, 220, n=200)
sns_heat = sns.heatmap(corr, annot=True, vmin=-1, vmax=1, center=0, cmap=heat_palette, square=True);
plt.setp(sns_heat.get_xticklabels(), rotation=45);  # tilt the x-axis labels by 45 degrees

# Unigrams and Bigrams

In [None]:
# # Top 20 unigrams according to the rating
# from nltk import ngrams
# from nltk.tokenize import word_tokenize    
# from collections import Counter 
# import seaborn as sns
# import matplotlib.pyplot as plt
# import pandas as pd
# import numpy as np  

# df_ = df_data[['rating', 'review']] 
# df_['review']= df_data['review'].str.replace("&#039;", "") 
# df_['review']= df_['review'].str.replace(r'[^\w\d\s]',' ') 

# df_review_5 = " ".join(df_.loc[df_.rating<=5, 'review']) 
# df_review_10 = " ".join(df_.loc[df_.rating>5, 'review']) 

# token_review_5 = word_tokenize(df_review_5) 
# token_review_10 = word_tokenize(df_review_10) 

# unigram_5 = ngrams(token_review_5, 1) 
# unigram_10 = ngrams(token_review_10, 1)

# frequency_5 = Counter(unigram_5) 
# frequency_10 = Counter(unigram_10) 

# df_5 = pd.DataFrame(frequency_5.most_common(20)) 
# df_10 = pd.DataFrame(frequency_10.most_common(20)) 

# # Barplot for top 20 unigrams for rating  <=5 

# plt.rcParams['figure.figsize'] = (15, 5) 
# fig, ax = plt.subplots(1, 2) 
# sns.set(font_scale = 1.5, style = 'whitegrid') 

# sns_5=sns.barplot(x=df_5[1], y=df_5[0], color='lightblue', ax=ax[0])
# sns_10=sns.barplot(x=df_10[1], y=df_10[0], color='lightblue', ax=ax[1])  

# # Setting axes labels 
# sns_5.set_title('Top 20 unigrams for rating <=5')  
# sns_10.set_title('Top 20 unigrams for rating >5') 
# sns_5.set_ylabel('Unigrams');


In [None]:
# # Top 20 bigrams according to the rating 
# from wordcloud import WordCloud 
# from nltk import ngrams 
# from collections import Counter 
# import matplotlib.pyplot as plt 
# import seaborn as sns
# import pandas as pd
# from textblob import TextBlob 
# from nltk.corpus import stopwords  
# from nltk.tokenize import word_tokenize 
# import nltk 
# import re 
# import string 
# import warnings 
# warnings.filterwarnings("ignore") 
# from nltk.stem import WordNetLemmatizer 
# from nltk.stem import PorterStemmer 
# from nltk.stem import SnowballStemmer 


# # Top 20 bigrams according to the rating
# df_ = df_data[['rating', 'review']]
# df_['review'] = df_data['review'].str.replace("&#039;", "")
# df_['review'] = df_['review'].str.replace(r'[^\w\d\s]',' ')

# df_review_5 = " ".join(df_.loc[df_.rating <= 5, 'review'])
# df_review_10 = " ".join(df_.loc[df_.rating > 5, 'review'])

# token_review_5 = word_tokenize(df_review_5)
# token_review_10 = word_tokenize(df_review_10) 

# token_review_10 = word_tokenize(df_review_10) 
# token_review_5 = word_tokenize(df_review_5) 


# bigrams_5 = ngrams(token_review_5, 2)
# bigrams_10 = ngrams(token_review_10, 2)

# frequency_5 = Counter(bigrams_5)
# frequency_10 = Counter(bigrams_10)

# df_5 = pd.DataFrame(frequency_5.most_common(20))
# df_10 = pd.DataFrame(frequency_10.most_common(20))

# # Barplot that shows the top 20 bigrams
# plt.rcParams['figure.figsize'] = [22,11]
# fig, ax = plt.subplots(1,2)
# sns.set(font_scale = 1.3, style = 'whitegrid')

# sns_5 = sns.barplot(x = df_5[1], y = df_5[0], color = 'red', ax = ax[0])
# sns_10 = sns.barplot(x = df_10[1], y = df_10[0], color = 'red', ax = ax[1])

# # Setting axes labels
# sns_5.set_title("Top 20 bigrams according for rating <= 5")
# sns_10.set_title("Top 20 bigrams according for rating > 5")
# sns_5.set_ylabel("bigrams");


# Preprocessing 

In [None]:
# We are going to use the threshold rating of 5 for giving the sentiment. 
# The review will have a positive sentiment (1) if rating>5 and negative sentiment otherwise. 
