# EDA Sentiment Analyses 

In [5]:
# Importing the Required Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob
from nltk.corpus import stopwords 
from collections import Counter
import warnings; warnings.simplefilter('ignore')
import nltk
import re
import string
from string import punctuation 
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer 

from sklearn.preprocessing import LabelEncoder 
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# English stop words from NLTK's stopwords corpus, stored in a set.
# NOTE: this needs the NLTK "stopwords" corpus to be available locally.
stop_words = set(stopwords.words('english'))
# Extend string.punctuation with typographic quotes and assorted non-ASCII symbols.
# The backslash is escaped explicitly: a bare "\×" is an invalid escape sequence
# (SyntaxWarning on Python 3.12+). The resulting string is unchanged.
# NOTE(review): the trailing 'o' is a letter, not punctuation. It is kept so the value
# stays the same, but confirm it is intended before stripping these characters from text.
punctuation = punctuation + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&o'


In [6]:
# UCI ML drug review dataset, shipped as separate train and test CSV files.

# Read both splits from the current working directory.
df_train = pd.read_csv("drugsComTrain_raw.csv")
df_test = pd.read_csv("drugsComTest_raw.csv")

# Report the (rows, columns) shape of each split.
print("The shape of the train set given is : ", df_train.shape)
print("The shape of the test set given is : ", df_test.shape)

# Preview the first 5 rows of the train split.
df_train.head()

The shape of the train set given is :  (161297, 7)
The shape of the test set given is :  (53766, 7)


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [8]:
# Data types
# Print the dtype pandas inferred for every column of the train split.
print(df_train.dtypes)  # data types of each column in the dataset  

uniqueID        int64
drugName       object
condition      object
review         object
rating          int64
date           object
usefulCount     int64
dtype: object


In [None]:
# Merge the train and test splits into one frame for the exploratory analysis.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each split keeps
# its own row labels from read_csv, so the test rows would repeat labels already used
# by the train rows and df_data would carry duplicate index values.
merge = [df_train, df_test]
df_data = pd.concat(merge, ignore_index=True)

print(df_data.shape)

df_data.head(10)

In [None]:
# Descriptive statistics for the combined train + test data.
df_data.describe()

In [None]:
# Number of null values in each column (sum() works column-wise by default).
print("Null values in the dataset : ", df_data.isnull().sum())

In [None]:
# NOTE(review): same call as the earlier describe() cell. No visible statement modifies
# df_data in between, so this output is expected to be identical. Consider removing it.
df_data.describe() 

In [None]:
# Share of rows whose `condition` value is null.

# Total number of rows.
size = len(df_data)
print("Total Size of the dataset : ", size)

# Number of rows with a null `condition`.
total_na = df_data['condition'].isnull().sum()
print("Null values", total_na)

# Null rows as a percentage of all rows.
print("Percentage of null values : ", (total_na / size) * 100)

In [None]:
# Remove every row that has a null in any column (dropna's defaults: how='any', axis=0).
# Rationale: the null rows are far below 5% of the data, and the data is not a time
# series, so dropping them does not break any ordering.
df_data = df_data.dropna()

# Shape of the data once the null rows are gone.
print("The shape of the data after dropping the null values is : ", df_data.shape)

In [None]:
# Lowercase every column name so the following cells can use one consistent spelling.
df_data.columns = [column_name.lower() for column_name in df_data.columns]

In [None]:
# Display the column labels of df_data.
df_data.columns

In [None]:
# Order the rows by uniqueid, then renumber the index from 0 (the old index is discarded).
df_data = df_data.sort_values(by='uniqueid').reset_index(drop=True)

# Preview the first 10 rows of the sorted data.
df_data.head(10)

In [None]:
# Number of distinct conditions in the dataset.
print(df_data['condition'].nunique(), "\n")

# The first ten distinct conditions, in order of first appearance.
print("some of the conditions are: \n", df_data['condition'].unique()[:10], "\n")

In [None]:
# The 10 drugs with the most reviews rated 1 (value_counts sorts by count, descending).
df_data.loc[df_data['rating'] == 1, 'drugname'].value_counts().head(10)

In [None]:
# Horizontal bar chart of the 10 drugs with the most reviews whose useful count is 0.
(
    df_data.loc[df_data['usefulcount'] == 0, 'drugname']
    .value_counts()      # reviews per drug, most frequent first
    .iloc[:10]           # keep the ten most frequent
    .plot.barh(figsize=(10, 5), color='green')
)
plt.title('Top 10 drugs with no useful count')
plt.xlabel('Count')
plt.ylabel('Drug Name')
plt.show()

In [None]:
# Minimum rating in the dataset.
# Python's builtin min() is used here; it iterates over the values of the Series.
min_rating = min(df_data['rating']) 
print(min_rating)  

In [None]:
# Convert the date column from strings to datetime64.
# The format is stated explicitly instead of being inferred: the values shown in the
# train-set preview look like "20-May-12" / "3-Nov-15" (day, abbreviated month,
# two-digit year), and inferring that per value is slow and ambiguous between day and year.
# NOTE(review): assumes every row uses this layout. to_datetime raises on a mismatch.
df_data['date'] = pd.to_datetime(df_data['date'], format='%d-%b-%y')

# Exploratory Data Analysis 

In [None]:
# Bar chart of the 20 drugs that received the most 10/10 ratings.

# Plot styling
sns.set(style='darkgrid', font_scale=1.2)
plt.rcParams['figure.figsize'] = (15, 8)

# Map each drug name to its number of 10/10 reviews. value_counts() returns the counts
# in descending order, so the first 20 entries are the top 20.
rating = dict(
    df_data.loc[df_data['rating'] == 10, 'drugname'].value_counts()
)
drugname = list(rating)
drug_rating = list(rating.values())

sns_rating = sns.barplot(x=drugname[:20], y=drug_rating[:20])

sns_rating.set(
    title='Top 20 drugs with 10/10 rating',
    xlabel="Drug Names",
    ylabel='Number of Ratings',
)
# Turn the x tick labels vertical; the trailing semicolon hides the cell's text output.
plt.setp(sns_rating.get_xticklabels(), rotation=90);

In [None]:
# Bar chart of the 20 drugs that received the most 1/10 ratings.

# Plot styling
sns.set(style="whitegrid", font_scale=1.5)
plt.rcParams['figure.figsize'] = (20, 10)

# Drug name -> number of 1/10 reviews, in descending order of count.
rating = dict(
    df_data.loc[df_data['rating'] == 1, 'drugname'].value_counts()
)
drugname = list(rating)

sns_rating = sns.barplot(
    x=drugname[:20],
    y=list(rating.values())[:20],
    palette="Reds_d",
)
# Re-apply the existing tick labels rotated to vertical.
sns_rating.set_xticklabels(sns_rating.get_xticklabels(), rotation=90)
sns_rating.set_title("Top 20 drugs with the 1/10 rating")
sns_rating.set_ylabel("Number of ratings")
sns_rating.set_xlabel("Drug name")

In [None]:
# A countplot of the ratings, to show how the ratings are distributed in the dataset.
plt.rcParams['figure.figsize'] = (15, 9)  # default figure size
sns.set(style='darkgrid', font_scale=1.3)  # plot style and font size
fig, ax = plt.subplots(1, 2)  # one row with two side-by-side axes
# NOTE(review): only ax[0] is drawn on in the lines visible here.

# The ratings are passed as `x=`. In seaborn 0.12+ the only positional parameter of
# countplot is `data`, so a Series passed positionally is not used as the x variable
# and the per-rating bars are lost. Bars are ordered from rating 10 down to 1.
sns_1 = sns.countplot(
    x=df_data['rating'],
    palette='magma',
    order=list(range(10, 0, -1)),
    ax=ax[0],
)
sns_1.set_title('Countplot of the ratings', fontsize=20)
sns_1.set_xlabel('Ratings', fontsize=16)

#sns_2 = sns.displot(df_data['rating']) # creating a distplot for the ratings 
#sns_2.set_title('Distplot of the ratings', fontsize=20) # setting the title of the plot 
#sns_2.set_xlabel('Ratings', fontsize=16) # setting the x label of the plot 
