In [1]:
import re
import math
import string
import pandas as pd
import numpy as np
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [27]:
# Load the review table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="extracted_data.csv")

In [31]:
# Count the missing values in each column.
df.isna().sum()

Drug_Name               0
Patient_Review          0
Ratings                 0
Condition               3
Dosage                  0
Other_conditions     1848
Other_drugs_taken    1851
Benefits               50
Side_effects          212
Comments               32
dtype: int64

In [32]:
# Normalise every column label to lower case, then preview the frame.
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,drug_name,patient_review,ratings,condition,dosage,other_conditions,other_drugs_taken,benefits,side_effects,comments
0,abilify,abilify review by 26 year old female patient,3,bipolar disorder,15mg taken daily for the period of 12 weeks,,"cymbalta, 90mg/daily",i didn't notice any benefit at all. supposedly...,a very uncomfortable inner restlessness was th...,i was prescribed abilify (15mg/daily) to assis...
1,abilify,abilify review by 29 year old female patient,8,bipolar,10mg taken 1/day for the period of 8 mos,"bipolar mood disorder, acne, asthma","zoloft, wellbutrin, clozepam, tretinoin, clind...",i had severe depression with agitation and mix...,"i became drowsy, however, with adequate sleep ...",the abilify decreased the need for daily klono...
2,abilify,abilify review by 43 year old female patient,10,depressionxiety,2 mg taken daily for the period of 3 months,anxiety,lexapro,within 1 week of taking the cocktail of abilif...,no side effects have been noticed,i take one pill of each 1st thing in the am......
3,abilify,abilify review by 50 year old female patient,2,depression not resolved with antidepressant drugs,started out at 5mg the 10 & last 15mg taken d...,"add, depression, poss. bipolar type 2, ptsd, a...","vyvance, ativan lisinopril, zantac, viville do...",while on abilify i can honestly say the depres...,but it caused memory loss and again an inciden...,i am only taking ativan & getting psychologica...
4,abilify,abilify review by 50 year old male patient,2,bipolar,2mg to start taken once daily for the period ...,anxiety,klonopin,due to the short time taking drug.,headache first morning at 4am that was relieve...,not much to tell. i was just starting treatme...


# Deriving new columns

In [33]:
# derive the age and gender of the patient reviewing the drug
def age(x):
    """Extract the patient's age from a review title.

    Titles look like "abilify review by 26 year old female patient".  The
    number that directly precedes "year" is preferred so that digits appearing
    earlier in the title are not mistaken for the age; if no such number
    exists, the first run of digits is used.

    Parameters
    ----------
    x : the review title (converted with ``str`` before matching).

    Returns
    -------
    str or None
        The age as a string of digits, or ``None`` when the title contains
        no digits at all.
    """
    text = str(x)
    # Raw strings avoid the invalid "\d" escape-sequence warning.
    anchored = re.search(r"(\d+)\s*year", text)
    if anchored:
        return anchored.group(1)
    digit_runs = re.findall(r"\d+", text)
    return digit_runs[0] if digit_runs else None

def gender(x):
    """Extract the patient's gender from a review title.

    Parameters
    ----------
    x : the review title (converted with ``str`` and lower-cased before
        matching).

    Returns
    -------
    str or None
        ``'female'`` or ``'male'``; ``None`` when the title mentions neither,
        so an unknown gender is no longer silently reported as ``'male'``.
    """
    text = str(x).lower()
    # 'female' must be tested first because it contains the substring 'male'.
    if 'female' in text:
        return 'female'
    if 'male' in text:
        return 'male'
    return None

# Derive both demographic columns from the review title.
df['age'] = df['patient_review'].map(age)
df['gender'] = df['patient_review'].map(gender)

# get ratings sentiment
def rating_sentiment(x):
    """Bucket a numeric rating into a sentiment label.

    Ratings below 4 are "negative", ratings from 4 through 6 are "neutral",
    and everything else is "positive".
    """
    if x < 4:
        return "negative"
    if x <= 6:
        return "neutral"
    return "positive"
    
# Label every review with the sentiment bucket of its rating.
df['rating_sentiment'] = df['ratings'].map(rating_sentiment)

In [37]:
# Spot-check the rating -> sentiment mapping on the first ten rows.
df.loc[:, ['ratings', 'rating_sentiment']].head(10)

Unnamed: 0,ratings,rating_sentiment
0,3,negative
1,8,positive
2,10,positive
3,2,negative
4,2,negative
5,6,neutral
6,8,positive
7,2,negative
8,8,positive
9,7,positive


# Preprocessing Techniques

In [6]:
# Fetch the NLTK data packages this notebook relies on: the stop-word list
# (read through stopwords.words) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def contraction(x):
    """Expand English contractions in one review text (e.g. "didn't" -> "did not").

    Parameters
    ----------
    x : a single cell value; non-missing values are converted with ``str``
        before being passed to ``contractions.fix``.

    Returns
    -------
    The expanded text.  Missing values (``None`` / ``NaN``) are returned
    unchanged instead of being converted to the literal string "nan".
    """
    # str(NaN) would yield the text "nan", so let missing cells pass through.
    if pd.isna(x):
        return x
    return contractions.fix(str(x))

# Free-text columns that receive a cleaned "<name>_new" counterpart.
columns = ['benefits', 'side_effects', 'comments']

# Start each "_new" column from the contraction-expanded original text.
for text_col in columns:
    df[f'{text_col}_new'] = df[text_col].map(contraction)

In [8]:
# Compare the first review before and after contraction expansion.
raw_sample = df.loc[0, 'benefits']
expanded_sample = df.loc[0, 'benefits_new']
print("Before: \n", raw_sample)
print("\n\nAfter: \n", expanded_sample)

Before: 
 i didn't notice any benefit at all. supposedly, it was to help keep my mood balanced.  more than anything, i was focused on the very apparent and prevalent side effects.


After: 
 i did not notice any benefit at all. supposedly, it was to help keep my mood balanced.  more than anything, i was focused on the very apparent and prevalent side effects.


In [9]:
# English stop words held in a set for fast membership tests.
stop_words = {*stopwords.words('english')}
def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_words` removed."""
    kept_tokens = (token for token in str(text).split() if token not in stop_words)
    return " ".join(kept_tokens)

In [10]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize every whitespace-separated token of `text` and re-join with single spaces.

    Tokens are lemmatized without a part-of-speech hint; in this notebook's
    sample output that turns "was" into "wa".
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

In [None]:
# NOTE: this cell used to repeat the normalisation loop of the next cell
# (lower-casing, punctuation removal, whitespace collapsing, lemmatisation)
# and was never executed in the saved session (it has no execution count).
# Running both cells on a fresh kernel would process every "*_new" column
# twice, so the duplicate loop is dropped here and the following cell is the
# only place where the text is normalised.

In [11]:
# Matches any single ASCII punctuation character; compiled once instead of
# being rebuilt for every row.
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))


def _normalize_text(text):
    """Lower-case, strip punctuation, collapse spaces and lemmatize one review text."""
    text = str(text).lower()            # 1. lower-casing
    text = _PUNCT_RE.sub(' ', text)     # 2. punctuation -> space
    text = re.sub(' +', ' ', text)      # 3. collapse runs of spaces
    # Stop-word removal is deliberately skipped: it was changing the
    # sentiment of the sentence.
    return lemmatize_words(text)        # 4. lemmatization


for col in columns:
    if df[col].dtype == 'object':
        new_col = col + '_new'
        cleaned = df[new_col].apply(_normalize_text)
        # str() turns a missing review into the literal text "nan"; blank
        # those rows out again so that missing stays missing.
        df[new_col] = cleaned.where(df[col].notna())

In [12]:
# Compare the first review before and after the full normalisation.
before_text, after_text = df.loc[0, 'benefits'], df.loc[0, 'benefits_new']
print("Before: \n", before_text)
print("\n\nAfter: \n", after_text)

Before: 
 i didn't notice any benefit at all. supposedly, it was to help keep my mood balanced.  more than anything, i was focused on the very apparent and prevalent side effects.


After: 
 i did not notice any benefit at all supposedly it wa to help keep my mood balanced more than anything i wa focused on the very apparent and prevalent side effect


In [19]:
# Columns kept for export: metadata, ratings and the cleaned text columns.
export_columns = [
    'drug_name', 'age', 'gender', 'condition', 'dosage',
    'other_conditions', 'other_drugs_taken',
    'ratings', 'rating_sentiment',
    'benefits_new', 'side_effects_new', 'comments_new',
]
new_df = df.loc[:, export_columns]

In [21]:
# Drop the "_new" suffix so the cleaned text columns take the original names.
# A rename mapping is used instead of assigning a nested list to `.columns`:
# a list wrapped in another list turns the header into a one-level MultiIndex
# rather than a flat index of names.
new_df = new_df.rename(columns={
    'benefits_new': 'benefits',
    'side_effects_new': 'side_effects',
    'comments_new': 'comments',
})

In [23]:
# Write the cleaned table to disk without the row index.
new_df.to_csv(path_or_buf="extracted_new.csv", index=False)