## Project presentation link: https://drive.google.com/file/d/1cV-k6T9cDIx_2E6UXwaFszlrs1RM7IUC/view?usp=sharing

In [None]:
import html
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [None]:
# Absolute path of the training CSV; edit here when the data lives elsewhere.
TRAIN_CSV_PATH = '/content/train.csv'

# Working DataFrame that every later cell reads and rewrites.
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,name_of_drug,review_by_patient,number_of_times_prescribed,base_score
0,Valsartan,"""It has no side effect, I take it in combinati...",27,8.022969
1,Guanfacine,"""My son is halfway through his fourth week of ...",192,7.858458
2,Lybrel,"""I used to take another oral contraceptive, wh...",17,6.341969
3,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",37,6.590176
4,Cialis,"""2nd day on 5mg started to work with rock hard...",43,6.144782


In [None]:
# Print column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32165 entries, 0 to 32164
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name_of_drug                32165 non-null  object 
 1   review_by_patient           32165 non-null  object 
 2   number_of_times_prescribed  32165 non-null  int64  
 3   base_score                  32165 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1005.3+ KB


In [None]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,number_of_times_prescribed,base_score
count,32165.0,32165.0
mean,27.893207,6.664261
std,35.819035,1.588699
min,0.0,0.899076
25%,6.0,6.109307
50%,16.0,6.788245
75%,36.0,7.90079
max,771.0,8.992453


In [None]:
# Discard exact duplicate rows. Rebinding `df` (rather than mutating in place)
# gives the same frame; surviving rows keep their original index labels.
df = df.drop_duplicates()

In [None]:
# Remove punctuation and special characters.
#
# The top TF-IDF features recorded later in this notebook include '039',
# 'quot' and 'amp', i.e. the remains of HTML entities (&#039; &quot; &amp;)
# after their punctuation was stripped. Decode entities first so they turn
# into the real characters, then replace every non-alphanumeric character
# with a space.
df['review_by_patient'] = df['review_by_patient'].apply(
    lambda x: re.sub('[^a-zA-Z0-9]', ' ', html.unescape(x))
)


In [None]:
# Display the frame after the punctuation-stripping cell.
# NOTE(review): the saved output under this cell still shows quotes and commas
# in the reviews, so it appears to predate that cell - re-run to refresh.
df

Unnamed: 0,name_of_drug,review_by_patient,number_of_times_prescribed,base_score
0,Valsartan,"""It has no side effect, I take it in combinati...",27,8.022969
1,Guanfacine,"""My son is halfway through his fourth week of ...",192,7.858458
2,Lybrel,"""I used to take another oral contraceptive, wh...",17,6.341969
3,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",37,6.590176
4,Cialis,"""2nd day on 5mg started to work with rock hard...",43,6.144782
...,...,...,...,...
32160,Cymbalta,"""I have been taking Cymbalta for 15 months now...",89,6.963020
32161,Nexplanon,"""I have had the Nexplanon since Dec. 27, 2016 ...",0,0.899076
32162,Venlafaxine,"""Had panic attacks and social anxiety starting...",25,6.241812
32163,Fluoxetine,"""I have been off Prozac for about 4 weeks now....",22,7.940428


In [None]:
# Print the distinct values of each column, in column order.
for _, column in df.items():
  print(column.unique())

['Valsartan' 'Guanfacine' 'Lybrel' ... 'Benzamycin' 'Midamor' 'Didrex']
[' It has no side effect  I take it in combination of Bystolic 5 Mg and Fish Oil '
 ' My son is halfway through his fourth week of Intuniv  We became concerned when he began this last week  when he started taking the highest dose he will be on  For two days  he could hardly get out of bed  was very cranky  and slept for nearly 8 hours on a drive home from school vacation  very unusual for him   I called his doctor on Monday morning and she said to stick it out a few days  See how he did at school  and with getting up in the morning  The last two days have been problem free  He is MUCH more agreeable than ever  He is less emotional  a good thing   less cranky  He is remembering all the things he should  Overall his behavior is better    We have tried many different medications and so far this is the most effective  '
 ' I used to take another oral contraceptive  which had 21 pill cycle  and was very happy  very ligh

In [None]:
# Print the frequency table of each column, in column order.
for _, column in df.items():
  print(column.value_counts())

Levonorgestrel                       724
Etonogestrel                         628
Ethinyl estradiol / norethindrone    547
Ethinyl estradiol / norgestimate     425
Nexplanon                            397
                                    ... 
Ephedrine / guaifenesin                1
Xalkori                                1
Methscopolamine                        1
Brimonidine / timolol                  1
Didrex                                 1
Name: name_of_drug, Length: 2220, dtype: int64
 Good                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [None]:
# Frequency of each distinct target value.
df.base_score.value_counts()

7.170997    292
8.745991    281
6.493653    281
6.173134    260
8.074493    258
           ... 
8.530035      1
6.505984      1
7.770829      1
8.592949      1
5.874253      1
Name: base_score, Length: 1383, dtype: int64

# Pre-processing

In [None]:
# Lowercase every review using the vectorised string accessor
# (all values are plain strings after the punctuation-stripping cell).
df['review_by_patient'] = df['review_by_patient'].str.lower()

In [None]:
# Tokenization: split each review on runs of whitespace into a list of words.
df['review_by_patient'] = df['review_by_patient'].map(str.split)

In [None]:
# Download the NLTK stop-word corpus read by stopwords.words('english') below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Remove stop words
stop_words = stopwords.words('english')
# `stop_words` is a list, so `word not in stop_words` rescans it for every
# token of every review. A set gives the same answers with constant-time
# lookups; the list itself stays bound for any later use.
stop_word_set = set(stop_words)
# NOTE(review): this also removes negations - in the saved outputs row 0 goes
# from "it has no side effect ..." to "side effect ..." - which may hurt a
# score model; consider keeping words such as "no"/"not".
df['review_by_patient'] = df['review_by_patient'].apply(
    lambda tokens: [word for word in tokens if word not in stop_word_set]
)

In [None]:
# Download the WordNet corpus that WordNetLemmatizer is used with below.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Download the 'omw-1.4' NLTK package ahead of the lemmatization cell.
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Lemmatization
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list holding the lemma of each token, in order."""
    return [lemmatizer.lemmatize(token) for token in tokens]


df['review_by_patient'] = df['review_by_patient'].apply(_lemmatize_tokens)


In [None]:
# Collapse each token list back into a single space-separated string.
df['review_by_patient'] = df['review_by_patient'].map(' '.join)
# Default-configured vectorizer; not referenced again in the visible cells.
tfidf = TfidfVectorizer()

In [None]:
# Display the frame after lowercasing, stop-word removal, lemmatization and re-joining.
df

Unnamed: 0,name_of_drug,review_by_patient,number_of_times_prescribed,base_score
0,Valsartan,side effect take combination bystolic 5 mg fis...,27,8.022969
1,Guanfacine,son halfway fourth week intuniv became concern...,192,7.858458
2,Lybrel,used take another oral contraceptive 21 pill c...,17,6.341969
3,Buprenorphine / naloxone,suboxone completely turned life around feel he...,37,6.590176
4,Cialis,2nd day 5mg started work rock hard erection ho...,43,6.144782
...,...,...,...,...
32160,Cymbalta,taking cymbalta 15 month first 30mg six month ...,89,6.963020
32161,Nexplanon,nexplanon since dec 27 2016 got first period e...,0,0.899076
32162,Venlafaxine,panic attack social anxiety starting adolescen...,25,6.241812
32163,Fluoxetine,prozac 4 week starting 2 year ago terrible dep...,22,7.940428


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Load the spaCy model "en_core_web_sm"; the visible cells only read its
# stop-word set (nlp.Defaults.stop_words).
nlp = spacy.load("en_core_web_sm")



In [None]:
# With TFIDFVectorizer: keep the 100 highest-frequency terms as features.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split further down, so IDF weights see the test reviews - confirm this is
# acceptable or fit on the training rows only.
print("\n\nWith TFIDFVectorizer")
vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(df['review_by_patient'])
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
dense_features = X.toarray()
print(dense_features)
print("\n")



With TFIDFVectorizer
['039' '10' 'acne' 'ago' 'almost' 'also' 'amp' 'anxiety' 'away' 'back'
 'bad' 'better' 'birth' 'control' 'could' 'cramp' 'day' 'depression'
 'doctor' 'dose' 'drug' 'effect' 'even' 'ever' 'every' 'experience' 'far'
 'feel' 'feeling' 'felt' 'first' 'get' 'getting' 'go' 'going' 'good' 'got'
 'great' 'headache' 'help' 'hour' 'know' 'last' 'life' 'like' 'little'
 'lot' 'made' 'make' 'medication' 'medicine' 'mg' 'month' 'mood' 'much'
 'nausea' 'never' 'night' 'normal' 'nothing' 'one' 'pain' 'period' 'pill'
 'prescribed' 'problem' 'put' 'quot' 'really' 'say' 'severe' 'sex' 'side'
 'since' 'skin' 'sleep' 'started' 'still' 'symptom' 'take' 'taking'
 'thing' 'think' 'thought' 'time' 'took' 'tried' 'try' 'two' 'use' 'used'
 'using' 'week' 'weight' 'well' 'went' 'work' 'worked' 'would' 'year']
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.12149158 0.         0.         ...

In [None]:
#With TFIDFVectorizer and removing stop words
print("\n\nWith TFIDFVectorizer and removing stop words")
# scikit-learn accepts stop_words as 'english', a list, or None. spaCy exposes
# its stop words as a set, which is what raised the InvalidParameterError
# recorded under this cell. sorted() converts it to a list with a stable order.
spacy_stop_words = sorted(nlp.Defaults.stop_words)
vectorizer = TfidfVectorizer(stop_words=spacy_stop_words, max_features=100)
X1 = vectorizer.fit_transform(df.review_by_patient)
print(vectorizer.get_feature_names_out())
# Print the matrix built in this cell; the original printed X from the
# previous cell by mistake.
print(X1.toarray())
print("\n")



With TFIDFVectorizer and removing stop words


InvalidParameterError: ignored

In [None]:
# Regression target: the per-review base score.
y = df.loc[:, 'base_score']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit ordinary least squares on the TF-IDF features and report the hold-out
# mean squared error.
# Removed from the original cell: a LinearRegression() instance that was
# created and immediately discarded, a bare `y_pred` in the middle of the cell
# (only a cell's last expression is displayed), and a re-import of
# mean_squared_error, which the top import cell already provides.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
# NOTE(review): the saved output (~40.9) is far above the variance of
# base_score (std ~1.59 in df.describe()), which a constant mean prediction
# would roughly match. The number may come from stale kernel state - restart,
# run all, and confirm before reporting it.
mean_squared_error(y_test, y_pred)

40.94739445591947