In [3]:
## Disease Condition prediction based on drug reviews

In this project, we will classify the condition of patients using the reviews of the drugs given by patients using drug reviews dataset available in [UCI ML Repository](https://archive.ics.uci.edu/ml/datasets/Drug+Review+Dataset+%28Drugs.com%29).

## Importing libraries

In [4]:
import pandas as pd # data preprocessing
import itertools # confusion matrix
import string
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
# To show all the rows of pandas dataframe
pd.set_option('display.max_rows', None)

In [5]:
!pip install BeautifulSoup4 



In [6]:
str1="I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations."


lst = [str1]
lst

['I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations.']

In [7]:
import nltk
import sklearn
import bs4
print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The bs4 version is {}.'.format(bs4.__version__))

The nltk version is 3.8.1.
The scikit-learn version is 1.2.2.
The bs4 version is 4.12.2.


In [8]:
df=pd.read_csv('data\drugsComTrain_raw.csv')

In [9]:
df.to_csv('data/drugsComTrain.csv',index=False)

In [10]:
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [11]:
df.condition.value_counts().shape

(884,)

In [12]:
#top_conditions.sum()

In [13]:
null_counts = df.isnull().sum()
null_counts

uniqueID         0
drugName         0
condition      899
review           0
rating           0
date             0
usefulCount      0
dtype: int64

In [14]:
# Assuming you have a DataFrame named df

# Remove rows with null values in the "condition" column
df = df.dropna(subset=['condition'])

# Now, df does not contain rows with null values in the "condition" column


In [15]:
df = df.dropna()

In [16]:
df.condition.value_counts()

condition
Birth Control                                                          28788
Depression                                                              9069
Pain                                                                    6145
Anxiety                                                                 5904
Acne                                                                    5588
Bipolar Disorde                                                         4224
Insomnia                                                                3673
Weight Loss                                                             3609
Obesity                                                                 3568
ADHD                                                                    3383
Diabetes, Type 2                                                        2554
Emergency Contraception                                                 2463
High Blood Pressure                                               

In [17]:
df.condition.value_counts().shape

(884,)

In [18]:
##df_train = df[(df['condition']=='Birth Control') | (df['condition']=='Depression') | (df['condition']=='High Blood Pressure')|(df['condition']=='Diabetes, Type 2')]

In [19]:
all_conditions = df['condition'].unique()
df_train = df[df['condition'].isin(all_conditions)]


In [20]:
df.shape

(160398, 7)

In [21]:
df_train.shape

(160398, 7)

In [22]:
X = df_train.drop(['uniqueID','drugName','rating','date','usefulCount'],axis=1)

In [23]:
#plots

In [24]:
X.head()

Unnamed: 0,condition,review
0,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati..."
1,ADHD,"""My son is halfway through his fourth week of ..."
2,Birth Control,"""I used to take another oral contraceptive, wh..."
3,Birth Control,"""This is my first time using any form of birth..."
4,Opiate Dependence,"""Suboxone has completely turned my life around..."


## data preprocessing

In [25]:
X['review'][2]

'"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ingredients are similar. When my other pills ended, I started Lybrel immediately, on my first day of period, as the instructions said. And the period lasted for two weeks. When taking the second pack- same two weeks. And now, with third pack things got even worse- my third period lasted for two weeks and now it&#039;s the end of the third week- I still have daily brown discharge.\r\nThe positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."'

In [26]:
X['review'][11]

'"I have taken anti-depressants for years, with some improvement but mostly moderate to severe side affects, which makes me go off them.\r\n\r\nI only take Cymbalta now mostly for pain.\r\n\r\nWhen I began Deplin, I noticed a major improvement overnight. More energy, better disposition, and no sinking to the low lows of major depression. I have been taking it for about 3 months now and feel like a normal person for the first time ever. Best thing, no side effects."'

In [27]:
for i, col in enumerate(X.columns):
    X.iloc[:, i] = X.iloc[:, i].str.replace('"', '')

In [28]:
# To set the width of the column to maximum
#pd.set_option('max_colwidth', -1)

In [29]:
X.head()

Unnamed: 0,condition,review
0,Left Ventricular Dysfunction,"It has no side effect, I take it in combinatio..."
1,ADHD,My son is halfway through his fourth week of I...
2,Birth Control,"I used to take another oral contraceptive, whi..."
3,Birth Control,This is my first time using any form of birth ...
4,Opiate Dependence,Suboxone has completely turned my life around....


### What are stopwords ?

Stopwords are the most common words in any natural language. For the purpose of building NLP models, these stopwords might not add much value to the meaning of the document.

The most common words used in a text are **“the”, “is”, “in”, “for”, “where”, “when”, “to”, “at”** etc.

In [30]:
from nltk.corpus import stopwords

stop = stopwords.words('english')


In [31]:
stop

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

## Lemmitization
Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma .

In [32]:
#from IPython.display import Image
#Image(filename='stem.JPG')

In [33]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

porter = PorterStemmer()

lemmatizer = WordNetLemmatizer()

In [34]:
print(porter.stem("sportingly"))
print(porter.stem("very"))
print(porter.stem("troubled"))

sportingli
veri
troubl


In [35]:
from nltk.stem import WordNetLemmatizer
import zipfile

In [36]:
print(lemmatizer.lemmatize("sportingly"))
print(lemmatizer.lemmatize("very"))
print(lemmatizer.lemmatize("troubled"))

sportingly
very
troubled


In [37]:
'''import nltk
nltk.download('omw-1.4')'''

"import nltk\nnltk.download('omw-1.4')"

In [38]:
from bs4 import BeautifulSoup
import re

In [39]:
def review_to_words(raw_review):
    # 1. Delete HTML 
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    # 2. Make a space
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    # 3. lower letters
    words = letters_only.lower().split()
    # 5. Stopwords 
    meaningful_words = [w for w in words if not w in stop]
    # 6. lemmitization
    lemmitize_words = [lemmatizer.lemmatize(w) for w in meaningful_words]
    # 7. space join words
    return( ' '.join(lemmitize_words))

In [41]:
X['review_clean'] = X['review'].apply(review_to_words)

  review_text = BeautifulSoup(raw_review, 'html.parser').get_text()


In [42]:
X.head()

Unnamed: 0,condition,review,review_clean
0,Left Ventricular Dysfunction,"It has no side effect, I take it in combinatio...",side effect take combination bystolic mg fish oil
1,ADHD,My son is halfway through his fourth week of I...,son halfway fourth week intuniv became concern...
2,Birth Control,"I used to take another oral contraceptive, whi...",used take another oral contraceptive pill cycl...
3,Birth Control,This is my first time using any form of birth ...,first time using form birth control glad went ...
4,Opiate Dependence,Suboxone has completely turned my life around....,suboxone completely turned life around feel he...


## Creating features and Target Variable

In [137]:
X_feat=X['review_clean']
y=X['condition']

In [138]:
# Total conditions (number of rows in the dataset)
total_conditions = df.shape[0]

# Total unique conditions
unique_conditions = df['condition'].nunique()

# Print the results
print(f"Total conditions: {total_conditions}")
print(f"Total unique conditions: {unique_conditions}")

Total conditions: 160398
Total unique conditions: 884


In [None]:
# Filter data by high ratings and useful counts
filtered_df = df[(df['rating'] >= 9) & (df['usefulCount'] >= 100)]

# Group by condition and count unique drug names
condition_drug_counts = filtered_df.groupby('condition')['drugName'].nunique()

# Sort conditions by the number of unique drug names
top_conditions = condition_drug_counts.sort_values(ascending=False).head(20)

# Filter the original dataset to include only the top conditions
selected_conditions = top_conditions.index
final_filtered_df = df[df['condition'].isin(selected_conditions)]

# Print the top conditions and their drug counts
print("Top conditions based on drug availability:")
print(top_conditions)

# Proceed with training using final_filtered_df
X_new = final_filtered_df['review']
y_new = final_filtered_df['condition']

# Train-test split
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.2, random_state=0, stratify=y_new)

In [142]:
# Filter data by high ratings and useful counts
filtered_df = df[(df['rating'] >= 9) & (df['usefulCount'] >= 100)]

# Group by condition and count unique drug names
condition_drug_counts = filtered_df.groupby('condition')['drugName'].nunique()

# Sort conditions by the number of unique drug names
top_conditions = condition_drug_counts.sort_values(ascending=False).head(20)

# Filter the original dataset to include only the top conditions
selected_conditions = top_conditions.index
final_filtered_df = df[df['condition'].isin(selected_conditions)]

# Print the top conditions and their drug counts
print("Top conditions based on drug availability:")
print(top_conditions)

# Proceed with training using final_filtered_df
X = final_filtered_df['review']
y = final_filtered_df['condition']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

Top conditions based on drug availability:
condition
Depression                     63
Pain                           43
Anxiety                        42
Bipolar Disorde                31
Insomnia                       29
ibromyalgia                    28
ADHD                           26
Rheumatoid Arthritis           26
High Blood Pressure            25
Panic Disorde                  22
Obesity                        22
Osteoarthritis                 18
Muscle Spasm                   17
Acne                           16
Birth Control                  16
Diabetes, Type 2               16
Anxiety and Stress             15
Major Depressive Disorde       14
Chronic Pain                   14
Generalized Anxiety Disorde    13
Name: drugName, dtype: int64


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [143]:
# Get the value counts for the filtered top conditions
filtered_condition_counts = final_filtered_df['condition'].value_counts()

# Calculate the total number of data points for the filtered conditions
total_filtered_data_points = filtered_condition_counts.sum()

# Print the total data points and the count for each filtered condition
print(f"Total data points in the filtered top conditions: {total_filtered_data_points}")
print("\nData points per filtered condition:")
for condition, count in filtered_condition_counts.items():
    print(f"{condition}: {count}")

Total data points in the filtered top conditions: 87838

Data points per filtered condition:
Birth Control: 28788
Depression: 9069
Pain: 6145
Anxiety: 5904
Acne: 5588
Bipolar Disorde: 4224
Insomnia: 3673
Obesity: 3568
ADHD: 3383
Diabetes, Type 2: 2554
High Blood Pressure: 2321
ibromyalgia: 1791
Anxiety and Stress: 1663
Major Depressive Disorde: 1607
Panic Disorde: 1463
Chronic Pain: 1455
Muscle Spasm: 1244
Osteoarthritis: 1239
Generalized Anxiety Disorde: 1164
Rheumatoid Arthritis: 995


In [134]:
n = 20 # Replace 50 with the desired number of top conditions
label_counts = df['condition'].value_counts()
selected_labels = label_counts.head(n).index  # Select the top n conditions
filtered_df = df[df['condition'].isin(selected_labels)]

filtered_df['review_clean'] = filtered_df['review'].apply(review_to_words)

X_filtered = filtered_df['review_clean']  # Assuming 'review_clean' is your feature column
y_filtered = filtered_df['condition']
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=0, stratify=y_filtered
)

  review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['review_clean'] = filtered_df['review'].apply(review_to_words)
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [133]:
label_counts.sum()

160398

In [136]:
# Get the value counts of the 'condition' column
label_counts = df['condition'].value_counts()

# Select the top 20 conditions
top_20_conditions = label_counts.head(20)
total_top_20 = top_20_conditions.sum()

print(f"Total data points in the top 20 conditions: {total_top_20}")
# Print the condition name and its count
for condition, count in top_20_conditions.items():
    print(f"Condition: {condition}, Count: {count}")

Total data points in the top 20 conditions: 94446
Condition: Birth Control, Count: 28788
Condition: Depression, Count: 9069
Condition: Pain, Count: 6145
Condition: Anxiety, Count: 5904
Condition: Acne, Count: 5588
Condition: Bipolar Disorde, Count: 4224
Condition: Insomnia, Count: 3673
Condition: Weight Loss, Count: 3609
Condition: Obesity, Count: 3568
Condition: ADHD, Count: 3383
Condition: Diabetes, Type 2, Count: 2554
Condition: Emergency Contraception, Count: 2463
Condition: High Blood Pressure, Count: 2321
Condition: Vaginal Yeast Infection, Count: 2274
Condition: Abnormal Uterine Bleeding, Count: 2096
Condition: Bowel Preparation, Count: 1859
Condition: ibromyalgia, Count: 1791
Condition: Smoking Cessation, Count: 1780
Condition: Migraine, Count: 1694
Condition: Anxiety and Stress, Count: 1663


In [94]:
# Save the list of top 50 conditions to a text file
with open("top_20_conditions.txt", "w") as file:
    for condition in selected_labels:
        file.write(f"{condition}\n")

In [95]:
stratify=y_filtered,

In [96]:
y=filtered_df['condition']

In [97]:
print(filtered_df.columns)


Index(['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date',
       'usefulCount', 'review_clean'],
      dtype='object')


In [98]:
y.values.reshape(1,-1)
y

1                              ADHD
2                     Birth Control
3                     Birth Control
6           Emergency Contraception
7                   Bipolar Disorde
9                     Birth Control
11                       Depression
14                    Birth Control
15                          Obesity
17                      ibromyalgia
18                  Bipolar Disorde
21                         Insomnia
22                    Birth Control
24          Vaginal Yeast Infection
27                             ADHD
29                         Migraine
31                       Depression
32                             Pain
33          Vaginal Yeast Infection
37                             Pain
38                Bowel Preparation
40          Vaginal Yeast Infection
41                          Obesity
44                       Depression
47                  Bipolar Disorde
50                 Diabetes, Type 2
51                             ADHD
52                          

In [99]:
#............................no need to execute.....................!!!!!
#X_train, X_test, y_train, y_test = train_test_split(X_feat, y, test_size=0.2, random_state=0)

In [93]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### Bag of Words

In [100]:
count_vectorizer = CountVectorizer(stop_words='english')

count_train = count_vectorizer.fit_transform(X_train)

count_test = count_vectorizer.transform(X_test)

In [101]:
count_train

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2340332 stored elements and shape (75556, 26586)>

In [102]:
from sklearn.metrics import classification_report


## Machine Learning Model : Passive Aggressive Classifier

In [103]:
all_labels = np.unique(y)

In [104]:
from sklearn.linear_model import PassiveAggressiveClassifier,LogisticRegression
from sklearn.metrics import classification_report

passive = PassiveAggressiveClassifier()
passive.fit(count_train, y_train)
print("##")
pred = passive.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("##")
print("accuracy:   %0.3f" % score)


classification_rep = classification_report(y_test, pred, target_names=all_labels)

# Print the classification report
print("Classification Report:")
print(classification_rep)
cm = metrics.confusion_matrix(y_test, pred)
all_labels = np.unique(y)
#plot_confusion_matrix(cm, classes=all_labels)


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


##
##
accuracy:   0.842


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Classification Report:
                           precision    recall  f1-score   support

                     ADHD       0.82      0.91      0.87       686
Abnormal Uterine Bleeding       0.57      0.51      0.54       393
                     Acne       0.92      0.92      0.92      1094
                  Anxiety       0.72      0.69      0.70      1188
       Anxiety and Stress       0.54      0.48      0.51       328
          Bipolar Disorde       0.74      0.80      0.76       844
            Birth Control       0.94      0.95      0.94      5724
        Bowel Preparation       0.97      0.97      0.97       375
               Depression       0.72      0.75      0.73      1820
         Diabetes, Type 2       0.90      0.88      0.89       507
  Emergency Contraception       0.94      0.96      0.95       486
      High Blood Pressure       0.86      0.80      0.83       469
                 Insomnia       0.87      0.75      0.80       757
                 Migraine       0.90  

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### TFIDF

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, max_features=5000)
tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer.transform(X_test)

In [106]:
###### Machine Learning Model : Naive Bayes

## Machine Learning Model TFIDF

In [107]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8,max_features=5000)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train, y_train)
pred = pass_tf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = metrics.confusion_matrix(y_test, pred, labels=['Birth Control', 'Depression','Diabetes, Type 2','High Blood Pressure'])
#plot_confusion_matrix(cm, classes=['Birth Control', 'Depression','Diabetes, Type 2','High Blood Pressure'])

classification_rep = classification_report(y_test, pred, target_names=all_labels)

# Print the classification report
print("Classification Report:")
print(classification_rep)

# cm = metrics.confusion_matrix(y_test, pred)
# all_labels = np.unique(y)
# plot_confusion_matrix(cm, classes=all_labels)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


accuracy:   0.851
Classification Report:
                           precision    recall  f1-score   support

                     ADHD       0.90      0.90      0.90       686
Abnormal Uterine Bleeding       0.63      0.49      0.55       393
                     Acne       0.93      0.92      0.93      1094
                  Anxiety       0.72      0.71      0.72      1188
       Anxiety and Stress       0.58      0.42      0.49       328
          Bipolar Disorde       0.73      0.81      0.77       844
            Birth Control       0.93      0.96      0.95      5724
        Bowel Preparation       0.97      0.97      0.97       375
               Depression       0.75      0.75      0.75      1820
         Diabetes, Type 2       0.85      0.89      0.87       507
  Emergency Contraception       0.97      0.95      0.96       486
      High Blood Pressure       0.86      0.82      0.84       469
                 Insomnia       0.84      0.81      0.83       757
                 Mig

## TFIDF: Bigrams

In [108]:
tfidf_vectorizer2 = TfidfVectorizer(stop_words='english', max_df=0.8,ngram_range=(1,2), max_features=5000)
tfidf_train_2 = tfidf_vectorizer2.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer2.transform(X_test)

In [109]:
from sklearn.metrics import classification_report

In [110]:
pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_2, y_train)
pred = pass_tf.predict(tfidf_test_2)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = metrics.confusion_matrix(y_test, pred, labels=['Birth Control', 'Depression','Diabetes, Type 2','High Blood Pressure'])
#plot_confusion_matrix(cm, classes=['Birth Control', 'Depression','Diabetes, Type 2','High Blood Pressure'])

classification_rep = classification_report(y_test, pred, target_names=all_labels)

# Print the classification report
print("Classification Report:")
print(classification_rep)

# cm = metrics.confusion_matrix(y_test, pred)
# all_labels = np.unique(y)
#plot_confusion_matrix(cm, classes=all_labels)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


accuracy:   0.846
Classification Report:
                           precision    recall  f1-score   support

                     ADHD       0.87      0.90      0.89       686
Abnormal Uterine Bleeding       0.62      0.56      0.59       393
                     Acne       0.92      0.92      0.92      1094
                  Anxiety       0.69      0.74      0.71      1188
       Anxiety and Stress       0.53      0.48      0.51       328
          Bipolar Disorde       0.78      0.78      0.78       844
            Birth Control       0.94      0.95      0.95      5724
        Bowel Preparation       0.96      0.97      0.97       375
               Depression       0.79      0.70      0.74      1820
         Diabetes, Type 2       0.86      0.87      0.86       507
  Emergency Contraception       0.97      0.97      0.97       486
      High Blood Pressure       0.85      0.80      0.82       469
                 Insomnia       0.83      0.83      0.83       757
                 Mig

## TFIDF : Trigrams

In [113]:
tfidf_vectorizer3 = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,3),max_features=5000)
tfidf_train_3 = tfidf_vectorizer3.fit_transform(X_train)
tfidf_test_3 = tfidf_vectorizer3.transform(X_test)

pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_3, y_train)
pred = pass_tf.predict(tfidf_test_3)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = metrics.confusion_matrix(y_test, pred, labels=['Birth Control', 'Depression','Diabetes, Type 2','High Blood Pressure'])
#plot_confusion_matrix(cm, classes=['Birth Control', 'Depression','Diabetes, Type 2','High Blood Pressure'])

classification_rep = classification_report(y_test, pred, target_names=all_labels)

# Print the classification report
print("Classification Report:")
print(classification_rep)

# cm = metrics.confusion_matrix(y_test, pred)
# all_labels = np.unique(y)
# plot_confusion_matrix(cm, classes=all_labels)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


accuracy:   0.846
Classification Report:
                           precision    recall  f1-score   support

                     ADHD       0.87      0.90      0.89       686
Abnormal Uterine Bleeding       0.64      0.50      0.56       393
                     Acne       0.93      0.90      0.92      1094
                  Anxiety       0.72      0.70      0.71      1188
       Anxiety and Stress       0.54      0.43      0.48       328
          Bipolar Disorde       0.75      0.79      0.77       844
            Birth Control       0.93      0.96      0.95      5724
        Bowel Preparation       0.96      0.97      0.97       375
               Depression       0.75      0.73      0.74      1820
         Diabetes, Type 2       0.88      0.86      0.87       507
  Emergency Contraception       0.96      0.96      0.96       486
      High Blood Pressure       0.85      0.81      0.83       469
                 Insomnia       0.84      0.80      0.82       757
                 Mig

In [114]:
from sklearn.metrics import classification_report

# Assuming you have the true labels y_test and predicted labels pred
classification_rep = classification_report(y_test, pred, target_names=all_labels)

# Print the classification report
print("Classification Report:")
print(classification_rep)


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Classification Report:
                           precision    recall  f1-score   support

                     ADHD       0.87      0.90      0.89       686
Abnormal Uterine Bleeding       0.64      0.50      0.56       393
                     Acne       0.93      0.90      0.92      1094
                  Anxiety       0.72      0.70      0.71      1188
       Anxiety and Stress       0.54      0.43      0.48       328
          Bipolar Disorde       0.75      0.79      0.77       844
            Birth Control       0.93      0.96      0.95      5724
        Bowel Preparation       0.96      0.97      0.97       375
               Depression       0.75      0.73      0.74      1820
         Diabetes, Type 2       0.88      0.86      0.87       507
  Emergency Contraception       0.96      0.96      0.96       486
      High Blood Pressure       0.85      0.81      0.83       469
                 Insomnia       0.84      0.80      0.82       757
                 Migraine       0.86  

In [115]:
import numpy as np
import joblib
# Quantize the model coefficients
pass_tf.coef_ = np.round(pass_tf.coef_, decimals=2)
pass_tf.intercept_ = np.round(pass_tf.intercept_, decimals=2)

# Save the quantized model
joblib.dump(pass_tf, 'model/passmodel_quantized_compressed.pkl', compress=3)
joblib.dump(tfidf_vectorizer3, 'model/tfidfvectorizer_compressed.pkl', compress=3)

['model/tfidfvectorizer_compressed.pkl']

In [66]:
joblib.dump(pass_tf, 'model/passmodel_compressed.pkl', compress=3)

['model/passmodel_compressed.pkl']

In [None]:
from scipy.sparse import save_npz

# Save the sparse matrix
save_npz('model/tfidfvectorizer_sparse.npz', tfidf_vectorizer3)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming you have the true labels y_test and predicted labels pred
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')

# Print the metrics
print(f"Weighted Precision: {precision}")
print(f"Weighted Recall: {recall}")
print(f"Weighted F1 Score: {f1}")


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

Weighted Precision: 0.8682367703764733
Weighted Recall: 0.8665827819945611
Weighted F1 Score: 0.8642307298777941


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


## Most important Features

In [116]:
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names_out()
    topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print (classlabel, feat, coef)



most_informative_feature_for_class(tfidf_vectorizer3, pass_tf, 'Birth Control')

Birth Control skyla 11.74
Birth Control liletta 11.89
Birth Control th grade 12.1
Birth Control zarah 12.11
Birth Control implant 12.35
Birth Control implanon 12.62
Birth Control lutera 12.73
Birth Control reclipsen 13.03
Birth Control paraguard 13.75
Birth Control nexplanon 20.3


In [None]:
most_informative_feature_for_class(tfidf_vectorizer3, pass_tf, 'Depression')

Depression bupropion 3.1758191006805907
Depression nardil 3.862534738507442
Depression brintellix 4.17057133747881
Depression parnate 4.1891617295523815
Depression deplin 4.614743467760839
Depression wellbutrin 4.844563557400706
Depression antidepressant 4.853348360743283
Depression viibryd 5.533745981348509
Depression pristiq 6.477945890345403
Depression depression 9.763257096661231


In [None]:
most_informative_feature_for_class(tfidf_vectorizer3, pass_tf, 'Anxiety')

Anxiety lexapro 3.1328920373133924
Anxiety lorazepam 3.30384115671562
Anxiety diazepam 3.530801226152978
Anxiety buspirone 3.7399243905438406
Anxiety ativan 3.75169544305828
Anxiety xanax 3.9689602200286678
Anxiety vistaril 4.954926039397752
Anxiety valium 5.33974305559199
Anxiety buspar 7.358825435456807
Anxiety anxiety 9.049563813089549


In [None]:
most_informative_feature_for_class(tfidf_vectorizer3, pass_tf, 'Pain')

Pain oxycontin 3.672655858241305
Pain dilaudid 3.728728566840897
Pain hysingla 3.8093899186641567
Pain darvocet 3.855817627346075
Pain vicodin 3.8629576889684585
Pain percocet 4.177648160894721
Pain opana 5.572840135354165
Pain pain 6.027310525551328
Pain toradol 6.204980168014917
Pain nucynta 7.686489873208824


## Sample Predictions

In [None]:
X.tail()

Unnamed: 0,condition,review,review_clean
161292,Alcohol Dependence,I wrote my first report in Mid-October of 2014...,wrote first report mid october alcohol since p...
161293,Nausea/Vomiting,I was given this in IV before surgey. I immedi...,given iv surgey immediately became anxious cou...
161294,Rheumatoid Arthritis,"Limited improvement after 4 months, developed ...",limited improvement month developed bad rash m...
161295,Underactive Thyroid,"I&#039;ve been on thyroid medication 49 years,...",thyroid medication year spent first synthroid ...
161296,"Constipation, Chronic",I&#039;ve had chronic constipation all my adul...,chronic constipation adult life tried linz wor...


In [117]:
## Function for Extracting Top drugs

def top_drugs_extractor(condition):
    df_top = df[(df['rating']>=9)&(df['usefulCount']>=100)].sort_values(by = ['rating', 'usefulCount'], ascending = [False, False])
    drug_lst = df_top[df_top['condition']==condition]['drugName'].head(3).tolist()
    return drug_lst

In [118]:
def predict_text(lst_text):
    df_test = pd.DataFrame(lst_text, columns = ['test_sent'])
    df_test["test_sent"] = df_test["test_sent"].apply(review_to_words)
    tfidf_bigram = tfidf_vectorizer3.transform(lst_text)
    prediction = pass_tf.predict(tfidf_bigram)
    df_test['prediction']=prediction
    return df_test

In [119]:
sentences = [
  "I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations.",
    "This is the third med I&#039;ve tried for anxiety and mild depression. Been on it for a week and I hate it so much. I am so dizzy, I have major diarrhea and feel worse than I started. Contacting my doc in the am and changing asap.",
    "I just got diagnosed with type 2. My doctor prescribed Invokana and metformin from the beginning. My sugars went down to normal by the second week. I am losing so much weight. No side effects yet. Miracle medicine for me",
    
  ]

In [None]:
tfidf_trigram = tfidf_vectorizer3.transform(sentences)


predictions = pass_tf.predict(tfidf_trigram)
predictions

array(['High Blood Pressure', 'Depression', 'Diabetes, Type 2'],
      dtype='<U28')

In [None]:
tfidf_trigram = tfidf_vectorizer3.transform(sentences)


predictions = pass_tf.predict(tfidf_trigram)

for text, label in zip(sentences, predictions):
    if label=="Anxiety":
        target="Anxiety"
        top_drugs = top_drugs_extractor(label)
        print("text:", text, "\nCondition:", target)
        print("Top 3 Suggested Drugs:")
        print(top_drugs[0])
        print(top_drugs[1])
        print(top_drugs[2])
        print()
    elif label=="Depression":
        target="Depression"
        top_drugs = top_drugs_extractor(label)
        print("text:", text, "\nCondition:", target)
        print("Top 3 Suggested Drugs:")
        print(top_drugs[0])
        print(top_drugs[1])
        print(top_drugs[2])
        print()
    elif label=="Pain":
        target="Pain"
        top_drugs = top_drugs_extractor(label)
        print("text:", text, "\nCondition:", target)
        print("Top 3 Suggested Drugs:")
        print(top_drugs[0])
        print(top_drugs[1])
        print(top_drugs[2])
        print()
    else:
        target="Birth Control"
        print("text:", text, "\Condition:", target)
        top_drugs = top_drugs_extractor(label)
        print("text:", text, "\nCondition:", target)
        print("Top 3 Suggested Drugs:")
        print(top_drugs[0])
        print(top_drugs[1])
        print(top_drugs[2])
        print()

text: I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations. \Condition: Birth Control
text: I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations. 
Condition: Birth Control
Top 3 Suggested Drugs:
Losartan
Aldactone
Spironolactone

text: This is th

In [None]:
df_testsent = predict_text(sentences)
df_testsent

Unnamed: 0,test_sent,prediction
0,tekturna day effect immediate also calcium cha...,High Blood Pressure
1,third med tried anxiety mild depression week h...,Depression
2,got diagnosed type doctor prescribed invokana ...,"Diabetes, Type 2"


In [None]:
import joblib
joblib.dump(tfidf_vectorizer3, 'model/tfidfvectorizer.pkl')
joblib.dump(pass_tf, 'model/passmodel.pkl')

['model/passmodel.pkl']

In [None]:
vectorizer = joblib.load('tfidfvectorizer.pkl')
model = joblib.load('passmodel.pkl')

test = model.predict(vectorizer.transform(["I&#039;ve been taking Lexapro (escitaploprgram) since February. First, I&#039;d like to mention that you can NOT take this drug for a week or less and expect to magically feel better; I felt really sick the first two weeks on this drug. But you HAVE to give the drug time. For me, I didn&#039;t really start noticing the drugs positive effects for about two months. I took Zoloft before this and felt like it made me too tired and absent-minded. Luckily, Lexapro doesn&#039;t seem to have this effect (although I do drink caffeinated drinks). I like Lexapro not only because my anxiety and depression is completely gone, but I feel like I can finally handle everything in my life now (I&#039;m a working full-time college student). I highly recommend this drug."]))
test[0]

'Depression'

In [120]:
vectorizer = joblib.load('model/tfidfvectorizer_compressed.pkl')
model = joblib.load('model/passmodel_quantized_compressed.pkl')



In [122]:
test = model.predict(vectorizer.transform(["Migraine"]))
top_drugs_extractor(test[0])

['Gabapentin', 'Gabapentin', 'Gabapentin']