# Exploratory Data Analysis

## Import libraries

In [1]:
import pandas as pd

d:\apps\python 3.7.2\lib\site-packages\numpy\.libs\libopenblas.IPBC74C7KURV7CB2PKT5Z5FNR3SIBV4J.gfortran-win_amd64.dll
d:\apps\python 3.7.2\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
  stacklevel=1)


### Load data

In [2]:
df = pd.read_csv('Train.csv', sep=';')
df.head()

Unnamed: 0,name,condition,opinion,rate,rate1
0,Zegerid,GERD,"""Using it as a replacement for Nexium, since i...",10,high
1,Ethosuximide,Seizures,"""This medicine is very good at controlling me ...",10,high
2,Tri-Sprintec,Birth Control,"""I just started taking Tri Sprintec after my l...",9,high
3,Levaquin,Pneumonia,"""This medicine made me feel absolutely horribl...",5,medium
4,Methylphenidate,ADHD,"""I&#039;ve been taking Concerta since 2003. Fo...",9,high


In [3]:
no_examples = len(df)
no_examples
no_classes_1 = 10
no_classes_2 = 3

In [None]:
df['opinion'].str.len().mean()

In [None]:
df['opinion'].str.len().max()

In [None]:
df['opinion'].str.len().min()


In [None]:
df['opinion'].str.len().hist(bins=200)

In [None]:
len(df[df['opinion'].str.len() < 1000])

In [None]:
df[df['opinion'].str.len() < 1000]['opinion'].str.len().hist(bins=200)

In [None]:
df.groupby(by='rate').name.count()

In [None]:
df.plot.hist(by='rate')


In [4]:
df['op_len'] = df['opinion'].str.len()

In [None]:
print(df.corr())
print(df[df['op_len'] < 1000 ].corr())

In [None]:
df[df['op_len'] < 1000 ].corr().style.background_gradient(cmap='coolwarm')


# Data cleaning

In [5]:
from html import unescape

Remove html escaping

In [6]:
df.update(df[df['opinion'].str.contains('&')]['opinion'].apply(unescape))
df.opinion.head()

0    "Using it as a replacement for Nexium, since i...
1    "This medicine is very good at controlling me ...
2    "I just started taking Tri Sprintec after my l...
3    "This medicine made me feel absolutely horribl...
4    "I've been taking Concerta since 2003. For me ...
Name: opinion, dtype: object

remove trailing "

In [7]:
df['opinion'] = df['opinion'].str[1:-1]
df.opinion.head()

0    Using it as a replacement for Nexium, since in...
1    This medicine is very good at controlling me s...
2    I just started taking Tri Sprintec after my la...
3    This medicine made me feel absolutely horrible...
4    I've been taking Concerta since 2003. For me i...
Name: opinion, dtype: object

In [8]:
df[df['opinion'].str.len() < 2]


Unnamed: 0,name,condition,opinion,rate,rate1,op_len
37629,Linagliptin / metformin,min),G,9,high,3
73143,Keppra,Neuralgia,I,9,high,3
95279,Ifex,Testicular Cance,-,10,high,3
99512,Ifosfamide,Testicular Cance,-,10,high,3
136845,Clonazepam,Anxiety,I,8,high,3
137422,Levetiracetam,Neuralgia,I,9,high,3


In [9]:
df = df[df['opinion'].str.len() > 2]

In [10]:
df[df['opinion'].str.contains('\n')].count()

name         16731
condition    16660
opinion      16731
rate         16731
rate1        16731
op_len       16731
dtype: int64

# N-Grams & logistic regression

In [None]:
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


In [None]:
train, test = train_test_split(df, test_size=0.2)

In [None]:
print(train.shape, test.shape)

In [None]:
tfidf_vect = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
X_train_tfidf = tfidf_vect.fit_transform(train['opinion'])
len(tfidf_vect.vocabulary_)


In [None]:
indices = np.argsort(tfidf_vect.idf_)[::-1]
features = tfidf_vect.get_feature_names()
[features[i] for i in indices[:100]]

In [None]:
lr = LogisticRegression(solver='saga',        
                        multi_class='multinomial',
                        penalty='l1',
                        random_state=42)
lr.fit(X_train_tfidf, train['rate'])

In [None]:
X_test_tfidf = tfidf_vect.transform(test['opinion'])

In [None]:
predicted = lr.predict(X_test_tfidf)

accuracy = np.sum(predicted == test['rate']) / predicted.shape[0]
print(accuracy)

In [None]:
error = np.sum(np.abs(predicted - test['rate'])) / predicted.shape[0]
print(error)

In [None]:
pd.DataFrame(data=np.abs(predicted - test['rate'])).plot.hist()

Get drug and disease names for stop words

In [None]:
drug_names = df['name'].str.lower().unique()
drug_names

In [None]:
disease_names = df['condition'].str.lower().dropna()

disease_names = disease_names[~disease_names.str.contains('</span>')].unique()

In [None]:
stop_words = list(disease_names) + list(drug_names)
stop_words

In [None]:
tfidf_vect = TfidfVectorizer(ngram_range=(1,2), max_features=15000, sublinear_tf=True)
X_train_tfidf = tfidf_vect.fit_transform(train['opinion'])
len(tfidf_vect.vocabulary_)


In [None]:
lr = LogisticRegression(solver='saga',        
                        multi_class='multinomial',
                        penalty='l1',
                        random_state=42)
lr.fit(X_train_tfidf, train['rate'])

In [None]:
X_test_tfidf = tfidf_vect.transform(test['opinion'])

In [None]:
predicted = lr.predict(X_test_tfidf)

accuracy = np.sum(predicted == test['rate']) / predicted.shape[0]
print(accuracy)

In [None]:
error = np.sum(np.abs(predicted - test['rate'])) / predicted.shape[0]
print(error)

In [None]:
indices = np.argsort(tfidf_vect.idf_)[::-1]
features = tfidf_vect.get_feature_names()
[(features[i], tfidf_vect.idf_[i]) for i in indices[13000:13100]]

Examples of BIG mistakes

In [None]:
def f(x):
    if x > 7:
        return 'high'
    if x > 3:
        return 'medium'
    return 'low'

accuracy1 = np.sum(np.array([f(p) for p in predicted]) == test['rate1']) / predicted.shape[0]
print(accuracy1)
for p, r , r2 in zip(predicted, test['opinion'], test['rate']):
    if abs(p-r2) > 6:
        print('opinion=', r, 'predicted=', p, 'rate=',r2)

# LSTM Network

In [23]:
from sklearn.model_selection import train_test_split
import keras
from keras import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam
from keras.utils import Sequence
import numpy as np
import nltk
import re
from nltk.corpus import stopwords

In [12]:
# Clean the data up
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
UTF_CODES = re.compile('#\d+')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = UTF_CODES.sub('', text)
    # We do not remove stop words for LSTM because having a 'not' in a sentence is actually kind of useful
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

In [13]:
no = 10
print(df['opinion'][no])
print('\n\n')
df['opinion'] = df['opinion'].map(clean_text)
print(df['opinion'][no])
print(f"rate = {df['rate'][no]}")

This drug came after a bad reaction to Metformin.  It is definitely not any better.  From day one I have had diarrhea.  And the heaviness in my arms is still present.  Now the leg spasms are showing up but it is the abdominal pain and the excruciating back pain that take this drug out of the running.  I am coming off of it.  I can't afford for this drug to kill me.  Nausea has been a problem within 3 days of beginning it. Be wary. . .be alert.



this drug came after a bad reaction to metformin  it is definitely not any better  from day one i have had diarrhea  and the heaviness in my arms is still present  now the leg spasms are showing up but it is the abdominal pain and the excruciating back pain that take this drug out of the running  i am coming off of it  i cant afford for this drug to kill me  nausea has been a problem within 3 days of beginning it be wary  be alert
rate = 1


In [14]:
##########################################################################
# vectorize

characters = set()
for opinion in df['opinion']:
    characters.update(opinion)
    
    
def encode_x(x):
    ret = np.zeros((len(x), len(characters)))
    for i, c in enumerate(x):
        ret[i, characters[c]] = 1
    return ret


characters = dict(zip(characters, range(len(characters))))
X = df['opinion'].map(encode_x).to_numpy()
Y = df['rate'].to_numpy()

# One hot encoding
labels = np.zeros(Y.shape[0], no_classes_1)
for i, elem in enumerate(Y):
    labels[i][elem-1] = 1
Y = labels


TypeError: 'numpy.float64' object does not support item assignment

In [19]:
# Define network
def create_model():
    model = Sequential()
    model.add(LSTM(64, input_shape=(None, len(characters))))
    model.add(Dense(10, activation='softmax'))
    
    optimizer = Adam(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model

model = create_model()

W0516 14:59:57.930961 12072 deprecation_wrapper.py:119] From d:\apps\python 3.7.2\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0516 15:00:00.662924 12072 deprecation_wrapper.py:119] From d:\apps\python 3.7.2\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0516 15:00:01.071328 12072 deprecation_wrapper.py:119] From d:\apps\python 3.7.2\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0516 15:00:03.198310 12072 deprecation_wrapper.py:119] From d:\apps\python 3.7.2\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0516 15:00:03.208348 12072 deprecation_wrapper.py:119] From d:\apps\python 3.7.2\lib\site-packa

In [20]:
x_train, x_validation, y_train, y_validation =\
    train_test_split(X, Y, test_size=0.1, random_state=1)

(149990,) (149990, 10)


In [None]:
class CustomBatchGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, X, y, batch_size=1, shuffle=True):
        'Initialization'
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.y)/self.batch_size))

    def __getitem__(self, index):
        return self.__data_generation(index)

    def on_epoch_end(self):
        'Shuffles indexes after each epoch'
        self.indexes = np.arange(len(self.y))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, index):
        Xb = np.empty((self.batch_size,) + self.X[index].shape)
        yb = np.empty((self.batch_size,) + self.y[index].shape)
        # naively use the same sample over and over again
        for s in range(0, self.batch_size):
            Xb[s] = self.X[index]
            yb[s] = self.y[index]
        return Xb, yb

model.fit_generator(CustomBatchGenerator(x_train, y_train, batch_size=1), epochs=30)

Epoch 1/30


In [151]:
x_train[0]

array([24, 38, 10, 25, 25, 23, 38, 10, 18, 24, 14, 38,  6, 24, 32, 32, 38,
        1, 25, 37, 38,  4, 38, 31, 27, 11, 37, 14, 38, 11,  3, 35, 38, 24,
        3, 38, 10, 18, 25, 14, 27, 38, 10, 18, 37, 27, 27, 38, 31, 27, 11,
       37, 14, 38, 24, 38, 10, 18, 25,  0,  5, 18, 10, 38,  3, 25, 10, 18,
       24,  3,  5, 38, 20, 11, 14, 38, 20, 37, 25,  3,  5, 38,  7,  0, 10,
       38, 24, 38, 20, 11, 14, 38, 37, 27, 11, 32, 32, 31, 38, 27, 22, 25,
       10, 24, 25,  3, 11, 32, 38, 11,  3, 35, 38, 24, 38,  5, 25, 10, 38,
       35, 27,  6, 37, 27, 14, 14, 27, 35, 38,  7, 27,  1, 25, 37, 27, 38,
       22, 31, 38,  6, 27, 37, 24, 25, 35, 14, 38, 24, 38, 10, 18, 25,  0,
        5, 18, 10, 38, 24, 38, 18, 11, 35, 38, 19,  0, 14, 10, 38, 16, 18,
       11,  3,  5, 27, 35, 38,  7,  0, 10, 38, 11, 14, 38, 14, 25, 25,  3,
       38, 11, 14, 38, 24, 38, 20, 27,  3, 10, 38, 25,  1,  1, 38, 10, 18,
       24, 14, 38,  6, 24, 32, 32, 38, 11, 32, 32, 38, 10, 18, 25, 14, 27,
       38, 10, 18, 24,  3

In [152]:
df['opinion'].map(len).max()


10145

(array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]),
 array([10, 10,  9], dtype=int64),
 (149990, 10))