##### Deep Learing ###

1) Data Cleaning

In [84]:
from logic.processing import load_data, preproc
import os
import string
import nltk


import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import models  
from tensorflow.keras.layers import Embedding, LSTM, Dense, InputLayer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Flotchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Flotchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Flotchi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/Flotchi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/Flotchi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
path = os.getcwd()

In [5]:
rpath_train = os.path.join(os.path.dirname(path),'raw_data', 'drugsComTrain_raw.csv') 
df_train = pd.read_csv(rpath_train)

In [6]:
rpath_test = os.path.join(os.path.dirname(path),'raw_data', 'drugsComTest_raw.csv') 
df_test = pd.read_csv(rpath_test)

In [7]:
def data_filter(df,top_conditions=10,uselfCount=0,min_length=30):
    
    df = df.dropna()
    condition_distribution = df['condition'].value_counts()
    top_conditions = condition_distribution.head(top_conditions)
    top_conditions_list = top_conditions.index.tolist()
    df = df[df['condition'].isin(top_conditions_list)]
    df = df[df['usefulCount']>uselfCount]
    df['review_length'] = df['review'].apply(lambda x: len(str(x).split()))
    df = df[df.review_length>=min_length]
    
    return df

In [8]:
def process(df):
    def clean(st):
        for punc in string.punctuation:
            st = st.replace(punc, '')
        ans = st.casefold().replace('\n', ' ')
        ansd = ''.join(x for x in ans if not x.isdigit())
        stop = set(stopwords.words('english'))
        tokens = word_tokenize(ansd)
        ansdd = [y for y in tokens if y not in stop]
        lemmaverb = [WordNetLemmatizer().lemmatize(word, pos='v') for word in ansdd]
        lemmanouns = [WordNetLemmatizer().lemmatize(word, pos='n') for word in lemmaverb]
        nans = ' '.join(lemmanouns)
        return nans
    df['review clean'] = df.review.apply(clean)
    return df

In [9]:
df_filter_train = data_filter(df_train)
df_filter_test = data_filter(df_test)

In [10]:
df_proc_train = process(df_filter_train)
df_proc_test = process(df_filter_test)

In [11]:
X_train = df_proc_train['review clean']
X_test = df_proc_test['review clean']
y_train = df_proc_train['condition']
y_test = df_proc_test['condition']

3) Tokenizer

In [75]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

5) Label encoding

In [60]:
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()


y_train_encoded = label_encoder.fit_transform(y_train) 
y_test_encoded = label_encoder.transform(y_test)

In [61]:
from tensorflow.keras.utils import to_categorical

y_train_onehot = to_categorical(y_train_encoded, num_classes=10)
y_test_onehot = to_categorical(y_test_encoded, num_classes=10)

5) Padding

In [62]:
max_len = 200

In [76]:
X_train_pad = pad_sequences(X_train_token, padding='post',maxlen=max_len)
X_test_pad = pad_sequences(X_test_token, padding='post',maxlen=max_len)

6.1) Model A

In [77]:
def initialize_model():
    model = models.Sequential()
    model.add(InputLayer(shape=(200,)))
    model.add(Embedding(10000, 128))
    model.add(LSTM(64))
    model.add(Dense(10, activation='softmax'))
  
    return model

In [78]:
model = initialize_model()

In [79]:
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [80]:
model.summary()

In [81]:
history = model.fit(X_train_pad, y_train_onehot, epochs=5, batch_size=64)

Epoch 1/5
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 156ms/step - accuracy: 0.3911 - loss: 1.9814
Epoch 2/5
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 157ms/step - accuracy: 0.3938 - loss: 1.9722
Epoch 3/5
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 143ms/step - accuracy: 0.3887 - loss: 1.9779
Epoch 4/5
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 144ms/step - accuracy: 0.3907 - loss: 1.9735
Epoch 5/5
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 145ms/step - accuracy: 0.3903 - loss: 1.9759


In [82]:
predictions = model.predict(X_test_pad)

[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 31ms/step


In [86]:
cm = confusion_matrix(y_test_onehot, predictions)

ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets

In [None]:
plt.figure(figsize=(10, 8))
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_train))
cm_display.plot(cmap='Blues', xticks_rotation=45)
plt.title('Confusion Matrix')
plt.show()