## Import libraries

In [42]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re
import numpy as np

## Reading the data

In [43]:
df=pd.read_csv('train_data_restaurant.tsv', header=None, sep='\t')
df_test=pd.read_csv('test_data_restaurant.tsv', header=None, sep='\t')

In [44]:
df.columns = ["text", "class"]
df_test.columns = ["text", "class"]

## Data preprocessing

### Removing punctuation and lower casing every letter

In [45]:
# Data preprocessing on test data
df_test['text'] = df_test['text'].apply(lambda x: x.lower())
df_test['text'] = df_test['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(df_test[ df_test['class'] == 'positive'].size)
print(df_test[ df_test['class'] == 'negative'].size)

for idx,row in df_test.iterrows():
    row[0] = row[0].replace('rt','')

240
130


In [47]:
# Data preprocessing on training data
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(df[ df['class'] == 'positive'].size)
print(df[ df['class'] == 'negative'].size)

for idx,row in df.iterrows():
    row[0] = row[0].replace('rt','')

2400
1160


### Applying tokenization on sentences

In [48]:
# Tokenization on train data
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X)
X[:1]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    4,
          10,    3,   37,    5,  643, 1251,  399,   49,   33,  849,    5,
         930,  413,  483,  524,  236, 1119,    7, 1251, 1932, 1251,   76,
        1933,  274,    9,   13,  850,    7,   55,   64,   38, 1010,   48,
          88, 1251,  237,   44,   41,  105,    9,  105,  524,  236, 1119]])

In [49]:
# Tokenization on test data
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df_test['text'].values)
X_df_test = tokenizer.texts_to_sequences(df_test['text'].values)
X_df_test = pad_sequences(X_df_test)
X_df_test[:1]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         28,  44, 115, 194, 228,  22, 787, 497,  17,   2,  49,  19, 498,
          1, 368,   6, 788, 499,  19, 369,  23,  14, 166, 789,  25, 790,
         12, 136,  41,  14,  11,   4, 370, 283, 500, 501,  92, 167, 791,
        371,  23,   2,  49, 229,  46,  20, 792, 793, 500, 501]])

## Creating model using LSTM

In [50]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = None))#X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 128)         256000    
                                                                 
 spatial_dropout1d_4 (Spatia  (None, None, 128)        0         
 lDropout1D)                                                     
                                                                 
 lstm_4 (LSTM)               (None, 196)               254800    
                                                                 
 dense_4 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [52]:
#Y = df['class'].values
Y = pd.get_dummies(df['class']).values
Y_df_test = pd.get_dummies(df_test['class']).values

print(X.shape,Y.shape)
print(X_df_test.shape,Y_df_test.shape)

(1780, 77) (1780, 2)
(185, 76) (185, 2)


In [54]:
batch_size = 128
model.fit(X, Y, epochs = 15, batch_size=batch_size, verbose = 1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x128cbe1a670>

### Model evaluation

In [55]:
Y_pred = model.predict(X_df_test,batch_size = batch_size)
classes_x=np.argmax(Y_pred,axis=1)

In [57]:
results = pd.DataFrame({'true': Y_df_test.tolist(), 'pred':classes_x})
results['true'] = results['true'].apply(lambda x: np.argmax(x))
print("confusion matrix",confusion_matrix(results.true, results.pred))
print(classification_report(results.true, results.pred))

confusion matrix [[ 17  48]
 [ 19 101]]
              precision    recall  f1-score   support

           0       0.47      0.26      0.34        65
           1       0.68      0.84      0.75       120

    accuracy                           0.64       185
   macro avg       0.58      0.55      0.54       185
weighted avg       0.61      0.64      0.61       185



## Model summary and notes for furthur improvements

- What kind of preprocessing is done and the reason behind it
    There are 3 main preprocessing techniques that I have used
        - Punctuation removal: Punctuation marks such as commas, colons, and double quotes also have no effect on finding the context of the sentence for the model. Therefore, punctuation marks will be removed.
        - Lower case sentences: Uppercase and lowercase letters only help in structuring the sentences for humans, but for the model uppercase and lowercase words can be seen differently by the model but they have the same meaning so they can be grouped as the same word.
        Tokenization: The tokenization process divides words into tokens, which are then used as input in the normalization and cleaning processes. It can also be used to turn text into numerical form, which machine learning models can understand.
        
- Methods used to solve the problem and the reason behind it: 
    There are various multiple reaserch that indicates that LSTM models out-performs any other deep learning model when it comes to text classification. Although recently there have been research which started applying hybrid model such as mixing CNN+LSTM for better performance.

- Performance metrics used to test the model and the reason behind it: 
    I created a confusion matrix table which has all of the important metrics which can be used to evaluate every aspect of the category of the class that have predicted by the model. In this model, the model predicts 'positive' class much better as indicated by the precision and recall metrics. 
    
- Model performance analysis: 
    The model will most of the time predict the class 'positive' and there is a high chance that most of the negative sentiment will be classified as positive as shown by the confusion matrix and the metrics avialable within the confsuion matrix. The reason the model is acting this way is because the dataset is imblanced, in order to fix it, we need to apply downsampling to balance the data.

In [58]:
model.save("assessment_model")

INFO:tensorflow:Assets written to: assessment_model\assets


