## Sentiment Analysis. 

## EDA and Feature Engineering

In [105]:
## Importing the necessary libraries
import pandas as pd
import nltk 
import numpy as np

In [106]:
## Loading the data

data = pd.read_csv('C:/Internship/train.txt',sep = ';',header=None)
data_val = pd.read_csv('C:/Internship/val.txt',sep = ';',header=None)
data_test = pd.read_csv('C:/Internship/test.txt',sep = ';',header=None)

In [107]:
## Index number for unique id

data['TextId'] = data.index
data_val['TextId'] = data_val.index
data_test['TextId'] = data_val.index

## Defining the columns

data.columns = ['Text','Mood','TextID']
data_val.columns = ['Text','Mood','TextID']
data_test.columns = ['Text','Mood','TextID']

In [108]:
## Maping the values to o and 1. Where 0: Negative, 1:Positive
data['Mood'].unique()
g = {'sadness':0,'anger':0,'love':1,'surprise':1,'fear':0,'joy':1}

data['Sentiment'] = data['Mood'].map(g)
data_val['Sentiment'] = data_val['Mood'].map(g)
data_test['Sentiment'] = data_test['Mood'].map(g)

In [109]:
y = data['Sentiment']
y_val = data_val['Sentiment']
y_test = data_test['Sentiment']

In [110]:
## Data cleaning round 1 

import re
import string

def clean_text_round1(text):
    ## Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [111]:
# Let's take a look at the updated text
data['Text_selected'] = pd.DataFrame(data.Text.apply(round1))
data.head()

Unnamed: 0,Text,Mood,TextID,Sentiment,Text_selected
0,i didnt feel humiliated,sadness,0,0,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,1,0,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong,anger,2,0,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,3,1,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy,anger,4,0,i am feeling grouchy


In [112]:
# Apply a second round of cleaning
def clean_text_round2(text):
    ###Get rid of some additional punctuation and non-sensical text that was missed the first time around.
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [113]:
data['Text_selected'] = pd.DataFrame(data.Text.apply(round2))
data.head()

Unnamed: 0,Text,Mood,TextID,Sentiment,Text_selected
0,i didnt feel humiliated,sadness,0,0,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,1,0,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong,anger,2,0,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,3,1,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy,anger,4,0,i am feeling grouchy


In [114]:
### Applying the same for validation dataset.

data_val['Text_selected'] = pd.DataFrame(data_val.Text.apply(round1))
data_val['Text_selected'] = pd.DataFrame(data_val.Text.apply(round2))

In [115]:
## Applying the same for test dataset. 

data_test['Text_selected'] = pd.DataFrame(data_test.Text.apply(round1))
data_test['Text_selected'] = pd.DataFrame(data_test.Text.apply(round2))

In [116]:


data.to_csv('train.csv')
data_val.to_csv('val.csv')
data_test.to_csv('test.csv')

## Now the text is cleaned. We will build a bidirectional LSTM model with word embedding. (document-matrix)

In [117]:
## Creating the model 
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.models import Sequential 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM,Bidirectional
from tensorflow.keras.layers import Dropout


In [118]:
## Defining the vocabulary size. 
voc_size=5000
## Embedding Representation for training set
onehot_repr=[one_hot(words,voc_size)for words in data['Text_selected']] 

max_len = 20
embedded_docs = pad_sequences(onehot_repr,padding='pre',maxlen=max_len)

In [119]:
## Embedding Representation for validation set

onehot_repr_val=[one_hot(words,voc_size)for words in data_val['Text_selected']] 

embedded_docs_val = pad_sequences(onehot_repr_val,padding='pre',maxlen=max_len)

## Embedding Representation for test set

onehot_repr_test=[one_hot(words,voc_size)for words in data_test['Text_selected']] 

embedded_docs_test = pad_sequences(onehot_repr_test,padding='pre',maxlen=max_len)

## Building the model

In [120]:
embedding_vector_size = 40
model = Sequential()
model.add(Embedding(voc_size,embedding_vector_size,input_length =max_len))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.3))        
model.add(Dense(units=1,activation='sigmoid'))
model.compile('Adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()



X_train = np.array(embedded_docs)
y_train = np.array(y)

X_val =  np.array(embedded_docs_val)
y_val = np.array(y_val)

X_test = np.array(embedded_docs_test)
y_test = np.array(y_test)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 200)               112800    
_________________________________________________________________
dropout_4 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________


## Training the model

In [121]:
model.fit(X_train,y_train,batch_size=64,epochs=10,validation_data=(X_val,y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1eddd0fb700>

## Evaluating the model on validation set. 

In [124]:
y_pred_val=model.predict_classes(X_val)

from sklearn.metrics import accuracy_score,confusion_matrix, classification_report

score = accuracy_score(y_pred_val,y_val)
matrix = confusion_matrix(y_pred_val,y_val)
cls_report = classification_report(y_pred_val,y_val)



In [125]:
print(score)
print(matrix)
print(cls_report)

0.8895
[[938 122]
 [ 99 841]]
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      1060
           1       0.87      0.89      0.88       940

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000



## Evaluating the model on test set. 

In [126]:
y_pred_test=model.predict_classes(X_test)

score_test = accuracy_score(y_pred_test,y_test)
matrix_test = confusion_matrix(y_pred_test,y_test)
cls_report_test = classification_report(y_pred_test,y_test)



In [127]:
print(score_test)
print(matrix_test)
print(cls_report_test)

0.8765
[[954 121]
 [126 799]]
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      1075
           1       0.87      0.86      0.87       925

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.88      0.88      0.88      2000



In [128]:
y_pred = model.predict_classes(X_train)



In [129]:
## concating the predictions

data = pd.concat([data,pd.DataFrame(y_pred)],axis=1)
data.rename(columns={0:'Predictions'},inplace=True)

In [132]:
data_val = pd.concat([data_val,pd.DataFrame(y_pred_val)],axis=1)
data_val.rename(columns={0:'Predictions'},inplace=True)

In [133]:
data_test = pd.concat([data_test,pd.DataFrame(y_pred_test)],axis=1)
data_test.rename(columns={0:'Predictions'},inplace=True)

In [135]:
## Remaping to the final dataset. 

d = {0:'Negative',1:'Positive'}

data['Sentiment'] = data['Sentiment'].map(d)
data['Predictions'] = data['Predictions'].map(d)

data_val['Sentiment'] = data_val['Sentiment'].map(d)
data_val['Predictions'] = data_val['Predictions'].map(d)

data_test['Sentiment'] = data_test['Sentiment'].map(d)
data_test['Predictions'] = data_test['Predictions'].map(d)

In [138]:
## Converting into csv. 

data.to_csv('submission_train.csv')
data_val.to_csv('submission_val.csv')
data_test.to_csv('submission_test.csv')