In [1]:
# Importing basic libraries

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Importing libraries for sentiment analysis LSTM model

import tensorflow as tf
import keras
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, precision_score, recall_score
from keras.models import Sequential
from keras.layers import Dense,Dropout, BatchNormalization
from keras.callbacks import EarlyStopping
from keras.models import load_model


In [2]:
# Reading the dataset

data = pd.read_csv('airline_df_nlped.csv')

In [3]:
data.head()

Unnamed: 0,Country,Airline,Review,Cleaned_Review,Sentiment,Review2,Cleaned_Review2
0,China,Air China,los angeles to beijing return. food low qualit...,los angeles to beijing return . food low quali...,Negative,los angeles to beijing return. food low qualit...,los angeles to beijing return . food low quali...
1,China,Air China,round to trip from hong kong to munich. the ma...,round to trip from hong kong to munich . the m...,Negative,round to trip from hong kong to munich. the ma...,round to trip from hong kong to munich . the m...
2,China,Air China,sydney to beijing to paris then rome to beijin...,sydney to beijing to paris then rome to beijin...,Negative,sydney to beijing to paris then rome to beijin...,sydney to beijing to paris then rome to beijin...
3,China,Air China,london to sydney return via beijing. a cheap f...,london to sydney return via beijing . a cheap ...,Negative,london to sydney return via beijing. a cheap f...,london to sydney return via beijing . a cheap ...
4,China,Air China,beijing to shanghai. only one check to in desk...,beijing to shanghai . only one check to in des...,Positive,beijing to shanghai. only one check to in desk...,beijing to shanghai . only one check to in des...


In [4]:
data.shape

(15206, 7)

In [12]:
data['Review'][0]

'los angeles to beijing return. food low quality staff appeared when time for mandatory service large screen viewing on main bulkhead without sound on both trips. they were older 747s. it seems staff had something to say at irregular intervals making it hard to try and sleep but they were very good at making business transactions ignoring everyone so they could count money and look at receipts. you want a cheap very basic airline trip to china then this is the airline for you.'

In [13]:
data['Cleaned_Review'][0]

'los angeles to beijing return . food low quality staff appear when time for mandatory service large screen view on main bulkhead without sound on both trip . they be old 747 . it seem staff have something to say at irregular interval make it hard to try and sleep but they be very good at make business transaction ignore everyone so they could count money and look at receipt . you want a cheap very basic airline trip to china then this be the airline for you .'

In [5]:
data.isna().sum()

Country            0
Airline            0
Review             0
Cleaned_Review     0
Sentiment          0
Review2            0
Cleaned_Review2    0
dtype: int64

In [20]:
# Model Building

X = data['Cleaned_Review']

y = data['Sentiment']

y = y.map({'Positive':1,'Negative':0})

vec = TfidfVectorizer(max_features=10000,ngram_range=(1,1))

X = vec.fit_transform(X).toarray()

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y,shuffle=True)

In [21]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

models = [('GNB',GaussianNB()),('BNB',BernoulliNB()),('MNB',MultinomialNB())]

In [22]:
results = pd.DataFrame(columns=['Model','Accuracy','Precision','Recall','F1 Score','ROC AUC Score'])

for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred)
    rec = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    roc_auc = roc_auc_score(y_test,y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name,acc,prec,rec,f1,roc_auc]
    
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
0,GNB,0.653517,0.778287,0.571268,0.6589,0.670555
1,BNB,0.829717,0.828482,0.894501,0.860227,0.816298
2,MNB,0.860947,0.840943,0.940516,0.887947,0.844464


In [None]:
# Creating a deep learning model with LSTM

model = Sequential()

model.add(Dense(128,activation='relu',input_shape=(10000,)))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(64,activation='relu'))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(32,activation='relu'))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(16,activation='relu'))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(8,activation='relu'))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(1,activation='sigmoid'))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

model.summary()

In [None]:
early_stop = EarlyStopping(monitor='val_loss',patience=2,verbose=1,restore_best_weights=True)

history = model.fit(X_train,y_train,epochs=10,batch_size=32,validation_split = 0.2,callbacks=[early_stop])

In [None]:
# Plotting the loss and accuracy

import matplotlib.pyplot as plt

plt.plot(history.history['loss'],label='Train Loss')
plt.plot(history.history['val_loss'],label='Validation Loss')
plt.plot(history.history['accuracy'],label='Train Accuracy')
plt.plot(history.history['val_accuracy'],label='Validation Accuracy')
plt.legend()
plt.show()

In [None]:
# Predicting the values

y_pred = model.predict_classes(X_test)

# Evaluating the model

print('Accuracy Score: ',accuracy_score(y_test,y_pred))
print('Precision Score: ',precision_score(y_test,y_pred))
print('Recall Score: ',recall_score(y_test,y_pred))
print('F1 Score: ',f1_score(y_test,y_pred))
print('ROC AUC Score: ',roc_auc_score(y_test,y_pred))
print('Confusion Matrix: \n',confusion_matrix(y_test,y_pred))
print('Classification Report: \n',classification_report(y_test,y_pred))


In [None]:
# Making a model with a bidiirectional LSTM layer

from keras.layers import LSTM, Bidirectional

model = Sequential()

model.add(Bidirectional(LSTM(128,activation='relu',input_shape=(10000,))))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(64,activation='relu'))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(32,activation='relu'))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(16,activation='relu'))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(8,activation='relu'))

model.add(Dropout(0.2))

model.add(BatchNormalization())

model.add(Dense(1,activation='sigmoid'))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

model.summary()