In [1]:
# Importing basic libraries

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Importing libraries for sentiment analysis LSTM model

import tensorflow as tf
import keras
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, precision_score, recall_score


In [2]:
# Reading the dataset

data = pd.read_csv('airline_df_nlped.csv')

In [3]:
data.head()

Unnamed: 0,Country,Airline,Review,Review_LS,Sentiment
0,China,Air China,los angeles beijing return. food low quality s...,los angeles beijing return . food quality staf...,0
1,China,Air China,round trip from hong kong to munich. the main ...,round trip hong kong munich . main reason fly ...,0
2,China,Air China,sydney beijing paris then rome beijing to sydn...,sydney beijing paris rome beijing sydney famil...,0
3,China,Air China,london to sydney return via beijing. a cheap f...,london sydney return beijing . cheap flight li...,0
4,China,Air China,beijing to shanghai. only one check in desk fo...,beijing shanghai . check desk standby passenge...,1


In [4]:
data.shape

(15206, 5)

In [5]:
data['Review'][0]

'los angeles beijing return. food low quality staff appeared when time for mandatory service large screen viewing on main bulkhead without sound on both trips. they were older 747s. it seems staff had something to say at irregular intervals making it hard to try and sleep but they were very good at making business transactions ignoring everyone so they could count money and look at receipts. you want a cheap very basic airline trip to china then this is the airline for you.'

In [6]:
data['Review_LS'][0]

'los angeles beijing return . food quality staff appear time mandatory service large screen view main bulkhead sound trip . 747 . staff irregular interval hard sleep business transaction ignore count money receipt . cheap basic trip china .'

In [7]:
data.isna().sum()

Country      0
Airline      0
Review       0
Review_LS    0
Sentiment    0
dtype: int64

In [9]:
# Model Building

X = data['Review']

y = data['Sentiment']

# cv = CountVectorizer(max_features=10000,ngram_range=(1,1),stop_words='english')

# X = cv.fit_transform(X).toarray()

vec = TfidfVectorizer(max_features=10000,ngram_range=(1,1),stop_words='english')

X = vec.fit_transform(X).toarray()

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y,shuffle=True)

In [10]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

models = [('GNB',GaussianNB()),('BNB',BernoulliNB()),('MNB',MultinomialNB())]
results = pd.DataFrame(columns=['Model','Accuracy','Precision','Recall','F1 Score','ROC AUC'])

for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred,average='weighted')
    rec = recall_score(y_test,y_pred,average='weighted')
    f1 = f1_score(y_test,y_pred,average='weighted')
    roc_auc = roc_auc_score(y_test,y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name,acc,prec,rec,f1,roc_auc]
    print('Model: ',name)
    print('Confusion Matrix: \n',confusion_matrix(y_test,y_pred))
    print('Classification Report: \n',classification_report(y_test,y_pred))
    
results

Model:  GNB
Confusion Matrix: 
 [[ 801  741]
 [ 288 1212]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.52      0.61      1542
           1       0.62      0.81      0.70      1500

    accuracy                           0.66      3042
   macro avg       0.68      0.66      0.66      3042
weighted avg       0.68      0.66      0.65      3042

Model:  BNB
Confusion Matrix: 
 [[1131  411]
 [ 135 1365]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.73      0.81      1542
           1       0.77      0.91      0.83      1500

    accuracy                           0.82      3042
   macro avg       0.83      0.82      0.82      3042
weighted avg       0.83      0.82      0.82      3042

Model:  MNB
Confusion Matrix: 
 [[1200  342]
 [ 152 1348]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.78      

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,GNB,0.661736,0.678854,0.661736,0.654803,0.663728
1,BNB,0.820513,0.831834,0.820513,0.819253,0.821732
2,MNB,0.837607,0.843224,0.837607,0.837112,0.838438


In [11]:
# Using a word2vec model for sentiment analysis

from gensim.models import Word2Vec

# Creating a list of reviews_for_word2vec

reviews_for_word2vec = []

for i in range(len(data)):
    reviews_for_word2vec.append(data['Review'][i].split())

In [12]:
# Creating a word2vec model

word2vec_model = Word2Vec(reviews_for_word2vec,min_count=1,vector_size=200,window=5,workers=4)

# Creating a list of vectors for each review

vectors = []

for i in range(len(reviews_for_word2vec)):
    vectors.append(np.sum(word2vec_model.wv[reviews_for_word2vec[i]],axis=0))
    
vectors = np.array(vectors)

vectors.shape

(15206, 200)

In [13]:
# Model Building

X = vectors

y = data["Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True
)

from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

models = [
    ("GNB", GaussianNB()),
    ("BNB", BernoulliNB()),
    ("SVC", SVC(random_state=101)),
    ("RFC", RandomForestClassifier(random_state=101)),
    ("ETC", ExtraTreesClassifier(random_state=101, n_jobs=-1)),
    ("LR", LogisticRegression(n_jobs=-1)),
    ("KNN", KNeighborsClassifier(n_jobs=-1)),
    ("XGB", XGBClassifier(random_state=101, n_jobs=-1)),
    ("CBC", CatBoostClassifier(random_state=101, verbose=0)),
]

results = pd.DataFrame(
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC Score"]
)

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name, acc, prec, rec, f1, roc_auc]

results.set_index("Model").sort_values(by="ROC AUC Score", ascending=False)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LR,0.834977,0.815823,0.859333,0.837013,0.835309
SVC,0.8238,0.796798,0.862667,0.828425,0.824329
CBC,0.82117,0.802532,0.845333,0.823377,0.821499
XGB,0.805391,0.790281,0.824,0.806789,0.805645
RFC,0.791913,0.770768,0.822667,0.795872,0.792332
ETC,0.786654,0.763467,0.822,0.791653,0.787135
KNN,0.75904,0.726789,0.819333,0.770291,0.759861
BNB,0.738659,0.726397,0.754,0.739941,0.738868
GNB,0.618672,0.575155,0.867333,0.691653,0.622058
