In [10]:
# Importing basic libraries

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Importing libraries for sentiment analysis LSTM model

import tensorflow as tf
import keras
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, precision_score, recall_score


In [11]:
# Reading the dataset

data = pd.read_csv('airline_df_nlped.csv')

In [12]:
data.head()

Unnamed: 0,Country,Airline,Review,Cleaned_Review,Sentiment,Review2,Cleaned_Review2
0,China,Air China,los angeles beijing return food low quality st...,los angeles beijing return food quality staff ...,Negative,los angeles beijing return food low quality st...,los angeles beijing return food quality staff ...
1,China,Air China,round trip from hong kong to munich the main r...,round trip hong kong munich main reason fly ai...,Negative,round trip from hong kong to munich the main r...,round trip hong kong munich main reason fly ai...
2,China,Air China,sydney beijing paris then rome beijing to sydn...,sydney beijing paris rome beijing sydney famil...,Negative,sydney beijing paris then rome beijing to sydn...,sydney beijing paris rome beijing sydney famil...
3,China,Air China,london to sydney return via beijing a cheap fl...,london sydney return beijing cheap flight live...,Negative,london to sydney return via beijing a cheap fl...,london sydney return beijing cheap flight live...
4,China,Air China,beijing to shanghai only one check in desk for...,beijing shanghai check desk standby passenger ...,Positive,beijing to shanghai only one check in desk for...,beijing shanghai check desk standby passenger ...


In [4]:
data.shape

(15206, 6)

In [5]:
data['Review2'][0]

'los angeles beijing return food low quality staff appear when time for mandatory service large screen view on main bulkhead without sound on both trip they be old it seem staff have something to say at irregular interval make it hard to try and sleep but they be very good at make business transaction ignore everyone so they could count money and look at receipt you want a cheap very basic airline trip to china then this be the airline for you'

In [6]:
data['Cleaned_Review2'][0]

'los angeles beijing return food quality staff appear time mandatory service large screen view main bulkhead sound trip staff irregular interval hard sleep business transaction ignore count money receipt cheap basic trip china'

In [7]:
data.isna().sum()

Country            0
Airline            0
Review             0
Cleaned_Review     0
Review2            0
Cleaned_Review2    0
dtype: int64

In [15]:
# Model Building

X = data['Review2']

y = data['Sentiment']

y = y.map({'Positive':1,'Negative':0,})

# cv = CountVectorizer(max_features=10000,ngram_range=(1,1),stop_words='english')

# X = cv.fit_transform(X).toarray()

vec = TfidfVectorizer(max_features=10000,ngram_range=(1,1),stop_words='english')

X = vec.fit_transform(X).toarray()

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y,shuffle=True)

In [16]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

models = [('GNB',GaussianNB()),('BNB',BernoulliNB()),('MNB',MultinomialNB())]
results = pd.DataFrame(columns=['Model','Accuracy','Precision','Recall','F1 Score'])

for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred,average='weighted')
    rec = recall_score(y_test,y_pred,average='weighted')
    f1 = f1_score(y_test,y_pred,average='weighted')
    # roc_auc = roc_auc_score(y_test,y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name,acc,prec,rec,f1]
    print('Model: ',name)
    print('Confusion Matrix: \n',confusion_matrix(y_test,y_pred))
    print('Classification Report: \n',classification_report(y_test,y_pred))
    
results

Model:  GNB
Confusion Matrix: 
 [[960 300]
 [786 996]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.55      0.76      0.64      1260
           1       0.77      0.56      0.65      1782

    accuracy                           0.64      3042
   macro avg       0.66      0.66      0.64      3042
weighted avg       0.68      0.64      0.64      3042

Model:  BNB
Confusion Matrix: 
 [[ 994  266]
 [ 154 1628]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.79      0.83      1260
           1       0.86      0.91      0.89      1782

    accuracy                           0.86      3042
   macro avg       0.86      0.85      0.86      3042
weighted avg       0.86      0.86      0.86      3042

Model:  MNB
Confusion Matrix: 
 [[ 935  325]
 [ 100 1682]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.74      0.81

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,GNB,0.642998,0.677937,0.642998,0.643673
1,BNB,0.861933,0.862165,0.861933,0.860825
2,MNB,0.860289,0.86512,0.860289,0.857589


In [17]:
# Using a word2vec model for sentiment analysis

from gensim.models import Word2Vec

# Creating a list of reviews_for_word2vec

reviews_for_word2vec = []

for i in range(len(data)):
    reviews_for_word2vec.append(data['Review2'][i].split())

In [20]:
# Creating a word2vec model

word2vec_model = Word2Vec(reviews_for_word2vec,min_count=1,vector_size=200,window=5,workers=4)

# Creating a list of vectors for each review

vectors = []

for i in range(len(reviews_for_word2vec)):
    vectors.append(np.sum(word2vec_model.wv[reviews_for_word2vec[i]],axis=0))
    
vectors = np.array(vectors)

vectors.shape

(15206, 200)

In [21]:
# Model Building

X = vectors

y = data["Sentiment"]

y = y.map({"Positive": 1, "Negative": 0})

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True
)

from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

models = [
    ("GNB", GaussianNB()),
    ("BNB", BernoulliNB()),
    ("SVC", SVC(random_state=101)),
    ("RFC", RandomForestClassifier(random_state=101)),
    ("ETC", ExtraTreesClassifier(random_state=101, n_jobs=-1)),
    ("LR", LogisticRegression(n_jobs=-1)),
    ("KNN", KNeighborsClassifier(n_jobs=-1)),
    ("XGB", XGBClassifier(random_state=101, n_jobs=-1)),
    ("CBC", CatBoostClassifier(random_state=101, verbose=0)),
]

results = pd.DataFrame(
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC Score"]
)

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name, acc, prec, rec, f1, roc_auc]

results.set_index("Model").sort_values(by="ROC AUC Score", ascending=False)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LR,0.870809,0.882645,0.89899,0.890742,0.864971
SVC,0.865549,0.876164,0.897306,0.886609,0.858971
CBC,0.862919,0.873155,0.896184,0.88452,0.856029
XGB,0.854043,0.868392,0.884961,0.876598,0.847639
RFC,0.836292,0.847403,0.878788,0.86281,0.827489
ETC,0.837607,0.840381,0.892256,0.865542,0.826287
KNN,0.819198,0.82319,0.880471,0.850868,0.806506
BNB,0.793557,0.84223,0.796857,0.818916,0.792873
GNB,0.693294,0.686922,0.875421,0.7698,0.655568
