In [20]:
# Importing basic libraries

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Importing libraries for sentiment analysis LSTM model

import tensorflow as tf
import keras
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, precision_score, recall_score


In [21]:
# Reading the dataset

data = pd.read_csv('airline_df_nlped.csv')

In [22]:
data.head()

Unnamed: 0,Country,Airline,Review,Cleaned_Review,Review2,Cleaned_Review2,sentiment
0,China,Air China,los angeles beijing return food low quality st...,los angeles beijing return food quality staff ...,los angeles beijing return food low quality st...,los angeles beijing return food quality staff ...,positive
1,China,Air China,round trip from hong kong to munich the main r...,round trip hong kong munich main reason fly ai...,round trip from hong kong to munich the main r...,round trip hong kong munich main reason fly ai...,negative
2,China,Air China,sydney beijing paris then rome beijing to sydn...,sydney beijing paris rome beijing sydney famil...,sydney beijing paris then rome beijing to sydn...,sydney beijing paris rome beijing sydney famil...,negative
3,China,Air China,london to sydney return via beijing a cheap fl...,london sydney return beijing cheap flight live...,london to sydney return via beijing a cheap fl...,london sydney return beijing cheap flight live...,positive
4,China,Air China,beijing to shanghai only one check in desk for...,beijing shanghai check desk standby passenger ...,beijing to shanghai only one check in desk for...,beijing shanghai check desk standby passenger ...,negative


In [23]:
data.shape

(15206, 7)

In [24]:
data['Review2'][0]

'los angeles beijing return food low quality staff appear when time for mandatory service large screen view on main bulkhead without sound on both trip they be old it seem staff have something to say at irregular interval make it hard to try and sleep but they be very good at make business transaction ignore everyone so they could count money and look at receipt you want a cheap very basic airline trip to china then this be the airline for you'

In [25]:
data['Cleaned_Review2'][0]

'los angeles beijing return food quality staff appear time mandatory service large screen view main bulkhead sound trip staff irregular interval hard sleep business transaction ignore count money receipt cheap basic trip china'

In [26]:
data.isna().sum()

Country            0
Airline            0
Review             0
Cleaned_Review     0
Review2            0
Cleaned_Review2    0
sentiment          0
dtype: int64

In [34]:
# Model Building

X = data['Review2']

y = data['sentiment']

y = y.map({'positive':1,'negative':0,'neutral':2})

cv = CountVectorizer(max_features=10000,ngram_range=(1,1),stop_words='english')

X = cv.fit_transform(X).toarray()

# vec = TfidfVectorizer(max_features=10000,ngram_range=(1,2),stop_words='english')

# X = vec.fit_transform(X).toarray()

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y,shuffle=True)

# Creating a Word2Vec model for model building

from gensim.models import Word2Vec

sentences = [row.split() for row in data['Review2']]

model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

model.wv.most_similar('lunch')

In [36]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

models = [('GNB',GaussianNB()),('BNB',BernoulliNB()),('MNB',MultinomialNB())]
results = pd.DataFrame(columns=['Model','Accuracy','Precision','Recall','F1 Score'])

for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred,average='weighted')
    rec = recall_score(y_test,y_pred,average='weighted')
    f1 = f1_score(y_test,y_pred,average='weighted')
    # roc_auc = roc_auc_score(y_test,y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name,acc,prec,rec,f1]
    print('Model: ',name)
    print('Confusion Matrix: \n',confusion_matrix(y_test,y_pred))
    print('Classification Report: \n',classification_report(y_test,y_pred))
    
results

Model:  GNB
Confusion Matrix: 
 [[ 773  224   18]
 [1213  726   51]
 [  31    5    1]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.38      0.76      0.51      1015
           1       0.76      0.36      0.49      1990
           2       0.01      0.03      0.02        37

    accuracy                           0.49      3042
   macro avg       0.39      0.38      0.34      3042
weighted avg       0.63      0.49      0.49      3042

Model:  BNB
Confusion Matrix: 
 [[ 769  242    4]
 [ 251 1728   11]
 [  17   20    0]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.76      0.75      1015
           1       0.87      0.87      0.87      1990
           2       0.00      0.00      0.00        37

    accuracy                           0.82      3042
   macro avg       0.54      0.54      0.54      3042
weighted avg       0.82      0.82      0.82      3042

Model:  MNB
Co

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,GNB,0.493097,0.625357,0.493097,0.492894
1,BNB,0.820842,0.815479,0.820842,0.818131
2,MNB,0.823143,0.81578,0.823143,0.819431


In [10]:
# Using a word2vec model for sentiment analysis

from gensim.models import Word2Vec

# Creating a list of reviews_for_word2vec

reviews_for_word2vec = []

for i in range(len(data)):
    reviews_for_word2vec.append(data['Review2'][i].split())

In [11]:
# Creating a word2vec model

word2vec_model = Word2Vec(reviews_for_word2vec,min_count=1,vector_size=100,window=5,workers=4)

# Creating a list of vectors for each review

vectors = []

for i in range(len(reviews_for_word2vec)):
    vectors.append(np.sum(word2vec_model.wv[reviews_for_word2vec[i]],axis=0))
    
vectors = np.array(vectors)

vectors.shape

(15206, 100)

In [12]:
# Model Building

X = vectors

y = data["Sentiment"]

y = y.map({"Positive": 1, "Negative": 0})

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True
)

from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

models = [
    ("GNB", GaussianNB()),
    ("BNB", BernoulliNB()),
    ("SVC", SVC(random_state=101)),
    ("RFC", RandomForestClassifier(random_state=101)),
    ("ETC", ExtraTreesClassifier(random_state=101, n_jobs=-1)),
    ("LR", LogisticRegression(n_jobs=-1)),
    ("KNN", KNeighborsClassifier(n_jobs=-1)),
    ("XGB", XGBClassifier(random_state=101, n_jobs=-1)),
    ("CBC", CatBoostClassifier(random_state=101, verbose=0)),
]

results = pd.DataFrame(
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC Score"]
)

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name, acc, prec, rec, f1, roc_auc]

results.set_index("Model").sort_values(by="ROC AUC Score", ascending=False)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LR,0.870809,0.882223,0.899551,0.890803,0.864855
SVC,0.864563,0.872688,0.900112,0.886188,0.857199
CBC,0.853715,0.86832,0.8844,0.876286,0.847359
XGB,0.836621,0.857143,0.86532,0.861212,0.830676
RFC,0.832676,0.838384,0.884961,0.861043,0.821845
ETC,0.829717,0.833685,0.886083,0.859086,0.818042
KNN,0.8238,0.825496,0.886644,0.854978,0.810782
BNB,0.782709,0.817924,0.809203,0.81354,0.777221
GNB,0.681131,0.676062,0.87486,0.76272,0.641001
