In [1]:
# Importing basic libraries

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Importing libraries for sentiment analysis

import tensorflow as tf
import keras
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, precision_score, recall_score


In [2]:
# Reading the dataset

data = pd.read_csv('airline_nlped.csv')

In [3]:
data.head()

Unnamed: 0,Country,Airline,Review,Cleaned_Review,Sentiment
0,China,Air China,los angeles beijing return. food low quality s...,los angeles beijing return . food quality staf...,0
1,China,Air China,round trip from hong kong to munich. the main ...,round trip hong kong munich . main reason fly ...,0
2,China,Air China,sydney beijing paris then rome beijing to sydn...,sydney beijing paris rome beijing sydney famil...,0
3,China,Air China,beijing to shanghai. only one check in desk fo...,beijing shanghai . check desk standby passenge...,1
4,China,Air China,stockholm bangkok via beijing. return flight b...,stockholm bangkok beijing . return bangkok fra...,0


In [4]:
data.shape

(14629, 5)

In [5]:
data['Review'][0]

'los angeles beijing return. food low quality staff appeared when time for mandatory service large screen viewing on main bulkhead without sound on both trips. they were older 747s. it seems staff had something to say at irregular intervals making it hard to try and sleep but they were very good at making business transactions ignoring everyone so they could count money and look at receipts. you want a cheap very basic airline trip to china then this is the airline for you.'

In [7]:
data['Cleaned_Review'][0]

'los angeles beijing return . food quality staff appear time mandatory service large screen view main bulkhead sound trip . 747 . staff irregular interval hard sleep business transaction ignore count money receipt . cheap basic trip china .'

In [8]:
data.isna().sum()

Country           0
Airline           0
Review            0
Cleaned_Review    0
Sentiment         0
dtype: int64

In [21]:
# Model Building

X = data['Review']

y = data['Sentiment']

# cv = CountVectorizer(max_features=10000,ngram_range=(1,1),stop_words='english')

# X = cv.fit_transform(X).toarray()

vec = TfidfVectorizer(max_features=10000,ngram_range=(1,3),stop_words='english')

X = vec.fit_transform(X).toarray()

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y,shuffle=True)

In [22]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

models = [('GNB',GaussianNB()),('BNB',BernoulliNB()),('MNB',MultinomialNB())]
results = pd.DataFrame(columns=['Model','Accuracy','Precision','Recall','F1 Score','ROC AUC'])

for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred,average='weighted')
    rec = recall_score(y_test,y_pred,average='weighted')
    f1 = f1_score(y_test,y_pred,average='weighted')
    roc_auc = roc_auc_score(y_test,y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name,acc,prec,rec,f1,roc_auc]
    print('Model: ',name)
    print('Confusion Matrix: \n',confusion_matrix(y_test,y_pred))
    print('Classification Report: \n',classification_report(y_test,y_pred))
    
results

Model:  GNB
Confusion Matrix: 
 [[ 954  176]
 [ 269 1527]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.78      0.84      0.81      1130
           1       0.90      0.85      0.87      1796

    accuracy                           0.85      2926
   macro avg       0.84      0.85      0.84      2926
weighted avg       0.85      0.85      0.85      2926

Model:  BNB
Confusion Matrix: 
 [[ 905  225]
 [ 175 1621]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.80      0.82      1130
           1       0.88      0.90      0.89      1796

    accuracy                           0.86      2926
   macro avg       0.86      0.85      0.85      2926
weighted avg       0.86      0.86      0.86      2926

Model:  MNB
Confusion Matrix: 
 [[ 868  262]
 [ 144 1652]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.77      

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,GNB,0.847915,0.851621,0.847915,0.8489,0.847235
1,BNB,0.863295,0.862608,0.863295,0.862687,0.851723
2,MNB,0.861244,0.861026,0.861244,0.859629,0.843982


In [11]:
# Using a word2vec model for sentiment analysis

from gensim.models import Word2Vec

# Creating a list of reviews_for_word2vec

reviews_for_word2vec = []

for i in range(len(data)):
    reviews_for_word2vec.append(data['Review'][i].split())

In [15]:
# Creating a word2vec model

word2vec_model = Word2Vec(reviews_for_word2vec,min_count=1,vector_size=150,window=5,workers=4)

# Creating a list of vectors for each review

vectors = []

for i in range(len(reviews_for_word2vec)):
    vectors.append(np.sum(word2vec_model.wv[reviews_for_word2vec[i]],axis=0))
    
vectors = np.array(vectors)

vectors.shape

(14629, 150)

In [16]:
# Model Building

X = vectors

y = data["Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True
)

from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

models = [
    ("GNB", GaussianNB()),
    ("BNB", BernoulliNB()),
    ("SVC", SVC(random_state=101)),
    ("RFC", RandomForestClassifier(random_state=101)),
    ("ETC", ExtraTreesClassifier(random_state=101, n_jobs=-1)),
    ("LR", LogisticRegression(n_jobs=-1)),
    ("KNN", KNeighborsClassifier(n_jobs=-1)),
    ("XGB", XGBClassifier(random_state=101, n_jobs=-1)),
    ("CBC", CatBoostClassifier(random_state=101, verbose=0)),
]

results = pd.DataFrame(
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC Score"]
)

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name, acc, prec, rec, f1, roc_auc]

results.set_index("Model").sort_values(by="ROC AUC Score", ascending=False)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LR,0.853042,0.867204,0.898107,0.882385,0.839761
SVC,0.847573,0.85489,0.905345,0.879394,0.830549
CBC,0.839371,0.853035,0.891982,0.872074,0.823867
XGB,0.832878,0.847422,0.887528,0.867011,0.816773
RFC,0.831852,0.842797,0.892539,0.866955,0.813969
ETC,0.831169,0.836608,0.900891,0.86756,0.810622
KNN,0.805537,0.814132,0.885301,0.848226,0.782031
BNB,0.776828,0.835979,0.791759,0.813269,0.772428
GNB,0.680451,0.691759,0.864699,0.768622,0.626155
