In [1]:
# Importing basic libraries

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Importing libraries for sentiment analysis LSTM model

import tensorflow as tf
import keras
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, precision_score, recall_score


In [2]:
# Reading the dataset

data = pd.read_csv('airline_df_nlped.csv')

In [3]:
data.head()

Unnamed: 0,Country,Airline,Review,Cleaned_Review,Sentiment,Review2,Cleaned_Review2
0,China,Air China,los angeles to beijing return. food low qualit...,los angeles to beijing return . food low quali...,Negative,los angeles to beijing return. food low qualit...,los angeles to beijing return . food low quali...
1,China,Air China,round to trip from hong kong to munich. the ma...,round to trip from hong kong to munich . the m...,Negative,round to trip from hong kong to munich. the ma...,round to trip from hong kong to munich . the m...
2,China,Air China,sydney to beijing to paris then rome to beijin...,sydney to beijing to paris then rome to beijin...,Negative,sydney to beijing to paris then rome to beijin...,sydney to beijing to paris then rome to beijin...
3,China,Air China,london to sydney return via beijing. a cheap f...,london to sydney return via beijing . a cheap ...,Negative,london to sydney return via beijing. a cheap f...,london to sydney return via beijing . a cheap ...
4,China,Air China,beijing to shanghai. only one check to in desk...,beijing to shanghai . only one check to in des...,Positive,beijing to shanghai. only one check to in desk...,beijing to shanghai . only one check to in des...


In [4]:
data.shape

(15206, 7)

In [5]:
data['Review2'][0]

'los angeles to beijing return. food low quality staff appeared when time for mandatory service large screen viewing on main bulkhead without sound on both trips. they were older 747s. it seems staff had something to say at irregular intervals making it hard to try and sleep but they were very good at making business transactions ignoring everyone so they could count money and look at receipts. you want a cheap very basic airline trip to china then this is the airline for you.'

In [6]:
data['Cleaned_Review2'][0]

'los angeles to beijing return . food low quality staff appear when time for mandatory service large screen view on main bulkhead without sound on both trip . they be old . it seem staff have something to say at irregular interval make it hard to try and sleep but they be very good at make business transaction ignore everyone so they could count money and look at receipt . you want a cheap very basic airline trip to china then this be the airline for you .'

In [7]:
data.isna().sum()

Country            0
Airline            0
Review             0
Cleaned_Review     0
Sentiment          0
Review2            0
Cleaned_Review2    0
dtype: int64

In [18]:
# Model Building

X = data['Review2']

y = data['Sentiment']

y = y.map({'Positive':1,'Negative':0})

cv = CountVectorizer(max_features=10000,ngram_range=(1,1),stop_words='english')

X = cv.fit_transform(X).toarray()

# vec = TfidfVectorizer(max_features=10000,ngram_range=(1,1),stop_words='english')

# X = vec.fit_transform(X).toarray()

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y,shuffle=True)

In [69]:
# Creating a Word2Vec model for model building

from gensim.models import Word2Vec

sentences = [row.split() for row in data['Cleaned_Review']]

model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [70]:
model.wv.most_similar('lunch')

[('dinner', 0.9290294647216797),
 ('breakfast', 0.90855872631073),
 ('snack', 0.8390840291976929),
 ('tea', 0.7981657981872559),
 ('dessert', 0.7878466844558716),
 ('refreshment', 0.7855371832847595),
 ('coffee', 0.7838377356529236),
 ('champagne', 0.782963752746582),
 ('sandwich', 0.7777277827262878),
 ('supper', 0.7554696798324585)]

In [19]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

models = [('GNB',GaussianNB()),('BNB',BernoulliNB()),('MNB',MultinomialNB())]

In [20]:
results = pd.DataFrame(columns=['Model','Accuracy','Precision','Recall','F1 Score','ROC AUC Score'])

for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred)
    rec = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    roc_auc = roc_auc_score(y_test,y_pred)
    # Adding the results to the dataframe without appending
    results.loc[len(results)] = [name,acc,prec,rec,f1,roc_auc]
    
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
0,GNB,0.749507,0.792767,0.774972,0.783768,0.744232
1,BNB,0.859303,0.85445,0.915825,0.884074,0.847595
2,MNB,0.868508,0.866773,0.916386,0.890889,0.85859


In [55]:
# Fingding max length of a sentence

max_len = 0
max_len2 = 0
for row in data['Cleaned_Review']:
    if len(row.split()) > max_len:
        max_len = len(row.split())
        
for row in data['Cleaned_Review2']:
    if len(row.split()) > max_len2:
        max_len2 = len(row.split())
        
print(max_len)
print(max_len2)

774
757


In [None]:
# using word2vec model for model building

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000)

tokenizer.fit_on_texts(data['Cleaned_Review'])