# Sentiment Analysis of Airline Reviews

In [11]:

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_lg')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from textblob import TextBlob 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from mappings import contraction_mapping,chat_words_replacements

In [12]:
nltk_sw = nltk.corpus.stopwords.words('english')
spacy_sw = nlp.Defaults.stop_words
stop_words = set(nltk_sw).union(spacy_sw)

# Removing some words from stop words because of sentiment analysis such as no, not
stop_words.remove('no')
stop_words.remove('not')

# Panda display options
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', None)

print(len(stop_words))

380


In [13]:
df = pd.read_csv('airline_df_cleaned.csv')
reviews = df[['Country','Airline','Review','Sentiment']].copy(deep=True)
reviews.head()

Unnamed: 0,Country,Airline,Review,Sentiment
0,China,Air China,los angeles to beijing return. food low qualit...,Negative
1,China,Air China,round to trip from hong kong to munich. the ma...,Negative
2,China,Air China,sydney to beijing to paris then rome to beijin...,Negative
3,China,Air China,london to sydney return via beijing. a cheap f...,Negative
4,China,Air China,beijing to shanghai. only one check to in desk...,Positive


In [14]:
reviews['Review'][60]

'hong kong to beijing to tokyo 12 12 12. first flight a321 cabin crew outstanding although the english speaking fa had a very heavy accent which made it a little difficult to understand her. service very attentive they actually opened the slippers and blanket pouches for you. meal service came and they opened the tray tables for you. breakfast however was watery eggs tasteless chicken sausage and spinach. in the past i have always found catering from hong kong to be outstanding but this was far from what is usual. transit in beijing was good arrived and departed from hard stands business class passengers had their own minibus to take them to and from the terminal. however transit in beijing meant going through an immigration check and there was only one open desk which the immigration officer took 10 mins or more on the passengers ahead of me on line. beijing to tokyo on 738 with only 8 seats. catering was sub par the steak was overcooked and the appetizers were brown tofu and dried fi

In [15]:
# Lemmatizing the words in each review using Spacy

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token not in stop_words])

reviews['Cleaned_Review'] = reviews['Review'].apply(lemmatize_text)


In [16]:
reviews['Review'][60]

'hong kong to beijing to tokyo 12 12 12. first flight a321 cabin crew outstanding although the english speaking fa had a very heavy accent which made it a little difficult to understand her. service very attentive they actually opened the slippers and blanket pouches for you. meal service came and they opened the tray tables for you. breakfast however was watery eggs tasteless chicken sausage and spinach. in the past i have always found catering from hong kong to be outstanding but this was far from what is usual. transit in beijing was good arrived and departed from hard stands business class passengers had their own minibus to take them to and from the terminal. however transit in beijing meant going through an immigration check and there was only one open desk which the immigration officer took 10 mins or more on the passengers ahead of me on line. beijing to tokyo on 738 with only 8 seats. catering was sub par the steak was overcooked and the appetizers were brown tofu and dried fi

In [17]:
# Tokenizing the each review using word_tokenize
corpus = []
for i in range(0, len(reviews)):
    review = word_tokenize(reviews['Cleaned_Review'][i])
    review = ' '.join(review)
    corpus.append(review)

In [18]:
corpus = [word_tokenize(x)for x in corpus]
corpus

[['los',
  'angeles',
  'to',
  'beijing',
  'return',
  '.',
  'food',
  'low',
  'quality',
  'staff',
  'appear',
  'when',
  'time',
  'for',
  'mandatory',
  'service',
  'large',
  'screen',
  'view',
  'on',
  'main',
  'bulkhead',
  'without',
  'sound',
  'on',
  'both',
  'trip',
  '.',
  'they',
  'be',
  'old',
  '747',
  '.',
  'it',
  'seem',
  'staff',
  'have',
  'something',
  'to',
  'say',
  'at',
  'irregular',
  'interval',
  'make',
  'it',
  'hard',
  'to',
  'try',
  'and',
  'sleep',
  'but',
  'they',
  'be',
  'very',
  'good',
  'at',
  'make',
  'business',
  'transaction',
  'ignore',
  'everyone',
  'so',
  'they',
  'could',
  'count',
  'money',
  'and',
  'look',
  'at',
  'receipt',
  '.',
  'you',
  'want',
  'a',
  'cheap',
  'very',
  'basic',
  'airline',
  'trip',
  'to',
  'china',
  'then',
  'this',
  'be',
  'the',
  'airline',
  'for',
  'you',
  '.'],
 ['round',
  'to',
  'trip',
  'from',
  'hong',
  'kong',
  'to',
  'munich',
  '.',
  't

In [19]:
# Building the Word2Vec model

gensim_model = Word2Vec(vector_size=100, window=5)

gensim_model.build_vocab(corpus)

gensim_model.train(corpus, total_examples=gensim_model.corpus_count, epochs=gensim_model.epochs)


(7213896, 10809405)

In [21]:

gensim_model.wv.most_similar('flight')

[('journey', 0.6472281217575073),
 ('sector', 0.5140012502670288),
 ('trip', 0.4332040548324585),
 ('fight', 0.42624303698539734),
 ('route', 0.4261952042579651),
 ('layover', 0.41872575879096985),
 ('plane', 0.38135018944740295),
 ('way', 0.3807032108306885),
 ('leg', 0.3751600384712219),
 ('segment', 0.3750954568386078)]

In [22]:

gensim_model.wv.most_similar('seat')

[('seating', 0.7850421667098999),
 ('position', 0.6236416101455688),
 ('recline', 0.5924221873283386),
 ('row', 0.588665783405304),
 ('bed', 0.5837993621826172),
 ('legroom', 0.5776554942131042),
 ('window', 0.5595418214797974),
 ('headrest', 0.541667640209198),
 ('foot', 0.5335834622383118),
 ('sit', 0.5330264568328857)]

In [23]:

gensim_model.wv.most_similar('legroom')

[('pitch', 0.8216732144355774),
 ('width', 0.8067827820777893),
 ('seating', 0.7688928842544556),
 ('space', 0.7631911039352417),
 ('privacy', 0.7551858425140381),
 ('spacious', 0.7104053497314453),
 ('cushion', 0.6939931511878967),
 ('legspace', 0.6906117796897888),
 ('storage', 0.6772264242172241),
 ('roomy', 0.6742158532142639)]

In [24]:
gensim_model.wv.most_similar('pilot')

[('captain', 0.8906248211860657),
 ('announcement', 0.6703783273696899),
 ('instruction', 0.6073117256164551),
 ('cockpit', 0.6060876846313477),
 ('purser', 0.6016950607299805),
 ('stewardess', 0.5874086022377014),
 ('supervisor', 0.5641340613365173),
 ('turbulence', 0.5628186464309692),
 ('defect', 0.5423609018325806),
 ('engine', 0.5356821417808533)]

In [25]:
gensim_model.wv.most_similar('china')

[('india', 0.903915524482727),
 ('korean', 0.8024343848228455),
 ('vietjet', 0.8021036386489868),
 ('lion', 0.7928688526153564),
 ('canada', 0.7883039116859436),
 ('france', 0.7666332125663757),
 ('silk', 0.7016123533248901),
 ('oman', 0.7007150650024414),
 ('asia', 0.6868889927864075),
 ('conditioning', 0.6707914471626282)]

In [20]:
reviews.to_csv('airline_df_nlped.csv',index=False)