# Sentiment Analysis of Airline Reviews

In [1]:

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_lg')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from textblob import TextBlob 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from mappings import contraction_mapping,chat_words_replacements

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk_sw = nltk.corpus.stopwords.words('english')
spacy_sw = nlp.Defaults.stop_words
stop_words = set(nltk_sw).union(spacy_sw)

# Removing some words from stop words because of sentiment analysis such as no, not
stop_words.remove('no')
stop_words.remove('not')

# Panda display options
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', None)

print(stop_words)
print(len(stop_words))

{'may', 'none', "needn't", 'nevertheless', 'amongst', 'between', 'hasn', "'re", 'twenty', 'whatever', 'whoever', 'enough', 'else', 'it', 'via', 'seems', 'sixty', 'yet', 'ours', 'empty', 'weren', 'them', 'won', 'because', 'we', 'anything', 'neither', 'why', 'make', 'above', 'elsewhere', 'among', '‘m', '‘re', 'hundred', 'side', "you're", 'didn', "isn't", 'having', 'to', 'become', 'i', 'hadn', 'wherein', 'nowhere', 'about', 'haven', 'put', 'quite', 'seem', "you've", 'though', 'never', 'mightn', "you'll", '’m', 'those', 'y', 'further', 'me', 'out', 'sometimes', 'again', 'thru', 'down', 'by', 'm', 'each', 'part', 'll', 'due', 'latterly', 'such', 'nine', 'please', 'herein', 'a', 'perhaps', "mightn't", 'nothing', 'an', 're', 'might', 'then', 'even', 'shouldn', 'without', 'through', 'much', 'needn', 'whereby', 'been', 'becoming', 'sometime', '’d', 'another', 'made', 'once', 'he', 'being', 'of', 'hereby', 'their', 'there', 'nor', 'as', 'what', 'less', 'themselves', 'around', 'all', 'could', 'be

In [3]:
df = pd.read_csv('airline_df_cleaned.csv')
reviews = df[['Country','Airline','Review']].copy(deep=True)
reviews.head()

Unnamed: 0,Country,Airline,Review
0,China,Air China,los angeles to beijing return. food low qualit...
1,China,Air China,round to trip from hong kong to munich. the ma...
2,China,Air China,sydney to beijing to paris then rome to beijin...
3,China,Air China,london to sydney return via beijing. a cheap f...
4,China,Air China,beijing to shanghai. only one check to in desk...


In [4]:
reviews['Review'][60]

'hong kong to beijing to tokyo 12 12 12. first flight a321 cabin crew outstanding although the english speaking fa had a very heavy accent which made it a little difficult to understand her. service very attentive they actually opened the slippers and blanket pouches for you. meal service came and they opened the tray tables for you. breakfast however was watery eggs tasteless chicken sausage and spinach. in the past i have always found catering from hong kong to be outstanding but this was far from what is usual. transit in beijing was good arrived and departed from hard stands business class passengers had their own minibus to take them to and from the terminal. however transit in beijing meant going through an immigration check and there was only one open desk which the immigration officer took 10 mins or more on the passengers ahead of me on line. beijing to tokyo on 738 with only 8 seats. catering was sub par the steak was overcooked and the appetizers were brown tofu and dried fi

In [5]:
# Lemmatizing the words in each review using Spacy

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token not in stop_words])

reviews['Cleaned_Review'] = reviews['Review'].apply(lemmatize_text)


In [6]:
def words_only(text):
    return " ".join(re.findall(r'[A-Za-z]+', text))

reviews['Review2'] = reviews['Cleaned_Review'].apply(lambda x: words_only(x))

reviews['Review2'][60]

'hong kong to beijing to tokyo first flight a cabin crew outstanding although the english speak fa have a very heavy accent which make it a little difficult to understand she service very attentive they actually open the slipper and blanket pouch for you meal service come and they open the tray table for you breakfast however be watery egg tasteless chicken sausage and spinach in the past I have always find cater from hong kong to be outstanding but this be far from what be usual transit in beijing be good arrive and depart from hard stand business class passenger have their own minibus to take they to and from the terminal however transit in beijing mean go through an immigration check and there be only one open desk which the immigration officer take min or more on the passenger ahead of I on line beijing to tokyo on with only seat catering be sub par the steak be overcooked and the appetizer be brown tofu and dry fish again though the cabin crew be excellent'

In [7]:
reviews['Review'][60]

'hong kong to beijing to tokyo 12 12 12. first flight a321 cabin crew outstanding although the english speaking fa had a very heavy accent which made it a little difficult to understand her. service very attentive they actually opened the slippers and blanket pouches for you. meal service came and they opened the tray tables for you. breakfast however was watery eggs tasteless chicken sausage and spinach. in the past i have always found catering from hong kong to be outstanding but this was far from what is usual. transit in beijing was good arrived and departed from hard stands business class passengers had their own minibus to take them to and from the terminal. however transit in beijing meant going through an immigration check and there was only one open desk which the immigration officer took 10 mins or more on the passengers ahead of me on line. beijing to tokyo on 738 with only 8 seats. catering was sub par the steak was overcooked and the appetizers were brown tofu and dried fi

In [8]:
# Tokenizing the each review using word_tokenize
corpus = []
for i in range(0, len(reviews)):
    review = word_tokenize(reviews['Cleaned_Review'][i])
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus = [word_tokenize(x)for x in corpus]
corpus

[['los',
  'angeles',
  'to',
  'beijing',
  'return',
  '.',
  'food',
  'low',
  'quality',
  'staff',
  'appear',
  'when',
  'time',
  'for',
  'mandatory',
  'service',
  'large',
  'screen',
  'view',
  'on',
  'main',
  'bulkhead',
  'without',
  'sound',
  'on',
  'both',
  'trip',
  '.',
  'they',
  'be',
  'old',
  '747',
  '.',
  'it',
  'seem',
  'staff',
  'have',
  'something',
  'to',
  'say',
  'at',
  'irregular',
  'interval',
  'make',
  'it',
  'hard',
  'to',
  'try',
  'and',
  'sleep',
  'but',
  'they',
  'be',
  'very',
  'good',
  'at',
  'make',
  'business',
  'transaction',
  'ignore',
  'everyone',
  'so',
  'they',
  'could',
  'count',
  'money',
  'and',
  'look',
  'at',
  'receipt',
  '.',
  'you',
  'want',
  'a',
  'cheap',
  'very',
  'basic',
  'airline',
  'trip',
  'to',
  'china',
  'then',
  'this',
  'be',
  'the',
  'airline',
  'for',
  'you',
  '.'],
 ['round',
  'to',
  'trip',
  'from',
  'hong',
  'kong',
  'to',
  'munich',
  '.',
  't

In [10]:
# Building the Word2Vec model

gensim_model = Word2Vec(vector_size=100, window=5)

gensim_model.build_vocab(corpus)

gensim_model.train(corpus, total_examples=gensim_model.corpus_count, epochs=gensim_model.epochs)


(7212406, 10809405)

In [11]:

gensim_model.wv.most_similar('flight')

[('journey', 0.628365159034729),
 ('sector', 0.5350983738899231),
 ('segment', 0.4535638391971588),
 ('trip', 0.4309930205345154),
 ('plane', 0.41944006085395813),
 ('route', 0.41115692257881165),
 ('connection', 0.4043567180633545),
 ('layover', 0.3980632722377777),
 ('way', 0.37593039870262146),
 ('leg', 0.36290422081947327)]

In [12]:

gensim_model.wv.most_similar('seat')

[('seating', 0.74801105260849),
 ('position', 0.6118561029434204),
 ('foot', 0.5807281732559204),
 ('recline', 0.5806661248207092),
 ('row', 0.5783480405807495),
 ('bed', 0.5763125419616699),
 ('window', 0.5645354390144348),
 ('section', 0.5632432103157043),
 ('headrest', 0.5509124994277954),
 ('legroom', 0.5350437760353088)]

In [13]:

gensim_model.wv.most_similar('legroom')

[('pitch', 0.8322235345840454),
 ('width', 0.809935450553894),
 ('space', 0.7636929750442505),
 ('privacy', 0.7523685097694397),
 ('seating', 0.7330620884895325),
 ('legspace', 0.7146245241165161),
 ('padding', 0.6932995915412903),
 ('spacious', 0.6881359815597534),
 ('footrest', 0.6867506504058838),
 ('storage', 0.6730130910873413)]

In [14]:

gensim_model.wv.most_similar('turbulence')

[('announcement', 0.6088939905166626),
 ('pilot', 0.5695245265960693),
 ('runway', 0.541848361492157),
 ('pandemic', 0.5240767598152161),
 ('fuss', 0.5199780464172363),
 ('captain', 0.5151050090789795),
 ('hit', 0.5056131482124329),
 ('loud', 0.5052484273910522),
 ('sign', 0.5041467547416687),
 ('unusable', 0.49628692865371704)]

In [15]:

gensim_model.wv.most_similar('accommodation')

[('hotel', 0.7772146463394165),
 ('compensation', 0.7032557725906372),
 ('voucher', 0.6587042212486267),
 ('transportation', 0.6485440135002136),
 ('visa', 0.6371828317642212),
 ('alternative', 0.5680559873580933),
 ('accomodation', 0.5598676204681396),
 ('further', 0.5525864362716675),
 ('pullman', 0.5473583340644836),
 ('solution', 0.5426724553108215)]

In [16]:
gensim_model.wv.most_similar('passenger')

[('people', 0.7535462379455566),
 ('pax', 0.7096782326698303),
 ('everyone', 0.5955319404602051),
 ('guest', 0.5390135645866394),
 ('traveller', 0.5148500800132751),
 ('woman', 0.4910232722759247),
 ('curtain', 0.46812739968299866),
 ('those', 0.46643444895744324),
 ('pilot', 0.4639133810997009),
 ('doctor', 0.4573633372783661)]

In [17]:
gensim_model.wv.most_similar('airhostess')

[('thoughtful', 0.7634063959121704),
 ('nature', 0.7368196845054626),
 ('empathetic', 0.7167608737945557),
 ('obliging', 0.7115992307662964),
 ('chatty', 0.7066141366958618),
 ('engaging', 0.7051592469215393),
 ('caring', 0.7001773715019226),
 ('demeanor', 0.6972114443778992),
 ('enthusiasm', 0.6950669288635254),
 ('casual', 0.6814704537391663)]

In [18]:
gensim_model.wv.most_similar('pilot')

[('captain', 0.8869269490242004),
 ('announcement', 0.6928438544273376),
 ('purser', 0.6410172581672668),
 ('instruction', 0.6344566345214844),
 ('sign', 0.6004919409751892),
 ('information', 0.5893152356147766),
 ('cockpit', 0.5780603289604187),
 ('supervisor', 0.5726273655891418),
 ('turbulence', 0.5695245862007141),
 ('safety', 0.5477361679077148)]

In [19]:
gensim_model.wv.most_similar('airline')

[('carrier', 0.8016940951347351),
 ('company', 0.7684745788574219),
 ('airlines', 0.6855190992355347),
 ('emirate', 0.580158531665802),
 ('traveler', 0.5464925169944763),
 ('sq', 0.5464203953742981),
 ('ana', 0.5452247262001038),
 ('sia', 0.5315968990325928),
 ('spicejet', 0.5236966013908386),
 ('country', 0.5143282413482666)]

In [20]:
gensim_model.wv.most_similar('aircraft')

[('plane', 0.8655164837837219),
 ('airplane', 0.7949374318122864),
 ('lavatory', 0.6555835604667664),
 ('dreamliner', 0.609307587146759),
 ('brand', 0.5884124040603638),
 ('a350', 0.5860671401023865),
 ('washroom', 0.5835716128349304),
 ('747', 0.5826417803764343),
 ('787', 0.577915370464325),
 ('737', 0.5732976198196411)]

In [21]:
gensim_model.wv.most_similar('china',topn=20)

[('india', 0.8975402116775513),
 ('vietjet', 0.8172510862350464),
 ('korean', 0.8117948174476624),
 ('lion', 0.8074207901954651),
 ('canada', 0.7746709585189819),
 ('france', 0.7696030735969543),
 ('conditioning', 0.7195947766304016),
 ('oman', 0.7054590582847595),
 ('conditioner', 0.6983872652053833),
 ('asia', 0.6796544790267944),
 ('silk', 0.6791822910308838),
 ('eva', 0.6527798175811768),
 ('host', 0.6377550959587097),
 ('bridge', 0.6070753335952759),
 ('hostess', 0.5840795040130615),
 ('vent', 0.5815262794494629),
 ('freshener', 0.5564339756965637),
 ('cs', 0.5483009219169617),
 ('nz', 0.5440197587013245),
 ('craft', 0.5434877276420593)]

In [22]:
gensim_model.wv.most_similar('japan',topn=20)

[('vietnam', 0.7844059467315674),
 ('united', 0.7663232684135437),
 ('hainan', 0.7489904165267944),
 ('singapore', 0.7360861301422119),
 ('malaysia', 0.7326321601867676),
 ('turkish', 0.6946418285369873),
 ('indigo', 0.6915834546089172),
 ('american', 0.682246744632721),
 ('vistara', 0.6478980183601379),
 ('british', 0.6448171734809875),
 ('australia', 0.6335635781288147),
 ('ana', 0.632028341293335),
 ('dl', 0.6221900582313538),
 ('thai', 0.6061016917228699),
 ('african', 0.6035909652709961),
 ('asiana', 0.6025446057319641),
 ('lufthansa', 0.599966287612915),
 ('jetstar', 0.5986993312835693),
 ('jal', 0.5910171866416931),
 ('europe', 0.5891055464744568)]

In [23]:
reviews.to_csv('airline_df_nlped.csv',index=False)