# Sentiment Analysis of Airline Reviews

In [1]:

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_trf')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from textblob import TextBlob 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from mappings import contraction_mapping,chat_words_replacements

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk_sw = nltk.corpus.stopwords.words('english')
spacy_sw = nlp.Defaults.stop_words
stop_words = set(nltk_sw).union(spacy_sw)

# Removing some words from stop words because of sentiment analysis such as no, not
stop_words.remove('no')
stop_words.remove('not')

# Panda display options
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', None)

print(stop_words)
print(len(stop_words))

{'amongst', 'didn', 'for', 'she', 'same', 'your', 'himself', 'at', "'ll", 'he', 'were', 'their', 'would', 'but', 'a', "shan't", 'and', 'off', 'empty', "you've", 'perhaps', 'therefore', 'ten', 'i', 'within', 'hereafter', 'been', 'enough', 'myself', 'won', 'noone', 'elsewhere', 'make', '‘s', 'mustn', 'former', 'am', 'each', 'then', 'least', 'above', 'from', 'ma', 'o', 'others', 'sometimes', 'wherever', 'less', 'doesn', 'into', 'please', 'in', 'several', 'although', 'latterly', 'towards', 'm', 'whatever', 'have', 'an', 'twelve', 'anyone', 'formerly', '‘ve', 'hers', 'y', "weren't", 'n‘t', 'anyway', 'very', 'yours', 'yourself', 'down', '‘ll', 'move', 'next', 'keep', '’ve', 'anything', 'our', 'ca', 'somehow', 'me', 'my', 'namely', 'must', 'after', 'where', "that'll", 'who', 'done', 'everything', 'latter', 'before', 'whereas', 'whose', 'put', "mustn't", 'couldn', 'six', 're', 'herself', 'these', 'never', 'part', "'ve", '‘re', 'afterwards', 'had', 'together', 'made', 'thru', "wouldn't", 'shan'

In [3]:
df = pd.read_csv('airline_df_cleaned.csv')
reviews = df[['Country','Airline','Review']].copy(deep=True)
reviews.head()

Unnamed: 0,Country,Airline,Review
0,China,Air China,los angeles to beijing return. food low qualit...
1,China,Air China,round to trip from hong kong to munich. the ma...
2,China,Air China,sydney to beijing to paris then rome to beijin...
3,China,Air China,london to sydney return via beijing. a cheap f...
4,China,Air China,beijing to shanghai. only one check to in desk...


In [4]:
reviews['Review'][60]

'hong kong to beijing to tokyo 12 12 12. first flight a321 cabin crew outstanding although the english speaking fa had a very heavy accent which made it a little difficult to understand her. service very attentive they actually opened the slippers and blanket pouches for you. meal service came and they opened the tray tables for you. breakfast however was watery eggs tasteless chicken sausage and spinach. in the past i have always found catering from hong kong to be outstanding but this was far from what is usual. transit in beijing was good arrived and departed from hard stands business class passengers had their own minibus to take them to and from the terminal. however transit in beijing meant going through an immigration check and there was only one open desk which the immigration officer took 10 mins or more on the passengers ahead of me on line. beijing to tokyo on 738 with only 8 seats. catering was sub par the steak was overcooked and the appetizers were brown tofu and dried fi

In [9]:
# Lemmatizing the text
def stemmer(text):
    text = ' '.join([word for word in text.split() if word not in stop_words])
    stemm = SnowballStemmer('english')
    text = ' '.join([stemm.stem(word) for word in text.split()])
    return text

# Lemmatizing the text

def lemmatizer(text):
    text = ' '.join([word for word in text.split() if word not in stop_words])
    lemm = WordNetLemmatizer()
    text = ' '.join([lemm.lemmatize(word) for word in text.split()])
    return text

# Removing the special characters

reviews['Review'] = reviews['Review'].apply(lambda x: lemmatizer(x))
reviews['Review'] = reviews['Review'].apply(lambda x: stemmer(x))

In [10]:
def words_only(text):
    return " ".join(re.findall(r'[A-Za-z]+', text))

reviews['Review'] = reviews['Review'].apply(lambda x: words_only(x))

reviews['Review'][60]

'hong kong beij tokyo flight a cabin crew outstand english speak fa heavi accent littl difficult understand her servic attent actual open slipper blanket pouch you meal servic came open tray tabl you breakfast wateri egg tasteless chicken sausag spinach past found cater hong kong outstand far usual transit beij good arriv depart hard stand busi class passeng minibus terminal transit beij meant go immigr check open desk immigr offic took min passeng ahead line beij tokyo seats cater sub par steak overcook appet brown tofu dri fish cabin crew excellent'

In [11]:
# Tokenizing the each review using word_tokenize
corpus = []
for i in range(0, len(reviews)):
    review = word_tokenize(reviews['Review'][i])
    review = ' '.join(review)
    corpus.append(review)

In [12]:
corpus = [word_tokenize(x)for x in corpus]
corpus

[['los',
  'angel',
  'beij',
  'return',
  'food',
  'low',
  'qualiti',
  'staff',
  'appear',
  'time',
  'mandatori',
  'servic',
  'larg',
  'screen',
  'view',
  'main',
  'bulkhead',
  'sound',
  'trips',
  'older',
  's',
  'staff',
  'irregular',
  'interv',
  'make',
  'hard',
  'tri',
  'sleep',
  'good',
  'make',
  'busi',
  'transact',
  'ignor',
  'count',
  'money',
  'look',
  'receipts',
  'want',
  'cheap',
  'basic',
  'airlin',
  'trip',
  'china',
  'airlin',
  'you'],
 ['round',
  'trip',
  'hong',
  'kong',
  'munich',
  'main',
  'reason',
  'fli',
  'air',
  'china',
  'no',
  'surpris',
  'price',
  'best',
  'good',
  'thing',
  'flight',
  'food',
  'atroci',
  'plane',
  'worn',
  'call',
  'entertain',
  'system',
  'awful',
  'bathroom',
  'bad',
  'conclus',
  'prioriti',
  'time',
  'pay',
  'extra',
  'fli',
  'airline'],
 ['sydney',
  'beij',
  'pari',
  'rome',
  'beij',
  'sydney',
  'famili',
  'holiday',
  'hr',
  'flights',
  'said',
  'price',


In [13]:
# Building the Word2Vec model

gensim_model = Word2Vec(vector_size=100, window=5)

gensim_model.build_vocab(corpus)

gensim_model.train(corpus, total_examples=gensim_model.corpus_count, epochs=gensim_model.epochs)


(4102996, 4857335)

In [14]:

gensim_model.wv.most_similar('flight')

[('flights', 0.7022088170051575),
 ('journey', 0.5176708698272705),
 ('sector', 0.5150142908096313),
 ('late', 0.4402056634426117),
 ('time', 0.43927982449531555),
 ('schedul', 0.43453145027160645),
 ('layov', 0.4285428524017334),
 ('guangzhou', 0.4278758466243744),
 ('reason', 0.42210084199905396),
 ('plane', 0.41959622502326965)]

In [15]:

gensim_model.wv.most_similar('seat')

[('seats', 0.8993329405784607),
 ('row', 0.7268254160881042),
 ('fulli', 0.7041015625),
 ('window', 0.6903613805770874),
 ('aisl', 0.6549650430679321),
 ('privaci', 0.6510903239250183),
 ('tall', 0.6486971974372864),
 ('bulkhead', 0.6412966847419739),
 ('ft', 0.63712078332901),
 ('reclin', 0.6293007731437683)]

In [16]:

gensim_model.wv.most_similar('legroom')

[('pitch', 0.9327129125595093),
 ('width', 0.8544057011604309),
 ('space', 0.8353104591369629),
 ('seating', 0.8233251571655273),
 ('wider', 0.8154447674751282),
 ('cramped', 0.7998308539390564),
 ('inch', 0.7963926792144775),
 ('cramp', 0.796027660369873),
 ('narrow', 0.7920704483985901),
 ('cushion', 0.7887054681777954)]

In [17]:

gensim_model.wv.most_similar('turbulence')

[('dust', 0.8984588980674744),
 ('annoying', 0.8981080055236816),
 ('depth', 0.8937667608261108),
 ('cleaning', 0.891815185546875),
 ('unusable', 0.8911714553833008),
 ('flip', 0.890113115310669),
 ('ears', 0.8865798115730286),
 ('climat', 0.8860422372817993),
 ('slant', 0.8860230445861816),
 ('lid', 0.8833178281784058)]

In [18]:

gensim_model.wv.most_similar('accommodation')

[('moral', 0.8917257189750671),
 ('quarantin', 0.874610960483551),
 ('uae', 0.8733111619949341),
 ('hkd', 0.8698779940605164),
 ('arrangements', 0.8690431714057922),
 ('ordeal', 0.8679969906806946),
 ('cancellation', 0.8656245470046997),
 ('upcom', 0.8650971055030823),
 ('court', 0.8647080659866333),
 ('country', 0.8633264899253845)]

In [19]:
gensim_model.wv.most_similar('passenger')

[('men', 0.836449384689331),
 ('cane', 0.8194712400436401),
 ('agents', 0.8123810887336731),
 ('compli', 0.811948835849762),
 ('spite', 0.8104616403579712),
 ('myself', 0.804705798625946),
 ('volunt', 0.8028032779693604),
 ('name', 0.8013089299201965),
 ('somebodi', 0.7974607348442078),
 ('beg', 0.7942280173301697)]

In [20]:
gensim_model.wv.most_similar('airhostess')

[('director', 0.8940794467926025),
 ('passion', 0.8828932046890259),
 ('requests', 0.8814606666564941),
 ('aircrew', 0.8566915988922119),
 ('enthusiasm', 0.8566388487815857),
 ('intrus', 0.8549571633338928),
 ('politely', 0.8520509600639343),
 ('unsmil', 0.8495770692825317),
 ('mister', 0.8485177159309387),
 ('fas', 0.8472004532814026)]

In [21]:
gensim_model.wv.most_similar('pilot')

[('captain', 0.9333938360214233),
 ('announc', 0.8736606240272522),
 ('turbul', 0.7761217951774597),
 ('cockpit', 0.7543299198150635),
 ('airborne', 0.744971752166748),
 ('descent', 0.7363412976264954),
 ('congest', 0.729268491268158),
 ('off', 0.7291664481163025),
 ('rush', 0.7182056307792664),
 ('runway', 0.7140573263168335)]

In [22]:
gensim_model.wv.most_similar('airline')

[('emirates', 0.8213285803794861),
 ('airlin', 0.8198407888412476),
 ('airlines', 0.7751621007919312),
 ('honest', 0.745065450668335),
 ('hesit', 0.7357279658317566),
 ('company', 0.7108476161956787),
 ('life', 0.7043099999427795),
 ('anyone', 0.701680600643158),
 ('compani', 0.7006176114082336),
 ('with', 0.6848890781402588)]

In [23]:
gensim_model.wv.most_similar('aircraft')

[('plane', 0.8686224818229675),
 ('airplan', 0.7993035316467285),
 ('lavatori', 0.6997485756874084),
 ('interior', 0.6739364862442017),
 ('maintained', 0.650669515132904),
 ('b', 0.6481779217720032),
 ('er', 0.6397911310195923),
 ('a', 0.6388198137283325),
 ('planes', 0.6299272179603577),
 ('boe', 0.6284643411636353)]

In [24]:
gensim_model.wv.most_similar('china',topn=20)

[('india', 0.8770084977149963),
 ('canada', 0.869855523109436),
 ('lion', 0.8695163130760193),
 ('franc', 0.8428460359573364),
 ('france', 0.8382934927940369),
 ('korean', 0.835110604763031),
 ('silk', 0.8323430418968201),
 ('oman', 0.8219142556190491),
 ('vietjet', 0.8168314695358276),
 ('eva', 0.7055134177207947),
 ('host', 0.6616425514221191),
 ('nz', 0.6536868214607239),
 ('woeful', 0.6437628269195557),
 ('bridg', 0.6140555739402771),
 ('asia', 0.5793871283531189),
 ('korea', 0.5735454559326172),
 ('bridge', 0.5589327216148376),
 ('vent', 0.5546151399612427),
 ('hostess', 0.544735848903656),
 ('con', 0.5261220932006836)]