# Sentiment Analysis of Airline Reviews

In [None]:

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from mappings import contraction_mapping,chat_words_replacements

In [2]:
nltk_sw = nltk.corpus.stopwords.words('english')
spacy_sw = nlp.Defaults.stop_words
stop_words = set(nltk_sw).union(spacy_sw)

# REmoving some words from stop words because of sentiment analysis such as no, not
stop_words.remove('no')
stop_words.remove('not')

# Panda display options
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv('airline_df_cleaned.csv')
reviews = df[['Country','Airline','Review']].copy(deep=True)
reviews.head()

Unnamed: 0,Country,Airline,Review
0,India,indigo-airlines,✅ Trip Verified | Flight was punctual. But no ...
1,India,indigo-airlines,"✅ Trip Verified | My sister, niece and mother..."
2,India,indigo-airlines,✅ Trip Verified | My 77-year-old father was fl...
3,India,indigo-airlines,Not Verified | IndiGo are a low cost airline ...
4,India,indigo-airlines,✅ Trip Verified | My flight 6e 1176 which was...


In [4]:
# Splitting the reviews on the basis of |
reviews['Review'] = reviews['Review'].apply(lambda x: x.lower())

def de_emojify(text):
    # Regular expression pattern to match emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # chinese characters
        "\U00002702-\U000027B0"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # dingbats
        "\u3030"
        "]+",
        flags=re.UNICODE,
    )

    # Remove emojis from the text
    de_emojified_text = emoji_pattern.sub(r"", text)

    return de_emojified_text

reviews['Review'] = reviews['Review'].apply(lambda x: de_emojify(x))
# Splitting on the basis of |
reviews['Review'] = reviews['Review'].apply(lambda x: x.split('|'))
# Checking the length of the reviews if its 2 then taking the last one and if its 1 then taking the first one
reviews['Review'] = reviews['Review'].apply(lambda x: x[-1] if len(x) == 2 else x[0])

# Removing the newlines, tabs or any other special characters
reviews['Review'] = reviews['Review'].apply(lambda x: re.sub(r'\n|\t|\r', '', x))

# Replacing the contractions with their expansions
reviews['Review'] = reviews['Review'].apply(lambda x: ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in x.split(" ")]))

# Replace the chat words with their expansions
reviews['Review'] = reviews['Review'].apply(lambda x: " ".join([chat_words_replacements[t] if t in chat_words_replacements else t for t in x.split(" ")]))

# Remmvoing Html Tags if present
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

reviews['Review'] = reviews['Review'].apply(lambda x: remove_html(x))


# Removing the urls if present
def remove_url(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

reviews['Review'] = reviews['Review'].apply(lambda x: remove_url(x))

# Removing the punctuations
def remove_punctuations(text):
    punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@[\]^_`{|}~]')
    return punc.sub(r' ', text)

reviews['Review'] = reviews['Review'].apply(lambda x: remove_punctuations(x))

# Removing the spaces and replacing them with single space
def remove_spaces(text):
    spaces = re.compile(r'\s+')
    return spaces.sub(r' ', text)

reviews['Review'] = reviews['Review'].apply(lambda x: remove_spaces(x))

# Striping the text
reviews['Review'] = reviews['Review'].apply(lambda x: x.strip())

In [5]:
reviews['Review'][60]

'my gate baggage was missing at delhi airport at the last minutes of boarding airline staff asked me to take off the power bank from my gate baggage and i did but when i got to bangkok my bag was not on flight airport staff did not put my bag in my flight on time and today is the 5th day i did not get any information and the bag yet'

In [6]:
# Lemmatizing the text
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
reviews['Review'] = reviews['Review'].apply(lambda x: lemmatize_text(x))

In [7]:
# Removing the stop words
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stop_words])

In [8]:
reviews['Review'][60]

'gate baggage missing delhi airport minute boarding airline staff asked power bank gate baggage got bangkok bag not flight airport staff not bag flight time today 5th day not information bag'

In [23]:
reviews['Review'].head()

0    flight punctual no comfortable seating crew no...
1    sister niece mother traveling indigo flight bo...
2    77 year old father flying indigo abu dhabi koc...
3    indigo low cost airline prone delay flight lef...
4    flight 6e 1176 colombo chennai chennai mumbai ...
Name: Review, dtype: object

In [9]:
# Tokenizing the each review using word_tokenize
corpus = []
for i in range(0, len(reviews)):
    review = word_tokenize(reviews['Review'][i])
    review = ' '.join(review)
    corpus.append(review)

In [10]:
corpus = [word_tokenize(x)for x in corpus]
corpus

[['flight',
  'punctual',
  'no',
  'comfortable',
  'seating',
  'crew',
  'not',
  'care',
  'passenger',
  'not',
  'book',
  'food',
  'crew',
  'not',
  'ask',
  'passenger',
  'need',
  'food',
  'not',
  'alcohol',
  'gave',
  'people',
  'price'],
 ['sister',
  'niece',
  'mother',
  'traveling',
  'indigo',
  'flight',
  'booked',
  'wheelchair',
  'assistance',
  'mentioned',
  'wheelchair',
  'assistance',
  'waiting',
  'list',
  'reached',
  'airport',
  'mentioned',
  'no',
  'wheel',
  'chair',
  'assistance',
  'provided',
  'reached',
  'airport',
  'close',
  '3',
  'hr',
  'advance',
  'desk',
  'not',
  'help',
  'paid',
  'assistance',
  'wheelchair',
  'simply',
  'ignored',
  'sister',
  "'s",
  'request',
  'departure',
  'gate',
  '85',
  'later',
  'changed',
  'gate',
  '35',
  'minute',
  'mother',
  'walk',
  'long',
  'distance',
  'assistance',
  'airline',
  'crew',
  'crew',
  'guided',
  'family',
  'paid',
  'wheel',
  'chair',
  'assistance',
  'mumb

In [11]:
gensim_model = Word2Vec(min_count=2, vector_size=100, window=7)

gensim_model.build_vocab(corpus)

gensim_model.train(corpus, total_examples=gensim_model.corpus_count, epochs=gensim_model.epochs)


(4239208, 4875730)

In [12]:

gensim_model.wv.most_similar('flight')

[('trip', 0.5328373908996582),
 ('5hours', 0.5267492532730103),
 ('journey', 0.5164800882339478),
 ('connection', 0.46310508251190186),
 ('departure', 0.4398360550403595),
 ('late', 0.43921855092048645),
 ('graciousness', 0.4387531876564026),
 ('schedule', 0.4352688491344452),
 ('sector', 0.43004095554351807),
 ('nearly', 0.42749565839767456)]

In [13]:

gensim_model.wv.most_similar('seat')

[('seating', 0.7733980417251587),
 ('row', 0.7200324535369873),
 ('fully', 0.717946469783783),
 ('window', 0.697523295879364),
 ('aisle', 0.6939951777458191),
 ('position', 0.6799271702766418),
 ('recline', 0.6787154078483582),
 ('twist', 0.6768952012062073),
 ('sit', 0.6616778373718262),
 ('exit', 0.6564600467681885)]

In [14]:

gensim_model.wv.most_similar('legroom')

[('pitch', 0.9352194666862488),
 ('width', 0.8676968812942505),
 ('space', 0.8551918268203735),
 ('wider', 0.8215950727462769),
 ('recline', 0.8034833073616028),
 ('seating', 0.7989224791526794),
 ('cramped', 0.7936792373657227),
 ('comfy', 0.7928248643875122),
 ('spacious', 0.7904322147369385),
 ('slightly', 0.7892315983772278)]

In [15]:

gensim_model.wv.most_similar('turbulence')

[('hit', 0.8607833385467529),
 ('loud', 0.8492173552513123),
 ('seatbelt', 0.8430430293083191),
 ('fasten', 0.8312962651252747),
 ('turned', 0.8232204914093018),
 ('illuminated', 0.8207414746284485),
 ('captain', 0.8131924867630005),
 ('pilot', 0.8126569986343384),
 ('appeared', 0.8003777861595154),
 ('pressed', 0.7973613739013672)]

In [16]:

gensim_model.wv.most_similar('accommodation')

[('hotel', 0.9020826816558838),
 ('voucher', 0.8746092915534973),
 ('stay', 0.8645928502082825),
 ('transportation', 0.8368961215019226),
 ('pullman', 0.8177871108055115),
 ('accomodation', 0.8144041299819946),
 ('coupon', 0.7773797512054443),
 ('compensation', 0.7630071043968201),
 ('motel', 0.7506309151649475),
 ('transport', 0.7465657591819763)]

In [17]:
gensim_model.wv.get_normed_vectors().shape

(12785, 100)

In [18]:
y = gensim_model.wv.index_to_key
y

['flight',
 'not',
 'seat',
 'service',
 'time',
 'airline',
 'food',
 'no',
 'good',
 'staff',
 'hour',
 'crew',
 'cabin',
 'class',
 'meal',
 'check',
 'airport',
 'business',
 'passenger',
 'air',
 'experience',
 'plane',
 'emirate',
 'economy',
 '2',
 'entertainment',
 'singapore',
 'china',
 'dubai',
 'comfortable',
 'boarding',
 'leg',
 'great',
 'new',
 'doha',
 'return',
 'fly',
 'aircraft',
 'drink',
 'lounge',
 'customer',
 'ticket',
 '3',
 'qatar',
 '’',
 'like',
 'attendant',
 'minute',
 'told',
 'guangzhou',
 'friendly',
 'got',
 'airway',
 'day',
 'served',
 'outstanding',
 'long',
 'trip',
 'flew',
 'delhi',
 'delayed',
 'nice',
 'better',
 'luggage',
 'asked',
 'way',
 '5',
 'flying',
 '1',
 'took',
 'choice',
 'ground',
 'best',
 'overall',
 'bag',
 'board',
 'arrived',
 '4',
 'india',
 'system',
 'london',
 'offered',
 'delay',
 'gate',
 'booked',
 'option',
 'baggage',
 'inflight',
 'year',
 'a380',
 'beijing',
 "'s",
 'said',
 'travel',
 'movie',
 'old',
 'use',
 'i

In [19]:
from sklearn.decomposition import PCA

In [20]:
pca = PCA(n_components=3)
result = pca.fit_transform(gensim_model.wv.get_normed_vectors())

In [24]:
gensim_model.wv.most_similar('passenger')

[('people', 0.6886342167854309),
 ('walking', 0.6623228192329407),
 ('curtain', 0.6542983055114746),
 ('pushing', 0.6531941294670105),
 ('pax', 0.649054765701294),
 ('fellow', 0.6413897275924683),
 ('chatting', 0.6413440108299255),
 ('standing', 0.6336990594863892),
 ('stood', 0.6326651573181152),
 ('talking', 0.6248508095741272)]

In [26]:
gensim_model.wv.most_similar('airhostess')

[('demonstrated', 0.9038777351379395),
 ('disrespectful', 0.8987446427345276),
 ('humble', 0.8955747485160828),
 ('impatient', 0.8928471803665161),
 ('defend', 0.8916778564453125),
 ('dismissive', 0.8907102346420288),
 ('shrug', 0.8906366229057312),
 ('reluctance', 0.8905622363090515),
 ('hearing', 0.8900312781333923),
 ('unruly', 0.8879740238189697)]

In [27]:
gensim_model.wv.most_similar('pilot')

[('captain', 0.9366663098335266),
 ('announcement', 0.8739389777183533),
 ('turbulence', 0.8126568794250488),
 ('cockpit', 0.7789049744606018),
 ('kept', 0.7744436264038086),
 ('announced', 0.7730913162231445),
 ('appeared', 0.7418783903121948),
 ('seatbelt', 0.7316807508468628),
 ('tannoy', 0.7272305488586426),
 ('takeoff', 0.7209706902503967)]