
# Importing basic libraries for data analysis

In [1]:

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from mappings import contraction_mapping, chat_words_replacements,airport_codes
sns.set_theme()

# Setting the best style for the plots in seaborn

import datetime as dt
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [2]:
# Reading the data from the csv file
df = pd.read_csv('airline_df.csv')
df.head()

Unnamed: 0,Airline,Country,Review,Date_Published,Type of Traveller,Seat Type,Route,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value for Money,Recommended
0,indigo-airlines,India,✅ Trip Verified | Flight was punctual. But no ...,2023-05-10,Solo Leisure,Economy Class,Abu Dhabi to Kochi,1.0,1.0,1.0,,3.0,1.0,no
1,indigo-airlines,India,"✅ Trip Verified | My sister, niece and mother...",2023-05-07,Family Leisure,Economy Class,Mumbai to Mangalore,1.0,,,,1.0,1.0,no
2,indigo-airlines,India,✅ Trip Verified | My 77-year-old father was fl...,2023-04-28,Solo Leisure,Economy Class,Abu Dhabi to Kochi,2.0,2.0,,,2.0,4.0,no
3,indigo-airlines,India,Not Verified | IndiGo are a low cost airline ...,2023-04-24,Solo Leisure,Economy Class,Jaipur to Ahmedabad,2.0,2.0,,,2.0,3.0,yes
4,indigo-airlines,India,✅ Trip Verified | My flight 6e 1176 which was...,2023-04-22,Family Leisure,Economy Class,Colombo to Mumbai via Chennai,1.0,1.0,1.0,1.0,1.0,1.0,no


# Data Description

In [3]:
# Description of the data
df.describe(include='all')

Unnamed: 0,Airline,Country,Review,Date_Published,Type of Traveller,Seat Type,Route,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value for Money,Recommended
count,15220,15220,15220,15220,10974,15047,10962,14222.0,14208.0,13085.0,11762.0,10720.0,15044.0,15220
unique,19,9,15205,3718,4,4,6716,,,,,,,2
top,emirates,India,Manchester to Doha and then Bangkok 24th Janua...,2015-01-14,Solo Leisure,Economy Class,Guangzhou to Sydney,,,,,,,yes
freq,2266,3640,2,39,4534,11214,30,,,,,,,8913
mean,,,,,,,,3.425116,3.588401,3.343676,3.432409,3.213433,3.372108,
std,,,,,,,,1.390692,1.524603,1.439589,1.408876,1.637206,1.533749,
min,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,
25%,,,,,,,,2.0,2.0,2.0,2.0,1.0,2.0,
50%,,,,,,,,4.0,4.0,4.0,4.0,4.0,4.0,
75%,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,


In [4]:
# Info of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15220 entries, 0 to 15219
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Airline                 15220 non-null  object 
 1   Country                 15220 non-null  object 
 2   Review                  15220 non-null  object 
 3   Date_Published          15220 non-null  object 
 4   Type of Traveller       10974 non-null  object 
 5   Seat Type               15047 non-null  object 
 6   Route                   10962 non-null  object 
 7   Seat Comfort            14222 non-null  float64
 8   Cabin Staff Service     14208 non-null  float64
 9   Food & Beverages        13085 non-null  float64
 10  Inflight Entertainment  11762 non-null  float64
 11  Ground Service          10720 non-null  float64
 12  Value for Money         15044 non-null  float64
 13  Recommended             15220 non-null  object 
dtypes: float64(6), object(8)
memory usage:

In [5]:
# Checking the null values in the data
df.isnull().sum().sort_values(ascending=False)

Ground Service            4500
Route                     4258
Type of Traveller         4246
Inflight Entertainment    3458
Food & Beverages          2135
Cabin Staff Service       1012
Seat Comfort               998
Value for Money            176
Seat Type                  173
Airline                      0
Country                      0
Review                       0
Date_Published               0
Recommended                  0
dtype: int64

# Data Cleaning and Preprocessing

* ## Data Imputation

In [6]:
# Separating the numerical and categorical columns
num_col = df.select_dtypes(include=np.number).columns.tolist()
obj_col = df.select_dtypes(include='object').columns.tolist()
cat_col = [x for x in obj_col if df[x].nunique() < 21]

print('Numerical Columns: ',num_col)
print('Categorical Columns: ',cat_col)

Numerical Columns:  ['Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 'Inflight Entertainment', 'Ground Service', 'Value for Money']
Categorical Columns:  ['Airline', 'Country', 'Type of Traveller', 'Seat Type', 'Recommended']


In [7]:
# Imputing the missing values in the numerical columns with mode using simple imputer
from sklearn.impute import KNNImputer

def cleanse_data(df):    
    
    imputer = KNNImputer(n_neighbors=7)
    df[num_col] = imputer.fit_transform(df[num_col])

    # As well as changing the data type of the numerical columns to int8
    
    df[num_col] = df[num_col].astype('int8')
    
    # Changing the Date_Published column to datetime format

    df['Date_Published'] = pd.to_datetime(df['Date_Published'])
    
    # Sort the data by Airline and Date_Published so that we can fill the missing values in the categorical columns

    df.sort_values(by=['Airline','Date_Published'],inplace=True)
    
    # Imputing the missing values in the categorical columns with mode using Backward fill
    
    df[obj_col] = df[obj_col].fillna(method='bfill')

    # Mapping Recommended to 1 and Not Recommended to 0
    
    df['Recommended'] = df['Recommended'].map({'yes':1,'no':0})

    # As well as changing the data type of the categorical columns to category
    
    df[cat_col] = df[cat_col].astype('category')

    # Cleaning the Route column

    df['Route'] = df['Route'].apply(lambda x: x.split('via')[0].strip() if 'via' in x else x.strip())

    # Dropping the record where the route is Melbourne, Chennai and Zurich
    
    df.drop(df[df['Route'] == 'Melbourne'].index, inplace=True)
    df.drop(df[df['Route'] == 'Chennai'].index, inplace=True)
    df.drop(df[df['Route'] == 'Zurich'].index, inplace=True)

    # Small casing the Route column

    df['Route'] = df['Route'].str.lower()
    
    # ro, to, yo, - will be used for splitting the route column

    # Splitting the Route column into Origin and Destination

    # Origin Extraction

    df['Origin'] = df['Route'].apply(lambda x: x.split('ro')[0].strip() if 'ro' in x else x.strip())
    df['Origin'] = df['Origin'].apply(lambda x: x.split('to')[0].strip() if 'to' in x else x.strip())
    df['Origin'] = df['Origin'].apply(lambda x: x.split('yo')[0].strip() if 'yo' in x else x.strip())
    df['Origin'] = df['Origin'].apply(lambda x: x.split('-')[0].strip() if '-' in x else x.strip())
    df['Origin'] = df['Origin'].str.capitalize()

    # Destination Extraction

    df['Destination'] = df['Route'].apply(lambda x: x.split('ro')[-1].strip() if 'ro' in x else x.strip())
    df['Destination'] = df['Destination'].apply(lambda x: x.split('to')[-1].strip() if 'to' in x else x.strip())
    df['Destination'] = df['Destination'].apply(lambda x: x.split('yo')[-1].strip() if 'yo' in x else x.strip())
    df['Destination'] = df['Destination'].apply(lambda x: x.split('-')[-1].strip() if '-' in x else x.strip())
    df['Destination'] = df['Destination'].str.capitalize()

    # lastly dropping the
    
    df.drop_duplicates(inplace=True)

    # Preprocessing the Airline column

    df['Airline'] = df['Airline'].apply(lambda x: re.sub(r'-',' ',x))
    df['Airline'] = df['Airline'].str.title()

    df.drop(['Route'],axis=1,inplace=True)
    
    df.reset_index(drop=True,inplace=True)
    
    df['Sentiment'] = df['Recommended'].map({1:'Positive',0:'Negative'})
    
    return df

df = cleanse_data(df)

* ## Text Preprocessing

In [8]:
print(contraction_mapping)

{"ain't": 'is not', "aren't": 'are not', "can't": 'cannot', "'cause": 'because', "could've": 'could have', "couldn't": 'could not', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'll": 'he will', "he's": 'he is', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "i'd": 'i would', "i'd've": 'i would have', "i'll": 'i will', "i'll've": 'i will have', "i'm": 'i am', "i've": 'i have', "isn't": 'is not', "it'd": 'it would', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have', "mightn't": 'might not', "mightn't've": 'might not have', "must've": 'must have', "mustn't": 'must not', "mustn't've": 'must not have', "ne

In [9]:
import re

def text_preprocess(text):
    
    # Substitute - with 'to'
    
    text = re.sub(r'-',' to ',text)
    
    # Remove punctuation
    punc = re.compile(r'["#$%&()*+,/:;<=>?@[\]^_`{|}~]')
    text = punc.sub(r' ', text)
    
    # Map Airport Codes to Country Names
    text = " ".join([airport_codes[t] if t in airport_codes else t for t in text.split(" ")])
    
    # Convert to lowercase
    text = text.lower()

    # Remove emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # chinese characters
        "\U00002702-\U000027B0"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # dingbats
        "\u3030"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub(r"", text)

    # Split on the basis of '|'
    text = text.split('|')

    # Check length and take the appropriate part
    if len(text) == 2:
        text = text[-1]
    else:
        text = text[0]

    # Remove special characters: newlines, tabs, etc.
    text = re.sub(r'\n|\t|\r', '', text)

    # Map contractions to expansions
    text = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in text.split(" ")])

    # Map chat words to formal words
    text = " ".join([chat_words_replacements[t] if t in chat_words_replacements else t for t in text.split(" ")])
    
    # Remove HTML tags
    html_pattern = re.compile('<.*?>')
    text = html_pattern.sub(r'', text)

    # Remove URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'', text)

    # Remove punctuation
    punc = re.compile(r'["#$%&()*+,/:;<=>?@[\]^_`{|}~]')
    text = punc.sub(r' ', text)
    
    # Remove extra spaces
    spaces = re.compile(r'\s+')
    text = spaces.sub(r' ', text)

    # Strip leading/trailing spaces
    text = text.strip()

    return text

df['Review'] = df['Review'].apply(lambda x: text_preprocess(x))

In [10]:
df.head()

Unnamed: 0,Airline,Country,Review,Date_Published,Type of Traveller,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value for Money,Recommended,Origin,Destination,Sentiment
0,Air China,China,los angeles to beijing return. food low qualit...,2012-01-30,Solo Leisure,Economy Class,3,2,2,1,1,3,0,Beijing,Xi'an,Negative
1,Air China,China,round to trip from hong kong to munich. the ma...,2012-01-31,Solo Leisure,Economy Class,1,3,1,1,1,3,0,Beijing,Xi'an,Negative
2,Air China,China,sydney to beijing to paris then rome to beijin...,2012-02-03,Solo Leisure,Economy Class,2,3,1,2,1,3,0,Beijing,Xi'an,Negative
3,Air China,China,london to sydney return via beijing. a cheap f...,2012-02-22,Solo Leisure,Economy Class,4,1,2,4,3,4,0,Beijing,Xi'an,Negative
4,Air China,China,beijing to shanghai. only one check to in desk...,2012-02-28,Solo Leisure,Economy Class,4,4,3,3,3,4,1,Beijing,Xi'an,Positive


# Now Plotting the distribution of the numerical columns after imputing the missing values

fig , ax = plt.subplots(3,2,figsize=(15,15))

for i, subplot in zip(num_col, ax.flatten()):
    sns.distplot(df[i], ax=subplot)
plt.show()

# Plotting the distribution of the categorical columns after imputing the missing values

fig , ax = plt.subplots(3,2,figsize=(20,15))

for i, subplot in zip(cat_col, ax.flatten()):
    sns.countplot(x=i,data=df, ax=subplot, palette='CMRmap_r')
    if i in ['Airline','Country']:
        for label in subplot.get_xticklabels():
            label.set_rotation(90)
plt.show()

In [16]:
for i in ['Origin','Destination']:
    # Print the unique values in the column which have length less than 4
    print(i,df[df[i].str.len() < 4][i].unique())

Origin ['Lhr' 'Pek' 'Lax' 'S' '' 'Syd' 'Hkg' 'Fra' 'Yul' 'Sfo' 'Jfk' 'Mel' '{vg'
 'Pvg' 'Nrt' 'Bkk' 'Mnl' 'Hel' 'Isb' 'Yvr' 'Yyc' 'Hnd' 'Vie' 'Akl' 'Icn'
 'Can' 'Sgn' 'Ctu' 'Del' 'Dus' 'Bud' 'Iah' 'Ewr' 'Sha' 'Kul' 'Fco' 'Hkt'
 'Cdg' 'Sin' 'Hgh' 'Nkm' 'Cph' 'Uln' 'Bcn' 'Gmp' 'Hrb' 'Hnl' 'Tpe' 'Xiy'
 'Khn' 'Iad' 'New' 'Bom' 'Ord' 'Bhx' 'Lko' 'Rgn' 'Ccu' 'Ruh' 'Maa' 'Hyd'
 'Goi' 'Dxb' 'Udr' 'Bxh' 'Cms' 'Vns' 'Amd' 'Ixb' 'Bho' 'Ktm' 'Ixc' 'Bbi'
 'Mxp' 'Gay' 'Goa' 'Leh' 'Ccj' 'Jai' 'Vtz' 'Blr' 'Sjc' 'Cgk' 'Osa' 'Fuk'
 'Bru' 'Xmn' 'Sea' 'Itm' 'Kix' 'Aus' 'Tak' 'Vce' 'Usa' 'Hi' 'Cmb' 'Szx'
 'Yyz' 'Cgq' 'Pnh' 'Nkg' 'Ams' 'Wuz' 'Oka' 'Dps' 'Prg' 'Ckg' 'Xnn' 'Mru'
 'Per' 'Nng' 'Tna' 'She' 'Bne' 'Lxa' 'Hak' 'Tsn' 'Foc' 'Svo' 'Nbo' 'An'
 'Phn' 'Sjw' 'Het' 'Zuh' 'Cnx' 'Hfe' 'Vyr' 'Cgo' 'Kmg' 'Csx' 'Tao' 'Ngb'
 'Wuh' 'Nny' 'Chc' 'Pqc' 'Wnz' 'Dlc' 'Afo' 'Kwe' 'Nai' 'Cai' 'Nce' 'Bos'
 'Arn' 'Jnb' 'Adl' 'Bah' 'Doh' 'Cpt' 'Nyc' 'Lgw' 'Gva' 'Gla' 'Dfw' 'Mco'
 'Lis' 'Dur' 'Ncl' 'Man' 'Khi' 'Lun' 'Acc' 'M

df.to_csv('airline_df_cleaned.csv',index=False)