
# Importing basic libraries for data analysis

In [1]:

import pandas as pd 
import numpy as np
import re
from mappings import contraction_mapping, chat_words_replacements,airport_codes
from SW import stopwords_airline, negative_words, stopwords_extra
import spacy
nlp = spacy.load('en_core_web_lg')
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [2]:
# Reading the data from the csv file
df = pd.read_csv('airline_df.csv')
df.head()

Unnamed: 0,Airline,Country,Review,Date_Published,Type of Traveller,Seat Type,Route,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value for Money,Recommended
0,indigo-airlines,India,✅ Trip Verified | Flight was punctual. But no ...,2023-05-10,Solo Leisure,Economy Class,Abu Dhabi to Kochi,1.0,1.0,1.0,,3.0,1.0,no
1,indigo-airlines,India,"✅ Trip Verified | My sister, niece and mother...",2023-05-07,Family Leisure,Economy Class,Mumbai to Mangalore,1.0,,,,1.0,1.0,no
2,indigo-airlines,India,✅ Trip Verified | My 77-year-old father was fl...,2023-04-28,Solo Leisure,Economy Class,Abu Dhabi to Kochi,2.0,2.0,,,2.0,4.0,no
3,indigo-airlines,India,Not Verified | IndiGo are a low cost airline ...,2023-04-24,Solo Leisure,Economy Class,Jaipur to Ahmedabad,2.0,2.0,,,2.0,3.0,yes
4,indigo-airlines,India,✅ Trip Verified | My flight 6e 1176 which was...,2023-04-22,Family Leisure,Economy Class,Colombo to Mumbai via Chennai,1.0,1.0,1.0,1.0,1.0,1.0,no


# Data Description

In [3]:
# Info of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15220 entries, 0 to 15219
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Airline                 15220 non-null  object 
 1   Country                 15220 non-null  object 
 2   Review                  15220 non-null  object 
 3   Date_Published          15220 non-null  object 
 4   Type of Traveller       10974 non-null  object 
 5   Seat Type               15047 non-null  object 
 6   Route                   10962 non-null  object 
 7   Seat Comfort            14222 non-null  float64
 8   Cabin Staff Service     14208 non-null  float64
 9   Food & Beverages        13085 non-null  float64
 10  Inflight Entertainment  11762 non-null  float64
 11  Ground Service          10720 non-null  float64
 12  Value for Money         15044 non-null  float64
 13  Recommended             15220 non-null  object 
dtypes: float64(6), object(8)
memory usage:

In [4]:
# Checking the null values in the data
df.isnull().sum().sort_values(ascending=False)

Ground Service            4500
Route                     4258
Type of Traveller         4246
Inflight Entertainment    3458
Food & Beverages          2135
Cabin Staff Service       1012
Seat Comfort               998
Value for Money            176
Seat Type                  173
Airline                      0
Country                      0
Review                       0
Date_Published               0
Recommended                  0
dtype: int64

# Stopwords Collection Making

In [5]:
# Joining the stopwords to make a big stopwords collection

stopwords = set(stopwords_airline).union(set(stopwords_extra)).union(nlp.Defaults.stop_words)

# For sentiment analysis, we will need negative words as well. So, we have to remove the negative words from the stopwords list

stopwords = set([word for word in stopwords if word not in negative_words])

print(len(stopwords))

795


# Data Cleaning and Preprocessing

* ## Data Imputation

In [6]:
# Separating the numerical and categorical columns
num_col = df.select_dtypes(include=np.number).columns.tolist()
obj_col = df.select_dtypes(include='object').columns.tolist()
cat_col = [x for x in obj_col if df[x].nunique() < 21]

print('Numerical Columns: ',num_col)
print('Categorical Columns: ',cat_col)
print('Object Columns: ',obj_col)

Numerical Columns:  ['Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 'Inflight Entertainment', 'Ground Service', 'Value for Money']
Categorical Columns:  ['Airline', 'Country', 'Type of Traveller', 'Seat Type', 'Recommended']
Object Columns:  ['Airline', 'Country', 'Review', 'Date_Published', 'Type of Traveller', 'Seat Type', 'Route', 'Recommended']


# DataFrame Cleaning

In [7]:
def cleanse_dataframe(df):
    
    imputer = KNNImputer(n_neighbors=7)
    df[num_col] = imputer.fit_transform(df[num_col])

    # As well as changing the data type of the numerical columns to int8
    
    df[num_col] = df[num_col].astype('int8')
    
    # Changing the Date_Published column to datetime format

    df['Date_Published'] = pd.to_datetime(df['Date_Published'])
    
    # Sort the data by Airline and Date_Published so that we can fill the missing values in the categorical columns

    df.sort_values(by=['Airline','Date_Published'],inplace=True)
    
    # Imputing the missing values in the categorical columns with mode using Backward fill
    
    df[obj_col] = df[obj_col].fillna(method='bfill')

    # Mapping Recommended to 1 and Not Recommended to 0
    
    df['Recommended'] = df['Recommended'].replace({'yes':1,'no':0})

    # As well as changing the data type of the categorical columns to category
    
    df[cat_col] = df[cat_col].astype('category')
    
    # lastly dropping the duplicate records
    
    df.drop_duplicates(inplace=True)

    # Preprocessing the Airline column

    df['Airline'] = df['Airline'].apply(lambda x: re.sub(r'-',' ',x))
    df['Airline'] = df['Airline'].str.title()
    
    # Creating a new column for Overall Rating
    
    df['Overall_Rating'] = df[num_col].mean(axis=1).round(2)
    
    # Creating a new column for Sentiment where 1 is positive and 0 is negative sentiment
    
    df['Sentiment'] = df['Overall_Rating'].apply(lambda x: 1 if x > 3 else (0 if x < 3 else 2))
    
    # Dropping the records having neutral sentiment
    
    df = df[df['Sentiment'] != 2]
    
    # Resetting the index and dropping the Route column
    
    df.drop(['Route'],axis=1,inplace=True)    
    df.reset_index(drop=True,inplace=True)
        
    return df

df = cleanse_dataframe(df)

* ## Text Preprocessing

In [8]:

import re

def text_preprocess(text):
    
    # Split on the basis of '|'
    text = text.split('|')

    # Check length and take the appropriate part
    if len(text) == 2:
        text = text[-1]
    else:
        text = text[0]
        
    # Remove punctuation
    punc = re.compile(r'-')
    text = punc.sub(r' ', text)
    
    # Remove brackets
    brackets = re.compile(r'[()[\]{}!@#^&*]')
    text = brackets.sub(r'', text)
    
    # Map Airport Codes to Country Names
    text = " ".join([airport_codes[t] if t in airport_codes else t for t in text.split(" ")])
    
    # Convert to lowercase
    text = text.lower()

    # Remove emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # chinese characters
        "\U00002702-\U000027B0"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # dingbats
        "\u3030"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub(r"", text)

    # Remove special characters: newlines, tabs, etc.
    text = re.sub(r'\n|\t|\r', '', text)

    # Map contractions to expansions
    text = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in text.split(" ")])

    # Map chat words to formal words
    text = " ".join([chat_words_replacements[t] if t in chat_words_replacements else t for t in text.split(" ")])
    
    # Remove HTML tags
    html_pattern = re.compile('<.*?>')
    text = html_pattern.sub(r'', text)

    # Remove URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'', text)
    
    # Remove extra spaces
    spaces = re.compile(r'\s+')
    text = spaces.sub(r' ', text)

    # Strip leading/trailing spaces
    text = text.strip()
    
    # Convert to lowercase
    text = text.lower()

    return text

df['Review'] = df['Review'].apply(text_preprocess)

# Flair Sentiment Analysis

from flair.models import TextClassifier
from flair.data import Sentence

classifier = TextClassifier.load('sentiment-fast')

def flair_sentiment(text):
    sentence = Sentence(text)
    classifier.predict(sentence)
    return sentence.labels[0].value

df['Sentiment_Flair'] = df['Review'].apply(flair_sentiment)

df['Sentiment_Flair'] = df['Sentiment_Flair'].apply(lambda x: 1 if x == 'POSITIVE' else 0)

In [9]:
# Creating a new column for the length of the review

df['Review_Length'] = df['Review'].apply(lambda x: len(x.split()))

print('Maximum Review Length: ',df['Review_Length'].max())
print('Minimum Review Length: ',df['Review_Length'].min())
print('Average Review Length: ',df['Review_Length'].mean())

Maximum Review Length:  686
Minimum Review Length:  14
Average Review Length:  129.50051268029256


In [10]:
# Lemmatizing the reviews using spacy

def lemmatizer(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

df['Cleaned_Review'] = df['Review'].apply(lemmatizer)
# Remvoing the stopwords from the reviews

df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))

In [11]:
# Creating a new column for the length of the review

df['Cleaned_Review_Length'] = df['Cleaned_Review'].apply(lambda x: len(x.split()))

print('Maximum Review Length: ',df['Cleaned_Review_Length'].max())
print('Minimum Review Length: ',df['Cleaned_Review_Length'].min())
print('Average Review Length: ',df['Cleaned_Review_Length'].mean())

Maximum Review Length:  385
Minimum Review Length:  8
Average Review Length:  67.54200560530454


In [12]:
df.to_csv('airline_cleaned.csv',index=False)

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

print('Accuracy Score: ',accuracy_score(df['Sentiment'],df['Sentiment_Flair']))
print('ROC AUC Score: ',roc_auc_score(df['Sentiment'],df['Sentiment_Flair']))
print('Confusion Matrix: \n',confusion_matrix(df['Sentiment'],df['Sentiment_Flair']))
print('Classification Report: \n',classification_report(df['Sentiment'],df['Sentiment_Flair']))

print('Accuracy Score: ',accuracy_score(df['Sentiment'],df['Sentiment_Vader']))
print('ROC AUC Score: ',roc_auc_score(df['Sentiment'],df['Sentiment_Vader']))
print('Confusion Matrix: \n',confusion_matrix(df['Sentiment'],df['Sentiment_Vader']))
print('Classification Report: \n',classification_report(df['Sentiment'],df['Sentiment_Vader']))

print('Accuracy Score: ',accuracy_score(df['Sentiment'],df['Sentiment_TextBlob']))
print('ROC AUC Score: ',roc_auc_score(df['Sentiment'],df['Sentiment_TextBlob']))
print('Confusion Matrix: \n',confusion_matrix(df['Sentiment'],df['Sentiment_TextBlob']))
print('Classification Report: \n',classification_report(df['Sentiment'],df['Sentiment_TextBlob']))