
# Importing basic libraries for data analysis

In [None]:

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from mappings import contraction_mapping, chat_words_replacements,airport_codes
from SW import stopwords_airline, negative_words, stopwords_extra
sns.set_theme(style="whitegrid")
import spacy
nlp = spacy.load('en_core_web_lg')

import datetime as dt
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [None]:
# Load the raw data from 'airline_df.csv' (path is relative to the working
# directory) into `df`, then preview the first rows.
df = pd.read_csv('airline_df.csv')
df.head()

# Data Description

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns to the report alongside the numeric ones.
df.describe(include='all')

In [None]:
# Column names, dtypes, non-null counts and memory usage of the frame
df.info()

In [None]:
# Count the missing values per column and list the worst offenders first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

# Building the Stopword Collection

In [None]:
# Pool the stopword sources into a single set, then take the negative words
# back out: sentiment analysis needs them to survive stopword filtering.

all_sources = set(stopwords_airline) | set(negative_words) | set(stopwords_extra)

stopwords = all_sources - set(negative_words)

# Data Cleaning and Preprocessing

* ## Data Imputation

In [None]:
# Split the columns by dtype: numeric, object, and the low-cardinality subset
# of the object columns (fewer than 21 distinct values) treated as categorical.
num_col = list(df.select_dtypes(include=np.number).columns)
obj_col = list(df.select_dtypes(include='object').columns)

cat_col = []
for column_name in obj_col:
    if df[column_name].nunique() < 21:
        cat_col.append(column_name)

print('Numerical Columns: ', num_col)
print('Categorical Columns: ', cat_col)
print('Object Columns: ', obj_col)

In [None]:
# Imputing the missing values in the numerical columns with a KNN imputer (7 nearest neighbours)
from sklearn.impute import KNNImputer

def cleanse_dataframe(df):
    """Impute, type-cast and tidy the raw review frame.

    Relies on the notebook-level lists ``num_col``, ``obj_col`` and ``cat_col``
    and on ``KNNImputer`` having been imported before the call.

    Steps, in order:
      1. KNN-impute (k=7) the numeric columns, round them and store them as int8.
      2. Parse ``Date_Published`` and sort by ``Airline`` / ``Date_Published``.
      3. Fill gaps in the object columns from neighbouring rows.
      4. Encode ``Recommended`` ('yes'/'no') as 1/0 and cast the columns in
         ``cat_col`` to ``category``.
      5. Drop duplicate rows, prettify ``Airline``, add ``Sentiment`` and drop
         ``Route``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame with a fresh 0..n-1 index.
    """
    imputer = KNNImputer(n_neighbors=7)
    df[num_col] = imputer.fit_transform(df[num_col])

    # KNN imputation produces neighbour averages (floats). Round before the
    # cast: astype('int8') on its own truncates toward zero (3.9 -> 3).
    # NOTE(review): assumes every numeric value fits in int8 -- confirm.
    df[num_col] = df[num_col].round().astype('int8')

    df['Date_Published'] = pd.to_datetime(df['Date_Published'])

    # Sort so that the fill below borrows values from rows of the same airline
    # that are close in time wherever possible.
    df.sort_values(by=['Airline', 'Date_Published'], inplace=True)

    # bfill() replaces the deprecated fillna(method='bfill'). The trailing
    # ffill() covers gaps at the very end of the frame, which a backward fill
    # alone cannot reach.
    df[obj_col] = df[obj_col].bfill().ffill()

    # Mapping Recommended to 1 and Not Recommended to 0
    df['Recommended'] = df['Recommended'].replace({'yes': 1, 'no': 0})

    df[cat_col] = df[cat_col].astype('category')

    df.drop_duplicates(inplace=True)

    # Airline names: hyphens become spaces, then Title Case. The vectorised
    # .str methods pass missing values through instead of raising the way
    # re.sub does when it is handed a NaN.
    df['Airline'] = df['Airline'].str.replace('-', ' ', regex=False).str.title()

    df.reset_index(drop=True, inplace=True)

    df['Sentiment'] = df['Recommended'].map({1: 'Positive', 0: 'Negative'})

    # Route is not kept in the cleaned frame
    df.drop(['Route'], axis=1, inplace=True)

    return df

df = cleanse_dataframe(df)

In [None]:
# Preview the first rows of the frame returned by cleanse_dataframe
df.head()

* ## Text Preprocessing

In [None]:

import re

# Patterns are compiled once here rather than on every call.
_HTML_TAG_RE = re.compile('<.*?>')
_URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
_PUNCT_RE = re.compile(r'["#$%&()*+,/:;<=>?@[\]^_`{|}~]')
_LINE_BREAK_RE = re.compile(r'\n|\t|\r')
_SPACES_RE = re.compile(r'\s+')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # NOTE: spans every code point from U+24C2 up to U+1F251
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def text_preprocess(text):
    """Normalise one raw review string for downstream NLP.

    Uses the notebook-level mappings ``airport_codes``, ``contraction_mapping``
    and ``chat_words_replacements``.

    Order of operations (the order matters):
      1. If the text contains ``|``, keep only what follows the first one.
      2. Strip HTML tags and URLs while their delimiters are still intact.
      3. Turn newlines/tabs into spaces, ``-`` into `` to ``, punctuation into
         spaces.
      4. Replace tokens found in ``airport_codes`` (case-sensitive lookup).
      5. Lowercase, remove emoji/symbol characters.
      6. Expand contractions and chat words, strip punctuation again, collapse
         whitespace.

    Parameters
    ----------
    text : str
        Raw review. Any non-string value (e.g. NaN) yields ``''``.

    Returns
    -------
    str
        The cleaned, lower-cased, single-spaced review.
    """
    if not isinstance(text, str):
        return ''

    # This has to run before the punctuation pass: '|' is part of the
    # punctuation class, so splitting afterwards could never find a separator.
    head, separator, tail = text.partition('|')
    text = tail if separator else head

    # Likewise, tags and URLs must go before '<', '>', ':' and '/' are blanked
    # out, otherwise these patterns have nothing left to match.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub('', text)

    # Replace line breaks/tabs with a space (not ''), so the words on either
    # side are not glued together.
    text = _LINE_BREAK_RE.sub(' ', text)

    # Substitute - with 'to'.
    # NOTE(review): this also rewrites ordinary hyphenated words
    # ('check-in' -> 'check to in').
    text = text.replace('-', ' to ')

    text = _PUNCT_RE.sub(' ', text)

    # Airport-code lookup happens before lowercasing because it is case-sensitive
    text = " ".join(airport_codes[t] if t in airport_codes else t for t in text.split(" "))

    text = text.lower()
    text = _EMOJI_RE.sub('', text)

    # Map contractions to expansions, then chat words to formal words
    text = " ".join(contraction_mapping[t] if t in contraction_mapping else t for t in text.split(" "))
    text = " ".join(chat_words_replacements[t] if t in chat_words_replacements else t for t in text.split(" "))

    # Second punctuation pass, in case a replacement introduced any
    text = _PUNCT_RE.sub(' ', text)

    return _SPACES_RE.sub(' ', text).strip()

df['Review'] = df['Review'].apply(text_preprocess)

In [None]:
# Word count of each preprocessed review, stored as a new column

df['Review_Length'] = df['Review'].map(lambda review: len(review.split()))

review_lengths = df['Review_Length']
print('Maximum Review Length: ', review_lengths.max())
print('Minimum Review Length: ', review_lengths.min())
print('Average Review Length: ', review_lengths.mean())

In [None]:

# Lemmatizing the words in each review using Spacy

def lemmatize_text(text):
    """Return ``text`` as a space-joined string of spaCy lemmas, minus stopwords.

    Uses the notebook-level ``nlp`` pipeline and ``stopwords`` set.

    Parameters
    ----------
    text : str
        A preprocessed review.

    Returns
    -------
    str
        Lemmas of the tokens that are not stopwords, separated by single spaces.
    """
    doc = nlp(text)
    # Compare strings, not Token objects: a spaCy Token is never a member of a
    # set of strings, so `token not in stopwords` kept every token. A token is
    # dropped when either its surface form or its lemma is a stopword, so no
    # stopword can appear in the output.
    kept_lemmas = [
        token.lemma_
        for token in doc
        if token.text.lower() not in stopwords
        and token.lemma_.lower() not in stopwords
    ]
    return " ".join(kept_lemmas)

df['Cleaned_Review'] = df['Review'].apply(lemmatize_text)


In [None]:
# Word count of each lemmatized review (Cleaned_Review), stored as a new column

df['Cleaned_Review_Length'] = df['Cleaned_Review'].map(lambda review: len(review.split()))

cleaned_lengths = df['Cleaned_Review_Length']
print('Maximum Review Length: ', cleaned_lengths.max())
print('Minimum Review Length: ', cleaned_lengths.min())
print('Average Review Length: ', cleaned_lengths.mean())

In [None]:
# Preview the frame with the new text columns.
# NOTE(review): if this line shares a cell with the plotting code below, it is
# not the cell's last expression and will not render -- confirm the cell split.
df.head()

# Distribution of each numerical column after imputing the missing values.
# The grid is sized from the number of columns so that zip() cannot silently
# drop any of them; two plots per row, as before.

n_rows = max(1, (len(num_col) + 1) // 2)
fig , ax = plt.subplots(n_rows, 2, figsize=(15, 5 * n_rows), squeeze=False)

for col, subplot in zip(num_col, ax.flatten()):
    # histplot(kde=True, stat='density') replaces the deprecated sns.distplot
    sns.histplot(df[col], kde=True, stat='density', ax=subplot)
for spare in ax.flatten()[len(num_col):]:
    spare.set_visible(False)  # hide the empty panel of an odd-sized grid
plt.show()

# Count plot of each categorical column after imputing the missing values.
# Columns dropped during cleaning are skipped rather than raising a KeyError.

plot_cols = [col for col in cat_col if col in df.columns]
n_rows = max(1, (len(plot_cols) + 1) // 2)
fig , ax = plt.subplots(n_rows, 2, figsize=(20, 5 * n_rows), squeeze=False)

for col, subplot in zip(plot_cols, ax.flatten()):
    sns.countplot(x=col, data=df, ax=subplot, palette='CMRmap_r')
    if col in ['Airline','Country']:
        # Long tick labels: rotate them so they do not overlap
        for label in subplot.get_xticklabels():
            label.set_rotation(90)
for spare in ax.flatten()[len(plot_cols):]:
    spare.set_visible(False)
plt.show()

In [None]:
# Write the processed frame to 'airline_df_cleaned.csv' without the index column
df.to_csv('airline_df_cleaned.csv',index=False)