In [171]:
# Load dependencies
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

from ast import literal_eval

import plotly
import plotly.express as px

# Set sentiment extensions
# NOTE(review): SentimentIntensityAnalyzer and Doc are not imported in the
# visible cells -- this cell only runs if the kernel already provides them.
sent_analyzer = SentimentIntensityAnalyzer()


def sentiment_scores(docx):
    """Return ``sent_analyzer``'s polarity scores for the text of ``docx``."""
    raw_text = docx.text
    return sent_analyzer.polarity_scores(raw_text)


# Register the function above as the getter of a "sentimenter" extension on Doc
Doc.set_extension("sentimenter", getter=sentiment_scores, force=True)

In [166]:
# Load the raw data set; the first CSV column is used as the row index
df = pd.read_csv('data_airlinequality.csv', index_col=0)

In [90]:
# Inspect which columns the loaded frame provides
df.columns

Index(['Aircraft', 'Cabin Staff Service', 'Date Flown', 'Food & Beverages',
       'Ground Service', 'Inflight Entertainment', 'Recommended', 'Route',
       'Seat Comfort', 'Seat Type', 'Type Of Traveller', 'Unnamed: 0.1',
       'Value For Money', 'Wifi & Connectivity', 'company_name', 'country',
       'date', 'global Food & Beverages', 'global Inflight Entertainment',
       'global Seat Comfort', 'global Staff Service', 'global Value for Money',
       'review', 'title', 'verification'],
      dtype='object')

# What contributes to customer satisfaction?

# Input:

## Trip type:
* Flight Length: Short-haul flight < 1000 km; long-haul flight >= 4000 km; middle-haul flight from 1000 km up to (but not including) 4000 km
* Seat type: 'Economy Class', 'Business Class', 'Premium Economy', 'First Class'
* Type Of Traveller: 'Solo Leisure', 'Business', 'Family Leisure', 'Couple Leisure'
* Whether there is a stop or not

## Personal-wise ratings:
* Seat Comfort: 1-5
* Cabin Staff Service: 1-5
* Ground Service: 1-5
* Value For Money: 1-5

## Plane-wise ratings:
* Food & Beverages: 1-5
* Inflight Entertainment: 1-5

# Output:
* Recommended: True or False

# The full model

In [469]:
# Work on an independent copy: parse_route adds columns in place, so a plain
# `df2 = df` alias would also modify the loaded frame and make re-running
# the following cells depend on earlier runs.
df2 = df.copy()

In [470]:
# Some helper functions

# Parse From, To, Stop
def parse_route(df):
    """Split the ``Route`` column into ``From``, ``To`` and ``Stop`` columns.

    A route is split on the separators ``" to "`` and ``" via "``, so
    ``"A to B via C"`` yields ``From="A"``, ``To="B"`` and ``Stop="C"``.
    The three columns are added to ``df`` in place and the same frame is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Route``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself with ``From``, ``To`` and ``Stop`` added. A part that
        does not exist (no stop-over, missing route) is left as a missing
        value.
    """
    def extract_from_a_list(li, i):
        # IndexError: the route has fewer parts than requested.
        # TypeError: the route was missing, so ``li`` is NaN/None, not a list.
        try:
            return li[i]
        except (IndexError, TypeError):
            return None

    route_parts = df['Route'].str.split(' to | via ')
    for position, column in enumerate(('From', 'To', 'Stop')):
        # Bind ``position`` as a default so each lambda keeps its own index
        df[column] = route_parts.map(
            lambda parts, position=position: extract_from_a_list(parts, position)
        )
    return df

# Lookup table indexed by its 'cities' column; `dis` below reads the
# 'geocode' column of this table and parses each value with literal_eval
cities = pd.read_csv('cities.csv', index_col='cities')

# Calculate distances 
def dis(c):
    """Return the distance in kilometres between a route's two end points.

    Parameters
    ----------
    c : sequence or pandas.Series
        Origin and destination, in that order (for example one row of
        ``df[['From', 'To']]``). Both are looked up in the index of the
        module-level ``cities`` table.

    Returns
    -------
    float or None
        The distance in kilometres, or ``None`` when an end point is missing
        from ``cities`` or its ``geocode`` value cannot be parsed.
    """
    # NOTE(review): `distance` is not imported in the visible cells
    # (presumably geopy.distance -- confirm). Only data problems are caught
    # below, so a missing import now raises NameError instead of silently
    # producing None for every row.
    try:
        # Unpack by position rather than c[0] / c[1]: integer keys on a
        # label-indexed row ('From', 'To') are deprecated in pandas, and the
        # resulting KeyError used to be swallowed by a bare except.
        origin, destination = list(c)[:2]
        lo_from = literal_eval(cities.loc[origin, 'geocode'])
        lo_to = literal_eval(cities.loc[destination, 'geocode'])
        return distance.distance(lo_from, lo_to).kilometers
    except (KeyError, TypeError, ValueError, SyntaxError):
        # Unknown city, missing/unparseable geocode, or too few end points
        return None

# Categorize flights by distance
def cat_dis(d, short_max=1000, long_min=4000):
    """Classify a flight distance as short-, middle- or long-haul.

    Parameters
    ----------
    d : float or None
        Flight distance, in the same unit as the thresholds (kilometres
        with the defaults).
    short_max : float, optional
        Distances strictly below this value are ``'Short-Haul'``.
    long_min : float, optional
        Distances at or above this value are ``'Long-Haul'``.

    Returns
    -------
    str or None
        ``'Short-Haul'``, ``'Middle-Haul'`` or ``'Long-Haul'``; ``None`` when
        ``d`` is ``None`` or NaN.
    """
    # NaN is the only value that is not equal to itself. Without this guard
    # every comparison below is False and a missing distance would be
    # labelled 'Long-Haul'.
    if d is None or d != d:
        return None
    if d < short_max:
        return 'Short-Haul'
    if d < long_min:
        return 'Middle-Haul'
    return 'Long-Haul'

# A helper function to coerce a rating value to the 1-5 scale
def corecer(text):
    """Coerce a raw rating to a float on the 1-5 scale.

    Parameters
    ----------
    text : object
        Raw rating value, e.g. a number or a numeric string.

    Returns
    -------
    float or None
        ``float(text)`` when it lies between 1 and 5 inclusive; ``None`` when
        the value cannot be converted to a float or is outside that range.
    """
    try:
        rating = float(text)
    except (TypeError, ValueError, OverflowError):
        # Not a number (None, free text, ...) -> treat as a missing rating
        return None
    # NaN fails both comparisons, so it is rejected with out-of-range values
    return rating if 1 <= rating <= 5 else None

In [471]:
# Split each route into its end points, then measure the distance between them
df2 = parse_route(df2)
df2['Distance'] = df2.loc[:, ['From', 'To']].apply(dis, axis=1)

# Bucket every known distance into a haul category; missing distances are skipped
df2['Flight Length'] = df2['Distance'].map(cat_dis, na_action='ignore')

# Flag routes that include a stop-over: 1 when 'Stop' is present, otherwise 0
df2['Is_stop'] = df2['Stop'].notna().astype('int64')

In [472]:
# Prepare the dataframe: keep only the model inputs and the target column
cols = ['Seat Comfort', 'Cabin Staff Service', 'Ground Service',
        'Value For Money', 'Food & Beverages', 'Inflight Entertainment',
        'Distance', 'Is_stop', 'Seat Type', 'Type Of Traveller', 'Flight Length']

# .copy() makes df2 an independent frame, so the column assignments in the
# following cells modify it directly instead of acting on a slice of the
# wider frame (the corresponding pandas warning is silenced at the top).
df2 = df2[cols + ['Recommended']].copy()

In [473]:
# Coerce every rating column to the 1-5 scale; values corecer rejects become missing
cols_15 = [
    'Seat Comfort',
    'Cabin Staff Service',
    'Ground Service',
    'Value For Money',
    'Food & Beverages',
    'Inflight Entertainment',
]
for rating_col in cols_15:
    df2[rating_col] = df2[rating_col].map(corecer)

# Encode the output column as 1 ('yes') / 0 ('no')
df2['Recommended'] = df2['Recommended'].map(
    {
        'yes': 1,
        'no': 0,
    }
)

In [467]:
# Save the prepared frame without its row index. index=False is the
# documented boolean form, rather than relying on None being treated as falsy.
df2.to_csv('dashboard_airlinequality.csv', index=False)

In [308]:
# Separate our data set (that's where we are going to make a dashboard)

In [474]:
# Values offered for each categorical filter
li_seat_type = [
    'Economy Class',
    'Business Class',
    'Premium Economy',
    'First Class',
]
li_type_of_traveller = [
    'Solo Leisure',
    'Business',
    'Family Leisure',
    'Couple Leisure',
]
li_flight_length = [
    'Short-Haul',
    'Middle-Haul',
    'Long-Haul',
]

In [476]:
# Segment to analyse: one list of accepted values per filter
seat_type = ['Economy Class']
type_of_travaller = ['Solo Leisure']
# Is_stop holds 0/1 integers; False is expected to match the 0 rows -- TODO confirm
stop = [False]
flight_length = ['Short-Haul']

mask = (
    df2['Seat Type'].isin(seat_type)
    & df2['Type Of Traveller'].isin(type_of_travaller)
    & df2['Is_stop'].isin(stop)
    & df2['Flight Length'].isin(flight_length)
)
df3 = df2[mask]

# Fill nans with the segment mean of each numeric column. numeric_only=True is
# needed because the frame also holds text columns (Seat Type, Type Of
# Traveller, Flight Length); recent pandas raises on those instead of
# skipping them.
# NOTE(review): the means are computed before cross-validation, so the
# imputation sees the validation folds -- consider imputing inside a pipeline.
df3 = df3.fillna(df3.mean(numeric_only=True))

# Make X and y
X = df3[cols_15]
y = df3['Recommended']

In [477]:
# Cross-validate a logistic regression on the selected segment
from sklearn import linear_model
from sklearn.model_selection import cross_validate

model = linear_model.LogisticRegression(C=0.05, solver='lbfgs')

# Print the per-fold timings plus train and test scores for the 5 folds
print(
    cross_validate(
        model,
        X,
        y,
        cv=5,
        return_train_score=True,
    )
)

{'fit_time': array([0.01099563, 0.00699663, 0.00899911, 0.00699592, 0.01099348]), 'score_time': array([0.00099945, 0.00199866, 0.00099516, 0.00099969, 0.0019989 ]), 'test_score': array([0.92592593, 0.92592593, 0.92592593, 0.94776119, 0.90298507]), 'train_score': array([0.93866171, 0.93866171, 0.93494424, 0.94434137, 0.94990724])}


In [478]:
# Fit on the whole segment and pair each feature with its coefficient,
# ordered from the smallest coefficient to the largest
model.fit(X, y)
df_coef = pd.DataFrame(
    {
        'Feature': list(X.columns),
        'Coef': model.coef_[0].tolist(),
    }
).sort_values(['Coef'])

# Summary statistics: mean test score over the 5 folds and the sample count
model_accuracy = cross_validate(
    model, X, y, cv=5, return_train_score=True
)['test_score'].mean()
n_samples = len(X)
print(model_accuracy, n_samples)

0.9257048092868988 673


In [479]:
# Visualize coefficients
# Interpretation: logistic-regression coefficients are on the log-odds scale.
#                 If the coefficient for Inflight Entertainment is 0.2, one
#                 extra star multiplies the odds of a recommendation by
#                 exp(0.2) = 1.22, i.e. 22% higher odds (not a 22% higher
#                 probability), with the other ratings held constant.
fig = px.bar(
    df_coef,
    x='Feature',
    y='Coef',
    color='Feature',
    # A title and a descriptive axis label let the figure stand on its own
    title='Logistic-regression coefficients by rating',
    labels={'Coef': 'Coefficient (log-odds per star)'},
)
fig.show()