<a href="https://colab.research.google.com/github/AliceKitchkin/Unsupervised-Machine-Learning/blob/main/AirlinePassengerSatisfaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. Bibliotheken und Daten Import

---

In [1]:
# handle table-like data and matrices
import pandas as pd
import numpy as np

# visualisation
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# preprocessing
from sklearn.preprocessing import StandardScaler

# pca
from sklearn.decomposition import PCA

# clustering
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans, AgglomerativeClustering

# evaluations
from sklearn.metrics import confusion_matrix

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below — consider a narrower filter.
import warnings
warnings.filterwarnings('ignore')

# Show all columns when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [2]:
# Mount Google Drive at /content/drive so the CSV below can be read (Colab runtime only)
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: ignored

In [None]:
# Read the CSV from the mounted Drive; data_raw stays untouched, data is the working copy
CSV_PATH = "/content/drive/MyDrive/Bachelorarbeit/CSV/Airline Passenger Satisfaction/AirlinePassengerData_Test.csv"
data_raw = pd.read_csv(CSV_PATH)
data = data_raw.copy(deep=True)

#2. Aufbereitung

---

##2.1 First Look

In [None]:
# First five rows of the raw data (5 is the default row count of head)
data_raw.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data
data_raw.info()

In [None]:
# (rows, columns) of the raw data
data_raw.shape

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
data_raw.describe(include = 'all')

##2.2 Missing Values

Missingno is a Python library that provides the ability to understand the distribution of missing values through informative visualizations.

In [None]:
# Number of missing values per column
missing_per_column = data_raw.isnull().sum()
print(missing_per_column)

In [None]:
# Visualise where missing values occur across the rows of the raw data
msno.matrix(data_raw)

Die Spalte "Arrival Delay in Minutes" (Ankunftsverspätung in Minuten) hat 83 fehlende Werte.

In [None]:
# Remove every row that contains at least one missing value, then confirm none remain
data = data.dropna(axis=0, how='any')
data.isnull().sum()

##2.3 Spaltenaufarbeitung

In [None]:
# Add up the 14 individual service ratings into one overall score per passenger
rating_columns = [
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]
# skipna=False mirrors plain "+" addition: a missing rating yields a missing sum
data['GesamtzufriedenheitSummiert'] = data[rating_columns].sum(axis=1, skipna=False)

In [None]:
# Add the new column "FlugverspatungInMinuten": arrival delay minus departure delay.
# A positive value means additional delay was accumulated during the flight.
# A negative value means time was made up during the flight.
data['FlugverspatungInMinuten'] = data['Arrival Delay in Minutes'] - data['Departure Delay in Minutes']

In [None]:
# 1 when the arrival delay equals the departure delay, otherwise 0.
# NOTE(review): the column name suggests "punctual", yet a flight that is equally late at
# departure and arrival is flagged as well — confirm this is the intended definition.
delay_unchanged = data['Arrival Delay in Minutes'] == data['Departure Delay in Minutes']
data['IstPuenktlich'] = delay_unchanged.astype(int)

In [None]:
# Remove the first column "Unnamed: 0"; it is only a row counter
data = data.drop(columns=["Unnamed: 0"])

In [None]:
# Remove the "id" column
data = data.drop(columns=["id"])

In [None]:
# 1 for passengers younger than 13, otherwise 0
data['IstKind'] = data['Age'].lt(13).astype(int)

In [None]:
# 1 for passengers aged 18 or older, otherwise 0
data['IstErwachsener'] = data['Age'].ge(18).astype(int)

In [None]:
# 1 for passengers older than 12 and younger than 18, otherwise 0
is_teenager = data['Age'].gt(12) & data['Age'].lt(18)
data['IstTeenager'] = is_teenager.astype(int)

In [None]:
# Rename the original English column names to the German names used in the rest of the notebook
german_column_names = {
    "id": "id",
    "Gender": "Geschlecht",
    "Customer Type": "PassagierTyp",
    "Age": "Alter",
    "Type of Travel": "Reisetyp",
    "Class": "Klasse",
    "Flight Distance": "Flugdistanz",
    "Inflight wifi service": "InternetAufFlug",
    "Departure/Arrival time convenient": "Abreisezeit_Ankunftszeit_Bequemlichkeit",
    "Ease of Online booking": "LeichtigkeitDerOnlineBuchung",
    "Gate location": "StandortGate",
    "Food and drink": "EssenTrinken",
    "Online boarding": "OnlineBoarding",
    "Seat comfort": "Sitzkomfort",
    "Inflight entertainment": "Flugentertainment",
    "On-board service": "OnboradingService",
    "Leg room service": "Beinfreiheit",
    "Baggage handling": "Gepaeckumgang",
    "Checkin service": "CheckinService",
    "Inflight service": "Flugservice",
    "Cleanliness": "Sauberkeit",
    "Departure Delay in Minutes": "AbflugverspaetungInMinuten",
    "Arrival Delay in Minutes": "GesamtverspaetungInMinuten",
    "satisfaction": "IstZufrieden",
}
data = data.rename(columns=german_column_names)

In [None]:
# First five rows after the column clean-up and renaming
data.head(5)

##2.4 Check for Unique Values

In [None]:
# Print the distinct values of every column, one array per column
for _column_name, column_values in data.items():
    print(column_values.unique())

##2.5 Ausreißer erkunden

In [None]:
# Work on an independent copy so outliers can be removed step by step
data_ohneAusreisser = data.copy(deep=True)

###2.5.1 Alter

In [None]:
# Distribution of passenger age: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot; bins=50 keeps the bin count bounded.
plt.figure(figsize=(13, 8))
sns.histplot(data_ohneAusreisser['Alter'], kde=True, stat='density', bins=50, color='purple');

In [None]:
# Notched box plot of age with mean marker and suspected outliers highlighted
box_trace = go.Box(
    y=data_ohneAusreisser['Alter'],
    name='Alter',
    notched=True,
    boxmean=True,
    boxpoints='suspectedoutliers',
    marker_color='#6699ff',
)

# The single trace goes into the middle cell of a 1x3 grid
fig = make_subplots(rows=1, cols=3)
fig.add_trace(box_trace, row=1, col=2)
fig.update_layout(title_text='Boxplot für das Alter')
fig.show()

###2.5.2 Flugdistanz

####2.5.2.1 Mit Ausreißer

In [None]:
# Distribution of flight distance before outlier removal: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot; bins=50 keeps the bin count bounded.
plt.figure(figsize=(13, 8))
sns.histplot(data['Flugdistanz'], kde=True, stat='density', bins=50, color='purple');

In [None]:
# Notched box plot of flight distance (outliers still included)
box_trace = go.Box(
    y=data['Flugdistanz'],
    name='Flugdistanz',
    notched=True,
    boxmean=True,
    boxpoints='suspectedoutliers',
    marker_color='#6699ff',
)

# The single trace goes into the middle cell of a 1x3 grid
fig = make_subplots(rows=1, cols=3)
fig.add_trace(box_trace, row=1, col=2)
fig.update_layout(title_text='Boxplot für die Flugdistanz')
fig.show()

####2.5.2.2 Ohne Ausreißer

In [None]:
# Remove every row whose flight distance exceeds 4000
distance_too_large = data_ohneAusreisser['Flugdistanz'] > 4000
data_ohneAusreisser = data_ohneAusreisser.loc[~distance_too_large]

In [None]:
# Distribution of flight distance after outlier removal: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot; bins=50 keeps the bin count bounded.
plt.figure(figsize=(13, 8))
sns.histplot(data_ohneAusreisser['Flugdistanz'], kde=True, stat='density', bins=50, color='purple');

In [None]:
# Notched box plot of flight distance after removing the distance outliers
box_trace = go.Box(
    y=data_ohneAusreisser['Flugdistanz'],
    name='Flugdistanz',
    notched=True,
    boxmean=True,
    boxpoints='suspectedoutliers',
    marker_color='#6699ff',
)

# The single trace goes into the middle cell of a 1x3 grid
fig = make_subplots(rows=1, cols=3)
fig.add_trace(box_trace, row=1, col=2)
fig.update_layout(title_text='Boxplot für die Flugdistanz ohne Ausreißer')
fig.show()

###2.5.3 Gesamtverspätung

####2.5.3.1 Mit Ausreißer

In [None]:
# Distribution of the arrival delay before its outliers are removed: density histogram with KDE.
# histplot replaces the deprecated sns.distplot; bins=50 keeps the bin count bounded.
plt.figure(figsize=(13, 8))
sns.histplot(data_ohneAusreisser['GesamtverspaetungInMinuten'], kde=True, stat='density', bins=50, color='purple');

In [None]:
# Notched box plot of the arrival delay (delay outliers still included)
box_trace = go.Box(
    y=data_ohneAusreisser['GesamtverspaetungInMinuten'],
    name='GesamtverspaetungInMinuten',
    notched=True,
    boxmean=True,
    boxpoints='suspectedoutliers',
    marker_color='#6699ff',
)

# The single trace goes into the middle cell of a 1x3 grid
fig = make_subplots(rows=1, cols=3)
fig.add_trace(box_trace, row=1, col=2)
fig.update_layout(title_text='Boxplot für die Gesamtverspätung mit Ausreißer')
fig.show()

####2.5.3.2 Ohne Ausreißer

In [None]:
# Remove every row whose arrival delay exceeds 300 minutes
delay_too_large = data_ohneAusreisser['GesamtverspaetungInMinuten'] > 300
data_ohneAusreisser = data_ohneAusreisser.loc[~delay_too_large]

In [None]:
# Distribution of the arrival delay after outlier removal: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot; bins=50 keeps the bin count bounded.
plt.figure(figsize=(13, 8))
sns.histplot(data_ohneAusreisser['GesamtverspaetungInMinuten'], kde=True, stat='density', bins=50, color='purple');

In [None]:
# Notched box plot of the arrival delay after removing the delay outliers
box_trace = go.Box(
    y=data_ohneAusreisser['GesamtverspaetungInMinuten'],
    name='GesamtverspaetungInMinuten',
    notched=True,
    boxmean=True,
    boxpoints='suspectedoutliers',
    marker_color='#6699ff',
)

# The single trace goes into the middle cell of a 1x3 grid
fig = make_subplots(rows=1, cols=3)
fig.add_trace(box_trace, row=1, col=2)
fig.update_layout(title_text='Boxplot für die Gesamtverspätung ohne Ausreißer')
fig.show()

###2.5.4 Punkte Zufriedenheit summiert

In [None]:
# Distribution of the summed satisfaction score: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot; bins=50 keeps the bin count bounded.
plt.figure(figsize=(13, 8))
sns.histplot(data_ohneAusreisser['GesamtzufriedenheitSummiert'], kde=True, stat='density', bins=50, color='purple');

In [None]:
# Notched box plot of the summed satisfaction score
box_trace = go.Box(
    y=data_ohneAusreisser['GesamtzufriedenheitSummiert'],
    name='GesamtzufriedenheitSummiert',
    notched=True,
    boxmean=True,
    boxpoints='suspectedoutliers',
    marker_color='#6699ff',
)

# The single trace goes into the middle cell of a 1x3 grid
fig = make_subplots(rows=1, cols=3)
fig.add_trace(box_trace, row=1, col=2)
fig.update_layout(title_text='Boxplot für die Gesamtzufriedenheit summiert')
fig.show()

#3. Analyse

---

##3.1 Kategorienverteilung checken

In [None]:
# Names of all columns stored with object dtype
categorical = [
    column_name
    for column_name, column_dtype in data_ohneAusreisser.dtypes.items()
    if column_dtype == 'O'
]
categorical

In [None]:
# Share of each category value relative to the total number of rows.
# np.float was removed from NumPy (1.24); Python 3 "/" already performs true division.
total_rows = len(data_ohneAusreisser)
for var in categorical:
    print(data_ohneAusreisser[var].value_counts() / total_rows)
    print("\n\n")

##3.2 Korrelation

In [None]:
# Pairwise correlation of the numeric columns only; the frame still contains text columns,
# on which DataFrame.corr() raises in pandas >= 2.0.
corrmat = data_ohneAusreisser.select_dtypes(include='number').corr()

plt.figure(figsize=(24, 13))
sns.heatmap(corrmat, annot=True, cmap='RdBu', center=0, vmin=-1, vmax=1)
plt.title('Correlation Heatmap', fontsize=14);

In [None]:
# Departure delay against arrival delay, one point per passenger record
plt.figure(figsize=(13, 8))
sns.scatterplot(
    data=data_ohneAusreisser,
    x='AbflugverspaetungInMinuten',
    y='GesamtverspaetungInMinuten',
    color='#cc0000',
);

#4. Clustern Vorbereitung

##4.1 Spalten numerisch machen

In [None]:
# Independent copy of the outlier-free data under a new name, to be made fully numeric
data_numerisch = data_ohneAusreisser.copy(deep=True)

In [None]:
# Encode the text columns as integers. This step was commented out, which left string
# columns in the frame and made the StandardScaler step further down fail.
category_codes = {
    'Geschlecht': {'Female': 0, 'Male': 1},
    'IstZufrieden': {'satisfied': 1, 'neutral or dissatisfied': 0},
    'PassagierTyp': {'Loyal Customer': 1, 'disloyal Customer': 0},
    'Klasse': {'Business': 0, 'Eco': 1, 'Eco Plus': 2},
    'Reisetyp': {'Business travel': 0, 'Personal Travel': 1},
}

# Always encode from the un-encoded source frame so re-running this cell gives the same result
for column, codes in category_codes.items():
    encoded = data_ohneAusreisser[column].map(codes)
    if encoded.isna().any():
        raise ValueError(f"Column '{column}' contains values without a numeric code")
    data_numerisch[column] = encoded.astype(int)

In [None]:
# First five rows of the frame prepared for clustering
data_numerisch.head(5)

In [None]:
# dtype of every column; all must be numeric before scaling
data_numerisch.dtypes

##4.2 Nicht benötigte Spalten entfernen

In [None]:
# Independent copy under the name used by the scaling and clustering steps
data_final = data_numerisch.copy(deep=True)

In [None]:
# First five rows of the final feature frame
data_final.head(5)

##4.3 Feature Scaling

In [None]:
# Standardise every feature and wrap the result in a DataFrame with the original column names
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_final)
data_scaled = pd.DataFrame(scaled_values, columns=data_final.columns)

In [None]:
# First five rows of the standardised features
data_scaled.head(5)

##4.4 Dimensionsreduktion

###4.4.1 PCA mit 28 Features

In [None]:
# Fit a PCA that keeps all components and plot the cumulative explained variance
pca28 = PCA()
pca28.fit(data_scaled)

cumulative_variance = pca28.explained_variance_ratio_.cumsum()
# Derive the x axis from the fitted model instead of a hard-coded range, so the plot
# still works when the number of feature columns changes
component_numbers = range(1, len(cumulative_variance) + 1)

# Author's observation: about 50% of the features account for about 80% of the variance
plt.figure(figsize=(12, 9))
plt.plot(component_numbers, cumulative_variance, marker='o', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance');

###4.4.2 PCA mit 17 & 20 Features (90-95% meiner Varianz)

In [None]:
# PCA restricted to 17 components
pca17 = PCA(n_components=17)
pca17.fit(data_scaled)

# One row per component, one column per original feature; row labels are generated
# from the fitted component count instead of a hand-typed list
df_pca17_components = pd.DataFrame(
    data=pca17.components_.round(4),
    columns=data_scaled.columns.values,
    index=[f'component {i}' for i in range(1, pca17.n_components_ + 1)],
)

# The heatmap shows the weight of each feature in each component (not correlations)
plt.figure(figsize=(24, 6))
sns.heatmap(df_pca17_components, annot=True, cmap='RdBu', center=0, vmin=-1, vmax=1)
plt.title('PCA Component Weights (17 Components)', fontsize=14);

In [None]:
# PCA restricted to 20 components
pca20 = PCA(n_components=20)
pca20.fit(data_scaled)

# One row per component, one column per original feature; row labels are generated
# from the fitted component count instead of a hand-typed list
df_pca20_components = pd.DataFrame(
    data=pca20.components_.round(4),
    columns=data_scaled.columns.values,
    index=[f'component {i}' for i in range(1, pca20.n_components_ + 1)],
)

# The heatmap shows the weight of each feature in each component (not correlations)
plt.figure(figsize=(24, 6))
sns.heatmap(df_pca20_components, annot=True, cmap='RdBu', center=0, vmin=-1, vmax=1)
plt.title('PCA Component Weights (20 Components)', fontsize=14);

###4.4.3 PCA mit 9-12 Features (70-80% meiner Varianz)

In [None]:
# PCA restricted to 12 components
pca12 = PCA(n_components=12)
pca12.fit(data_scaled)

# One row per component, one column per original feature; row labels are generated
# from the fitted component count instead of a hand-typed list
df_pca12_components = pd.DataFrame(
    data=pca12.components_.round(4),
    columns=data_scaled.columns.values,
    index=[f'component {i}' for i in range(1, pca12.n_components_ + 1)],
)

# The heatmap shows the weight of each feature in each component (not correlations)
plt.figure(figsize=(24, 6))
sns.heatmap(df_pca12_components, annot=True, cmap='RdBu', center=0, vmin=-1, vmax=1)
plt.title('PCA Component Weights (12 Components)', fontsize=14);

In [None]:
# PCA restricted to 11 components
pca11 = PCA(n_components=11)
pca11.fit(data_scaled)

# One row per component, one column per original feature; row labels are generated
# from the fitted component count instead of a hand-typed list
df_pca11_components = pd.DataFrame(
    data=pca11.components_.round(4),
    columns=data_scaled.columns.values,
    index=[f'component {i}' for i in range(1, pca11.n_components_ + 1)],
)

# The heatmap shows the weight of each feature in each component (not correlations)
plt.figure(figsize=(24, 6))
sns.heatmap(df_pca11_components, annot=True, cmap='RdBu', center=0, vmin=-1, vmax=1)
plt.title('PCA Component Weights (11 Components)', fontsize=14);

In [None]:
# PCA restricted to 10 components
pca10 = PCA(n_components=10)
pca10.fit(data_scaled)

# One row per component, one column per original feature; row labels are generated
# from the fitted component count instead of a hand-typed list
df_pca10_components = pd.DataFrame(
    data=pca10.components_.round(4),
    columns=data_scaled.columns.values,
    index=[f'component {i}' for i in range(1, pca10.n_components_ + 1)],
)

# The heatmap shows the weight of each feature in each component (not correlations)
plt.figure(figsize=(24, 6))
sns.heatmap(df_pca10_components, annot=True, cmap='RdBu', center=0, vmin=-1, vmax=1)
plt.title('PCA Component Weights (10 Components)', fontsize=14);

In [None]:
# PCA restricted to 9 components
pca9 = PCA(n_components=9)
pca9.fit(data_scaled)

# One row per component, one column per original feature; row labels are generated
# from the fitted component count instead of a hand-typed list
df_pca9_components = pd.DataFrame(
    data=pca9.components_.round(4),
    columns=data_scaled.columns.values,
    index=[f'component {i}' for i in range(1, pca9.n_components_ + 1)],
)

# The heatmap shows the weight of each feature in each component (not correlations)
plt.figure(figsize=(24, 6))
sns.heatmap(df_pca9_components, annot=True, cmap='RdBu', center=0, vmin=-1, vmax=1)
plt.title('PCA Component Weights (9 Components)', fontsize=14);

#5. Clustern mit K-Means

---

##5.1 Elbow method

In [None]:
# Elbow method with k=20 (author's note: 20 components = 95% of the variance).
# NOTE(review): k limits the cluster counts that are tried, and the fit uses data_scaled
# rather than a PCA projection — confirm whether the PCA-reduced data was meant here.
# random_state is fixed so the curve is reproducible between runs.
Elbow_M = KElbowVisualizer(KMeans(random_state=42), k=20)
Elbow_M.fit(data_scaled)
Elbow_M.show();

In [None]:
# Elbow method with k=17 (author's note: 17 components = 90% of the variance).
# NOTE(review): k limits the cluster counts that are tried, and the fit uses data_scaled
# rather than a PCA projection — confirm whether the PCA-reduced data was meant here.
# random_state is fixed so the curve is reproducible between runs.
Elbow_M = KElbowVisualizer(KMeans(random_state=42), k=17)
Elbow_M.fit(data_scaled)
Elbow_M.show();

##5.2 K-Means

In [None]:
# Fit K-Means with 8 clusters on the scaled features, then express every row in
# cluster-distance space (one column per cluster centre)
kmeans = KMeans(n_clusters=8, init='k-means++', random_state=42)
kmeans.fit(data_scaled)
df = kmeans.transform(data_scaled)

In [None]:
# New frame: the scaled features plus the cluster label assigned to each row
data_kmeans = data_scaled.assign(Clusters=kmeans.labels_)

In [None]:
# Mean of every scaled feature per cluster, rounded to three decimals
cluster_means = data_kmeans.groupby('Clusters').mean()
data_analysis = cluster_means.round(3)
data_analysis

In [None]:
# Cluster label of every row, read from the already fitted model.
# The previous version refitted the model and printed an undefined name (`label`).
labels = kmeans.labels_
print(labels)

# Distinct cluster labels
u_labels = np.unique(labels)
print(u_labels)

In [None]:
# Scatter plot of the rows in cluster-distance space, one colour per cluster.
# df holds the output of kmeans.fit_transform; columns 1 and 3 are plotted.
cluster_labels = kmeans.labels_
for i in np.unique(cluster_labels):
    in_cluster = cluster_labels == i
    plt.scatter(df[in_cluster, 1], df[in_cluster, 3], label=i)
plt.xlabel('Distance to centre of cluster 1')
plt.ylabel('Distance to centre of cluster 3')
plt.legend()
plt.show()

#6. Hierarchical Clustering

---