In [None]:
import pandas as pd # Importing pandas for data manipulation
import matplotlib.pyplot as plt # Importing matplotlib for plotting
import seaborn as sns # Importing seaborn for enhanced visualization

# Display option: never truncate the column list when a DataFrame is shown
pd.set_option('display.max_columns', None)

# Read the cleaned dataset into `new_df`, the frame used by the rest of the notebook.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook will
# not run elsewhere until it is made relative/configurable — confirm the project layout.
cleaned_csv_path = "C:\\Users\\USER\\Desktop\\py\\APS_analysis\\data\\processed\\cleaned_data.csv"
new_df = pd.read_csv(cleaned_csv_path)

In [None]:
# Column groups used by the plotting cells below.

# Categorical columns
catg_cols = [
    'gender',
    'type of travel',
    'class',
    'customer type',
    'satisfaction',
]

# Numerical columns
num_cols = [
    'age',
    'flight distance',
    'departure delay in minutes',
    'arrival delay in minutes',
]

# Review (service rating) columns
review_cols = [
    'inflight wifi service',
    'departure/arrival time convenient',
    'ease of online booking',
    'gate location',
    'food and drink',
    'online boarding',
    'seat comfort',
    'inflight entertainment',
    'on board service',
    'leg room service',
    'baggage handling',
    'checkin service',
    'inflight service',
    'cleanliness',
]

In [None]:
# Plotting countplots for categorical columns, three per row.
# The number of rows is derived from the number of columns (ceiling division) instead of
# allocating one row per column: deleted axes still reserve their grid rows, so an
# oversized grid squeezes the real plots into the top of a mostly empty figure.
plots_per_row = 3
n_rows = max(1, -(-len(catg_cols) // plots_per_row))  # ceil(n / plots_per_row), at least 1
fig, axes = plt.subplots(n_rows, plots_per_row, figsize=(10, 5 * n_rows),
                         constrained_layout=True, squeeze=False)  # squeeze=False: always a 2D array
axes = axes.flatten()  # Flatten into a 1D array for easy iteration

for ax, col in zip(axes, catg_cols):  # One subplot per categorical column
    # hue=col with legend=False colours each category without a redundant legend
    sns.countplot(data=new_df, x=col, hue=col, ax=ax, palette="viridis", legend=False)

    # Calculate and print percentages for each category
    print(f"\nStatistics for '{col}'")
    col_percentages = new_df[col].value_counts(normalize=True).mul(100).round(2)  # Percentage of each category
    print(col_percentages)
    print("-" * (25 + len(col)))

# Hide the unused subplots in the last row when the count is not a multiple of plots_per_row
for j in range(len(catg_cols), len(axes)):
    fig.delaxes(axes[j])

plt.show()

In [None]:
# Plotting histograms (with KDE) for numerical columns, two per row.
# Rows are computed with ceiling division rather than one row per column: deleted axes
# still reserve their grid rows, so an oversized grid leaves the lower half of the
# figure blank and compresses the real plots.
plots_per_row = 2
n_rows = max(1, -(-len(num_cols) // plots_per_row))  # ceil(n / plots_per_row), at least 1
fig, axes = plt.subplots(n_rows, plots_per_row, figsize=(10, 5 * n_rows),
                         constrained_layout=True, squeeze=False)  # squeeze=False: always a 2D array
axes = axes.flatten()

for ax, col in zip(axes, num_cols):  # One subplot per numerical column
    sns.histplot(data=new_df, x=col, kde=True, ax=ax, color='red')  # Histogram with KDE line

# Hide the unused subplots in the last row when the count is not a multiple of plots_per_row
for j in range(len(num_cols), len(axes)):
    fig.delaxes(axes[j])

plt.show()

# Print descriptive statistics for all numerical columns
print("\nDescriptive Statistics for Numerical Columns")
print(new_df[num_cols].describe().round(2))
print("-" * 50)

In [None]:
# Create a single large figure with one bar chart per service rating, split by class.
# The grid is five plots wide; the row count follows the number of review columns so the
# cell keeps working if review_cols changes (14 columns -> 3x5 grid, figsize (20, 12)).
plots_per_row = 5
n_rows = max(1, -(-len(review_cols) // plots_per_row))  # ceil(n / plots_per_row), at least 1
fig, axes = plt.subplots(n_rows, plots_per_row, figsize=(20, 4 * n_rows), squeeze=False)
fig.suptitle('Mean Service Ratings by Class', fontsize=22, y=1.0)
axes = axes.flatten()  # Flatten the grid into a 1D array

# Loop through all review columns and plot them
for ax, col in zip(axes, review_cols):
    # Bar height is the mean of `col` per class (seaborn's default estimator); no error bars
    sns.barplot(data=new_df, x='class', y=col, hue='class', ax=ax, palette="deep", errorbar=None)
    ax.set_title(col, fontsize=11)
    ax.set_xlabel('')
    ax.set_ylabel('Mean Rating')

    ax.tick_params(axis='x', rotation=45)

    # Write the mean value in the centre of each bar.
    # The values are mean ratings, not percentages, so no '%' suffix is appended.
    for container in ax.containers:
        ax.bar_label(container, label_type='center', fmt='%.2f', color='black', fontsize=10)

# Hide the unused subplots in the last row when the count is not a multiple of plots_per_row
for j in range(len(review_cols), len(axes)):
    fig.delaxes(axes[j])

fig.subplots_adjust(top=0.93, hspace=0.7, wspace=0.6)
plt.show()

# Tabular version of the same numbers: services as rows, classes as columns
print("\n--- Mean Scores for All Services ---")
all_mean_scores = new_df.groupby('class')[review_cols].mean().round(2)
print(all_mean_scores.T)
print("-" * 50)



In [None]:
# Scatter plot of arrival delay against departure delay, with points coloured and
# marked by satisfaction.
dep_delay_col = 'departure delay in minutes'
arr_delay_col = 'arrival delay in minutes'
delay_cols = [dep_delay_col, arr_delay_col]

fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
sns.scatterplot(
    data=new_df,
    x=dep_delay_col,
    y=arr_delay_col,
    hue='satisfaction',
    style='satisfaction',
    palette="magma",
    ax=ax,
)

ax.set_title("Departure Delay vs Arrival Delay by Satisfaction")
ax.set_xlabel("Departure Delay in Minutes")
ax.set_ylabel("Arrival Delay in Minutes")
plt.show()

# Text summary: correlation between the two delays and mean delay per satisfaction group
print("\n--- Delay Statistics by Satisfaction ---")
delay_corr_matrix = new_df[delay_cols].corr()
delay_corr = delay_corr_matrix.iloc[0, 1]  # off-diagonal entry: departure vs arrival
print(f"Correlation between Departure and Arrival Delay: {delay_corr:.2f}")
print("\nMean Delays (in minutes):")
mean_delays = new_df.groupby('satisfaction')[delay_cols].mean().round(2)
print(mean_delays)
print("-" * 50)

In [None]:
# Correlation heatmap across the numerical and review columns
heatmap_cols = num_cols + review_cols
corr = new_df[heatmap_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8), constrained_layout=True)
sns.heatmap(
    corr,
    annot=False,  # cell values are not drawn, so fmt/annot_kws below have no visible effect
    cmap='coolwarm',
    fmt=".2f",
    annot_kws={"size": 8},
    linewidths=.5,
    cbar_kws={"shrink": .8},
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Share of each satisfaction level within every class, as percentages
# (rows: class, columns: satisfaction level)
satisfaction_share = new_df.groupby('class')['satisfaction'].value_counts(normalize=True)
stacked_data = satisfaction_share.mul(100).round(2).unstack()

# Stacked bar chart: one bar per class, segments are satisfaction levels
ax = stacked_data.plot(
    kind='bar',
    stacked=True,
    figsize=(10, 7),
    colormap="vlag",
    edgecolor='black',
)
ax.set_title("Satisfaction by Class (%)", fontsize=16)
ax.set_ylabel("Percentage of Passengers")
plt.xticks(rotation=0)

# Write the percentage in the middle of every bar segment
for segment_group in ax.containers:
    ax.bar_label(segment_group, label_type='center', fmt='%.1f%%', color='white', weight='bold')

# Legend sits outside the axes on the right; the layout rect leaves room for it
ax.legend(title='Satisfaction', bbox_to_anchor=(1.02, 1), loc='upper left')
ax.figure.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

# Print the percentage table
print("\nSatisfaction Percentage by Class")
print(stacked_data)