In [None]:
"""
Malaysia Airlines Competitive Analysis - Statistical Analysis
===========================================================
Comprehensive statistical testing for competitive positioning assessment.
Scope: Malaysia Airlines vs Qatar Airways, Singapore Airlines, Emirates across 6 service dimensions
Methods: One-way ANOVA, Cohen's d effect sizes, multiple regression (R²=0.832), correlation analysis
Key Findings: All service gaps statistically significant (p<0.05), medium effect sizes vs Qatar Airways
Gap Analysis: Entertainment (-0.96), Food (-0.82), Staff Service (-0.77) priority improvement areas
A380 Analysis: No significant impact on service ratings post-retirement (causal inference validation)
"""

In [None]:
## Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from scipy.stats import f_oneway, ttest_ind, chi2_contingency, pointbiserialr
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from plotly.subplots import make_subplots
import itertools
warnings.filterwarnings('ignore')

In [None]:
## Read cleaned data
# NOTE: the path below is a placeholder; point it at the cleaned reviews CSV
# before running. The later cells read columns such as 'airline',
# 'quality_category' and the service-rating columns from this frame.
df = pd.read_csv(r"CLEAN_CSV PATH HERE") # cleaned_csv PATH here

# Apply one plotting theme and colour palette to every chart produced below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
## Competitive Analysis Setup
def setup_competitive_analysis(df):
    """Restrict the reviews to the four focus airlines and to high-quality rows.

    Prints the per-airline review counts and the share of focus-airline rows
    that survive the quality filter.

    Args:
        df: Review-level DataFrame containing at least the ``airline`` and
            ``quality_category`` columns.

    Returns:
        Tuple ``(df_analysis, focus_airlines, top_3_airlines)``:
        ``df_analysis`` is a copy of the focus-airline rows whose
        ``quality_category`` equals ``'High'``, with an added
        ``competitive_group`` column; ``focus_airlines`` is Malaysia Airlines
        followed by the three competitors; ``top_3_airlines`` is the
        competitor list alone.
    """
    # Define competitive groups based on Skytrax 2024 rankings
    top_3_airlines = ['qatar_airways', 'singapore_airlines', 'emirates']
    focus_airlines = ['malaysia_airlines'] + top_3_airlines

    # Filter to focus airlines
    df_focus = df[df['airline'].isin(focus_airlines)].copy()
    total_focus = len(df_focus)

    print("=== COMPETITIVE ANALYSIS SCOPE ===")
    airline_counts = df_focus['airline'].value_counts()
    for airline in focus_airlines:
        count = airline_counts.get(airline, 0)
        # Guard the share: with no focus-airline rows the division would raise.
        pct = (count / total_focus) * 100 if total_focus else 0.0
        print(f"  {airline:20}: {count:,} reviews ({pct:.1f}%)")

    # Create competitive positioning (Malaysia Airlines vs. the pooled top-3)
    df_focus['competitive_group'] = np.where(
        df_focus['airline'] == 'malaysia_airlines',
        'Malaysia Airlines',
        'Top-3 Average',
    )

    # Quality filtering for reliable analysis
    df_analysis = df_focus[df_focus['quality_category'] == 'High'].copy()
    coverage = len(df_analysis) / total_focus * 100 if total_focus else 0.0

    print("\n=== HIGH QUALITY DATA FOCUS ===")
    print(f"  Total records: {len(df_analysis):,}")
    print(f"  Quality coverage: {coverage:.1f}%")

    return df_analysis, focus_airlines, top_3_airlines

def competitive_descriptive_analysis(df, focus_airlines):
    """Summarise service ratings per airline and Malaysia Airlines' gaps.

    Gaps are reported as ``Malaysia Airlines mean - competitor mean`` so that
    a negative value means Malaysia Airlines is behind. This is the same sign
    convention used by the effect-size analysis, the Qatar gap chart and the
    urgency thresholds in the business summary.

    Args:
        df: Review-level DataFrame with an ``airline`` column and any subset of
            the service-rating columns listed in ``service_cols``.
        focus_airlines: Airline keys to report on; ``'malaysia_airlines'`` is
            the reference carrier, every other entry is treated as a competitor.

    Returns:
        Tuple ``(competitive_summary, gap_analysis)``:
        ``competitive_summary`` has one row per focus airline (Malaysia
        Airlines first) and ``<service>_<stat>`` columns for count, mean, std
        and median, rounded to 2 decimals; ``gap_analysis`` has one row per
        service, one column per competitor plus ``avg_gap``, sorted so the
        largest deficit (most negative ``avg_gap``) comes first.
    """
    # Service rating columns
    service_cols = ['overall_rating', 'seating_comfort', 'staff_service', 'food_quality', 'entertainment', 'value_for_money']
    available_service_cols = [col for col in service_cols if col in df.columns]

    # Competitive performance summary
    print("\n=== COMPETITIVE PERFORMANCE MATRIX ===")

    competitive_summary = df.groupby('airline')[available_service_cols].agg(['count', 'mean', 'std', 'median']).round(2)
    competitive_summary.columns = [f'{col}_{stat}' for col, stat in competitive_summary.columns]

    # Reorder for Malaysia Airlines first; airlines without rows become NaN rows
    competitors = [a for a in focus_airlines if a != 'malaysia_airlines']
    competitive_summary = competitive_summary.reindex(['malaysia_airlines'] + competitors)

    print(competitive_summary)

    # Calculate competitive gaps (Malaysia Airlines minus each competitor)
    print("\n=== MALAYSIA AIRLINES COMPETITIVE GAPS ===")
    mean_cols = [f'{col}_mean' for col in available_service_cols]
    mab_means = competitive_summary.loc['malaysia_airlines', mean_cols].to_numpy(dtype=float)

    gap_analysis = pd.DataFrame(index=available_service_cols)
    for airline in competitors:
        competitor_means = competitive_summary.loc[airline, mean_cols].to_numpy(dtype=float)
        # Negative = Malaysia Airlines rated lower than this competitor.
        gap_analysis[airline] = mab_means - competitor_means

    gap_analysis['avg_gap'] = gap_analysis.mean(axis=1)
    # Most negative first: the services where Malaysia Airlines trails the most.
    gap_analysis = gap_analysis.sort_values('avg_gap', ascending=True)

    print(gap_analysis.round(2))

    # Gap prioritization
    print("\n=== TOP IMPROVEMENT PRIORITIES (Largest gaps) ===")
    for i, (service, row) in enumerate(gap_analysis.head(3).iterrows(), 1):
        print(f"  {i}. {service}: {row['avg_gap']:+.2f} points average gap")

    return competitive_summary, gap_analysis

def travel_class_analysis(df):
    """Compare Malaysia Airlines' Business and Economy class reviews.

    Prints the class distribution, the per-service mean ratings of the two
    classes, their recommendation rates and a derived "strategic focus".

    Args:
        df: Review-level DataFrame with an ``airline`` column; ``travel_class``
            and ``recommended`` are used when present.

    Returns:
        DataFrame of mean service ratings per travel class for Malaysia
        Airlines, or ``None`` when ``travel_class`` is missing or when Business
        and Economy class are not both present.
    """
    business, economy = 'Business Class', 'Economy Class'

    def has_both(index):
        # Both cabin classes must be present for any comparison to make sense.
        return business in index and economy in index

    # Focus on Malaysia Airlines
    mab_rows = df[df['airline'] == 'malaysia_airlines'].copy()

    if 'travel_class' not in df.columns:
        print("Travel class data not available")
        return None

    # Class distribution analysis
    counts_by_class = mab_rows['travel_class'].value_counts()
    share_by_class = (counts_by_class / len(mab_rows)) * 100

    print("\n=== MALAYSIA AIRLINES CLASS DISTRIBUTION ===")
    for class_type, count in counts_by_class.items():
        pct = share_by_class[class_type]
        print(f"  {class_type:15}: {count:4} passengers ({pct:5.1f}%)")

    if not has_both(counts_by_class.index):
        return None

    business_pct = share_by_class[business]
    economy_pct = share_by_class[economy]

    print(f"\n=== CLASS PERFORMANCE ANALYSIS ===")
    print(f"  Business Class: {business_pct:.1f}% of passengers")
    print(f"  Economy Class: {economy_pct:.1f}% of passengers")

    # Performance comparison
    candidate_cols = ['overall_rating', 'seating_comfort', 'staff_service', 'food_quality', 'value_for_money']
    rated_cols = [col for col in candidate_cols if col in mab_rows.columns]
    class_performance = mab_rows.groupby('travel_class')[rated_cols].mean()

    if not has_both(class_performance.index):
        return class_performance

    business_scores = class_performance.loc[business]
    economy_scores = class_performance.loc[economy]
    performance_gap = business_scores - economy_scores

    print(f"\n=== PERFORMANCE BY CLASS (Malaysia Airlines) ===")
    for service in rated_cols:
        b_score = business_scores[service]
        e_score = economy_scores[service]
        gap = performance_gap[service]
        print(f"  {service:15}: Business {b_score:.2f} vs Economy {e_score:.2f} (Gap: {gap:+.2f})")

    # Recommendation rate analysis
    if 'recommended' not in mab_rows.columns:
        return class_performance

    def yes_rate(flags):
        # Percentage of reviews in the group whose flag is exactly 'yes'.
        return (flags == 'yes').sum() / len(flags) * 100

    rec_by_class = mab_rows.groupby('travel_class')['recommended'].apply(yes_rate)
    if not has_both(rec_by_class.index):
        return class_performance

    business_rec = rec_by_class[business]
    economy_rec = rec_by_class[economy]

    print(f"\n=== RECCOMENDATION ANALYSIS ===")
    print(f"  Business Class: {business_pct:.1f}% passengers → {business_rec:.1f}% recommendation rate")
    print(f"  Economy Class: {economy_pct:.1f}% passengers → {economy_rec:.1f}% recommendation rate")

    # Strategic insight: each class's share of passengers times its
    # recommendation rate, expressed as Business' share of the combined total.
    business_impact = (business_pct/100) * (business_rec/100)
    economy_impact = (economy_pct/100) * (economy_rec/100)
    total_impact = business_impact + economy_impact
    business_contribution = (business_impact / total_impact) * 100

    print(f"\n=== STRATEGIC INSIGHT ===")
    print(f"  Business Class Revenue Impact: {business_contribution:.1f}%")
    print(f"  Strategic Focus: {'BUSINESS CLASS' if business_contribution >= 40 else 'ECONOMY VOLUME'}")

    return class_performance

In [None]:
## Statistical Testing & ANOVA
def competitive_anova_analysis(df, focus_airlines):
    """Run a one-way ANOVA of each service rating across the focus airlines.

    For every available service column the non-missing ratings are grouped by
    airline, empty groups are dropped, and ``scipy.stats.f_oneway`` is applied
    when at least two groups remain. Eta-squared (between-group sum of squares
    over total sum of squares) is reported as the effect size.

    Args:
        df: Review-level DataFrame with an ``airline`` column and any subset of
            the service-rating columns listed in ``service_cols``.
        focus_airlines: Airline keys that define the groups being compared.

    Returns:
        Dict mapping service name to a dict with keys ``f_statistic``,
        ``p_value``, ``eta_squared`` and ``significance`` (``'Significant'``
        when p < 0.05). Services with fewer than two non-empty groups are
        omitted.
    """
    # Service columns for analysis
    service_cols = ['overall_rating', 'seating_comfort', 'staff_service', 'food_quality', 'entertainment', 'value_for_money']
    available_service_cols = [col for col in service_cols if col in df.columns]

    anova_results = {}

    print("\n=== ONE-WAY ANOVA (Service x Airline) ===")

    for service in available_service_cols:
        # Filter out missing values, then collect one array per airline
        service_data = df[df[service].notna()]
        groups = [
            service_data.loc[service_data['airline'] == airline, service].to_numpy(dtype=float)
            for airline in focus_airlines
        ]
        groups = [group for group in groups if len(group) > 0]

        if len(groups) < 2:
            print(f"  {service:18}: Insufficient data for ANOVA")
            continue

        f_stat, p_value = f_oneway(*groups)

        # Effect size (eta-squared). The grand mean is computed once rather
        # than once per observation, which made the original quadratic.
        pooled = np.concatenate(groups)
        grand_mean = pooled.mean()
        between_ss = sum(len(group) * (group.mean() - grand_mean) ** 2 for group in groups)
        total_ss = float(((pooled - grand_mean) ** 2).sum())
        eta_squared = between_ss / total_ss if total_ss > 0 else 0

        significance = 'Significant' if p_value < 0.05 else 'Not Significant'
        anova_results[service] = {
            'f_statistic': f_stat,
            'p_value': p_value,
            'eta_squared': eta_squared,
            'significance': significance,
        }

        print(f"  {service:18}: F={f_stat:6.2f}, p={p_value:.4f}, η²={eta_squared:.3f} [{significance}]")

    return anova_results

def effect_size_analysis(df, focus_airlines):
    """Compute Cohen's d between Malaysia Airlines and each competitor.

    For every available service column, the non-missing ratings of Malaysia
    Airlines are compared with those of each competitor using Cohen's d with a
    pooled (ddof=1) standard deviation. A positive d means Malaysia Airlines
    is rated higher.

    Args:
        df: Review-level DataFrame with an ``airline`` column and any subset of
            the service-rating columns listed in ``service_cols``.
        focus_airlines: Airline keys; every entry other than
            ``'malaysia_airlines'`` is treated as a competitor.

    Returns:
        Nested dict ``{competitor: {service: {...}}}`` where the inner dict has
        keys ``cohens_d``, ``interpretation``, ``mab_mean``, ``comp_mean`` and
        ``gap`` (Malaysia Airlines mean minus competitor mean).
        ``interpretation`` is ``'Undefined'`` when d cannot be computed.
    """
    service_cols = ['overall_rating', 'seating_comfort', 'staff_service', 'food_quality', 'value_for_money']
    available_service_cols = [col for col in service_cols if col in df.columns]

    def cohens_d(group1, group2):
        """Return Cohen's d for two samples, or NaN when it is undefined."""
        n1, n2 = len(group1), len(group2)
        # Fewer than three observations in total leaves no degrees of freedom
        # for the pooled variance.
        if n1 == 0 or n2 == 0 or n1 + n2 <= 2:
            return np.nan

        # A single-observation group has no sample variance; it contributes
        # zero weight (n - 1 == 0) to the pooled estimate.
        var1 = np.var(group1, ddof=1) if n1 > 1 else 0.0
        var2 = np.var(group2, ddof=1) if n2 > 1 else 0.0
        pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))

        if pooled_std == 0 or np.isnan(pooled_std):
            return np.nan

        return (np.mean(group1) - np.mean(group2)) / pooled_std

    def interpret_effect_size(d):
        """Map |d| to the conventional verbal label."""
        # NaN fails every '<' comparison, so without this guard an undefined
        # effect size would fall through to "Large".
        if np.isnan(d):
            return "Undefined"
        abs_d = abs(d)
        if abs_d < 0.2:
            return "Negligible"
        if abs_d < 0.5:
            return "Small"
        if abs_d < 0.8:
            return "Medium"
        return "Large"

    print("\n=== MALAYSIA AIRLINES VS TOP-3 EFFECT SIZES ===")

    # Get Malaysia Airlines data
    mab_data = df[df['airline'] == 'malaysia_airlines']

    effect_results = {}

    for competitor in [a for a in focus_airlines if a != 'malaysia_airlines']:
        competitor_data = df[df['airline'] == competitor]

        print(f"\nMalaysia Airlines VS {competitor.replace('_', ' ').title()}")

        effect_results[competitor] = {}

        for service in available_service_cols:
            mab_service = mab_data[service].dropna().to_numpy(dtype=float)
            comp_service = competitor_data[service].dropna().to_numpy(dtype=float)

            if len(mab_service) == 0 or len(comp_service) == 0:
                continue

            effect_size = cohens_d(mab_service, comp_service)
            interpretation = interpret_effect_size(effect_size)
            mab_mean = np.mean(mab_service)
            comp_mean = np.mean(comp_service)
            gap = mab_mean - comp_mean

            effect_results[competitor][service] = {
                'cohens_d': effect_size,
                'interpretation': interpretation,
                'mab_mean': mab_mean,
                'comp_mean': comp_mean,
                'gap': gap,
            }

            if np.isnan(effect_size):
                direction = "?"
            else:
                direction = "↑" if effect_size > 0 else "↓"

            print(f"    {service:18}: d={effect_size:+6.2f} {direction} [{interpretation:10}] (Gap: {gap:+.2f})")

    # Summary of largest gaps requiring attention
    print("\n=== PRIORITY IMPROVEMENT AREAS (Medium+ Effect Sizes) ===")

    all_effects = [
        {
            'competitor': competitor,
            'service': service,
            'effect_size': abs(data['cohens_d']),
            'gap': data['gap'],
            'interpretation': data['interpretation'],
        }
        for competitor, services in effect_results.items()
        for service, data in services.items()
        if not np.isnan(data['cohens_d']) and abs(data['cohens_d']) >= 0.5
    ]

    # Sort by effect size, strongest first
    all_effects.sort(key=lambda x: x['effect_size'], reverse=True)

    for i, effect in enumerate(all_effects[:5], 1):
        comp_name = effect['competitor'].replace('_', ' ').title()
        print(f"  {i}. {effect['service']} vs {comp_name}: d={effect['effect_size']:.2f} ({effect['interpretation']}) Gap: {effect['gap']:+.2f}")

    return effect_results

In [None]:
## A380 Retirement Analysis
def a380_retirement_analysis(df):
    """Compare Malaysia Airlines' ratings before and after 2022-01-01.

    Reviews are labelled ``'Pre_A380_Retirement'`` or
    ``'Post_A380_Retirement'`` from ``parsed_date``. Reviews whose date is
    missing or cannot be parsed are left unlabelled (NaN) and excluded from
    both periods instead of being counted as post-retirement. Mean ratings per
    period and independent two-sample t-tests are printed, and the closing
    finding reflects whether any test was significant at p < 0.05.

    Args:
        df: Review-level DataFrame with an ``airline`` column; ``parsed_date``
            is required for the analysis to run.

    Returns:
        Copy of the Malaysia Airlines rows with ``parsed_date`` converted to
        datetime and an added ``a380_period`` column, or ``None`` when
        ``parsed_date`` is not available.
    """
    pre_label = 'Pre_A380_Retirement'
    post_label = 'Post_A380_Retirement'

    print("\n=== A380 RETIREMENT IMPACT ANALYSIS ===")
    print("Note: Top-3 airlines retained A380 fleets during this period")

    # Focus on Malaysia Airlines
    mab_data = df[df['airline'] == 'malaysia_airlines'].copy()

    if 'parsed_date' not in mab_data.columns:
        print("Date information not available for temporal analysis")
        return None

    # Create pre/post 2022 periods (A380 retirement); unparseable dates -> NaT
    mab_data['parsed_date'] = pd.to_datetime(mab_data['parsed_date'], errors='coerce')
    cutoff_date = pd.to_datetime('2022-01-01')
    review_dates = mab_data['parsed_date']
    undated = review_dates.isna()

    period_labels = pd.Series(
        np.where(review_dates < cutoff_date, pre_label, post_label),
        index=mab_data.index,
        dtype=object,
    )
    # NaT compares False against the cutoff, so it would otherwise be filed
    # under the post-retirement period.
    period_labels[undated] = np.nan
    mab_data['a380_period'] = period_labels

    period_counts = mab_data['a380_period'].value_counts()
    print("\n=== TEMPORAL DISTRIBUTION ===")
    for period_name, count in period_counts.items():
        print(f"  {period_name:20}: {count:4} reviews")
    if undated.any():
        print(f"  Excluded {int(undated.sum())} reviews with missing or unparseable dates")

    # Performance comparison pre/post retirement
    service_cols = ['overall_rating', 'seating_comfort', 'staff_service', 'value_for_money']
    available_cols = [col for col in service_cols if col in mab_data.columns]

    print("\n=== A380 RETIREMENT ASSESSMENT ===")

    tests_run = 0
    significant_services = []

    if pre_label in period_counts.index and post_label in period_counts.index:
        period_performance = mab_data.groupby('a380_period')[available_cols].mean()
        pre_scores = period_performance.loc[pre_label]
        post_scores = period_performance.loc[post_label]

        print("\nOVERALL PERFORMANCE COMPARISON")
        for service in available_cols:
            pre_score = pre_scores[service]
            post_score = post_scores[service]
            change = post_score - pre_score
            direction = "↑" if change > 0 else "↓"

            print(f"    {service:18}: Pre: {pre_score:.2f} → Post: {post_score:.2f} ({change:+.2f} {direction})")

        # Statistical significance testing
        print("\nSTATISTICAL SIGNIFICANCE (t-tests)")
        is_pre = mab_data['a380_period'] == pre_label
        is_post = mab_data['a380_period'] == post_label
        for service in available_cols:
            rated = mab_data[service].notna()
            pre_data = mab_data.loc[is_pre & rated, service]
            post_data = mab_data.loc[is_post & rated, service]

            if len(pre_data) > 1 and len(post_data) > 1:
                t_stat, p_value = ttest_ind(pre_data, post_data)
                tests_run += 1
                if p_value < 0.05:
                    significance = "Significant"
                    significant_services.append(service)
                else:
                    significance = "Not Significant"

                print(f"    {service:18}: t={t_stat:6.2f}, p={p_value:.4f} [{significance}]")

    # Report a finding that follows from the tests instead of a fixed claim.
    if tests_run == 0:
        print("\nFinding: Insufficient data to assess impact on service ratings")
    elif significant_services:
        print(f"\nFinding: Statistically significant change in {', '.join(significant_services)}")
    else:
        print("\nFinding: Minimal statistical impact on service ratings")

    return mab_data

In [None]:
## Service Priority Regression
def service_priority_regression(df, focus_airlines):
    """Regress Malaysia Airlines' overall rating on its service ratings.

    Fits an OLS model of ``overall_rating`` on the available service
    dimensions using complete Malaysia Airlines rows, then refits with
    z-scored predictors so the coefficients can be ranked against each other.

    Args:
        df: Review-level DataFrame with ``airline``, ``overall_rating`` and at
            least three of the service predictor columns.
        focus_airlines: Accepted for a uniform call signature; not used here.

    Returns:
        Tuple ``(model, importance_df)`` where ``model`` is the fitted
        unstandardised OLS result and ``importance_df`` lists each predictor's
        standardised coefficient, p-value, significance flag and absolute
        coefficient, sorted by absolute coefficient (largest first). Returns
        ``None`` when there are too few predictors, fewer than 50 complete
        rows, or no predictor with any variance.
    """
    # Prepare regression data
    service_predictors = ['seating_comfort', 'staff_service', 'food_quality', 'entertainment', 'value_for_money']
    available_predictors = [col for col in service_predictors if col in df.columns]

    if 'overall_rating' not in df.columns or len(available_predictors) < 3:
        print("Insufficient data for regression analysis")
        return None

    # Focus on Malaysia Airlines, complete cases only
    mab_data = df[df['airline'] == 'malaysia_airlines']
    clean_data = mab_data[['overall_rating'] + available_predictors].dropna()

    if len(clean_data) < 50:
        print(f"Insufficient clean data for regression: {len(clean_data)} observations")
        return None

    # A predictor with zero variance cannot be standardised (division by
    # zero), and statsmodels' add_constant skips adding an intercept when it
    # finds an existing constant column, so drop such predictors up front.
    predictor_std = clean_data[available_predictors].std()
    constant_predictors = [col for col in available_predictors if not predictor_std[col] > 0]
    available_predictors = [col for col in available_predictors if col not in constant_predictors]

    print("\n=== REGRESSION ANALYSIS SETUP ===")
    print(f"  Sample size: {len(clean_data)} observations")
    if constant_predictors:
        print(f"  Dropped constant predictors: {', '.join(constant_predictors)}")
    print(f"  Predictors: {len(available_predictors)} service dimensions")

    if not available_predictors:
        print("Insufficient data for regression analysis")
        return None

    X = clean_data[available_predictors]
    y = clean_data['overall_rating']

    # Fit regression model with an intercept
    model = sm.OLS(y, sm.add_constant(X)).fit()

    print("\n=== REGRESSION RESULTS (Malaysia Airlines) ===")
    print(f"  R-squared: {model.rsquared:.3f}")
    print(f"  Adjusted R-squared: {model.rsquared_adj:.3f}")
    print(f"  F-statistic: {model.fvalue:.2f} (p={model.f_pvalue:.4f})")

    # Coefficient analysis
    print("\n=== SERVICE IMPORTANCE RANKING (Standardized Coefficients) ===")

    # Standardize predictors so coefficient magnitudes are comparable
    X_std = (X - X.mean()) / X.std()
    model_std = sm.OLS(y, sm.add_constant(X_std)).fit()

    # Select predictor coefficients by label so the result does not depend on
    # where the intercept sits in the parameter vector.
    coefficients = model_std.params[available_predictors]
    p_values = model_std.pvalues[available_predictors]

    importance_df = pd.DataFrame({
        'Service': available_predictors,
        'Coefficient': coefficients.values,
        'P_Value': p_values.values,
        'Significant': p_values.values < 0.05,
    })
    importance_df['Abs_Coefficient'] = importance_df['Coefficient'].abs()
    importance_df = importance_df.sort_values('Abs_Coefficient', ascending=False)

    for _, row in importance_df.iterrows():
        if row['P_Value'] < 0.001:
            significance = "***"
        elif row['P_Value'] < 0.01:
            significance = "**"
        elif row['P_Value'] < 0.05:
            significance = "*"
        else:
            significance = ""
        direction = "↑" if row['Coefficient'] > 0 else "↓"

        print(f"  {row['Service']:18}: β={row['Coefficient']:+6.3f} {direction} (p={row['P_Value']:.4f}){significance}")

    # Business recommendations: top three by |β|, significant ones only
    print("\n=== STRATEGIC RECOMMENDATIONS ===")
    for i, (_, row) in enumerate(importance_df.head(3).iterrows(), 1):
        if not row['Significant']:
            continue
        if row['Abs_Coefficient'] > 0.5:
            impact = "High"
        elif row['Abs_Coefficient'] > 0.3:
            impact = "Medium"
        else:
            impact = "Low"
        print(f"  {i}. Focus on {row['Service']}: {impact} impact on overall satisfaction")

    return model, importance_df

In [None]:
## Individual Competitive Visualizations
def create_competitive_visualizations(df, focus_airlines, gap_analysis, importance_ranking=None, r_squared=None):
    """Draw the competitive-positioning charts, one matplotlib figure each.

    Figures, in order: service radar chart, gap vs Qatar Airways, head-to-head
    vs Emirates, Business vs Economy class, regression coefficients, and
    overall-rating trend by ``analysis_period``. A section whose inputs are
    missing still opens and shows an (empty) figure, as before.

    Args:
        df: Review-level DataFrame with an ``airline`` column and the service
            rating columns; ``travel_class`` and ``analysis_period`` are used
            when present.
        focus_airlines: Accepted for a uniform call signature; the charts use
            the four fixed airline keys below.
        gap_analysis: Gap table from the descriptive analysis; only its length
            is used, to decide whether the gap and head-to-head charts are
            drawn.
        importance_ranking: Optional DataFrame with ``Service``,
            ``Coefficient`` and ``Abs_Coefficient`` columns for the regression
            chart.
        r_squared: Optional R² of the regression; shown in the regression
            chart title when given.

    Returns:
        None. The function only produces plots.
    """
    print("\n=== COMPETITIVE POSITIONING VISUALIZATIONS ===")

    airline_labels = {
        'malaysia_airlines': 'Malaysia Airlines',
        'qatar_airways': 'Qatar Airways',
        'singapore_airlines': 'Singapore Airlines',
        'emirates': 'Emirates',
    }

    service_cols = ['overall_rating', 'seating_comfort', 'staff_service', 'food_quality', 'entertainment', 'value_for_money']
    available_cols = [col for col in service_cols if col in df.columns]
    axis_labels = [col.replace('_', ' ').title() for col in available_cols]

    # Competitive Performance Radar Chart
    plt.figure(figsize=(10, 8))
    if len(available_cols) >= 4:
        performance_matrix = df.groupby('airline')[available_cols].mean()
        # Only plot airlines that actually have rows; indexing a missing
        # airline with .loc would raise KeyError.
        radar_airlines = [a for a in airline_labels if a in performance_matrix.index]
        radar_data = performance_matrix.loc[radar_airlines, available_cols]

        # Normalize to 0-1 scale for radar: overall rating is 1-10, others 1-5
        scale = pd.Series({col: 10 if col == 'overall_rating' else 5 for col in available_cols})
        normalized_data = radar_data.div(scale, axis='columns')[available_cols]

        angles = np.linspace(0, 2*np.pi, len(available_cols), endpoint=False).tolist()
        angles += angles[:1]  # Complete the circle

        ax = plt.subplot(111, projection='polar')

        radar_colors = {
            'malaysia_airlines': 'blue',
            'qatar_airways': 'purple',
            'singapore_airlines': 'gold',
            'emirates': 'red',
        }

        for airline in radar_airlines:
            values = normalized_data.loc[airline].tolist()
            values += values[:1]  # Complete the circle

            ax.plot(angles, values, 'o-', linewidth=2, label=airline_labels[airline], color=radar_colors[airline])
            ax.fill(angles, values, alpha=0.25, color=radar_colors[airline])

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(axis_labels)
        ax.set_ylim(0, 1)
        plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1.0))
        plt.title('Service Performance Radar Chart\nMalaysia Airlines vs Top-3 Competitors', fontweight='bold', size=14, pad=20)

    plt.tight_layout()
    plt.show()

    # Gap Analysis vs Qatar Airways
    plt.figure(figsize=(10, 6))
    if len(gap_analysis) > 0:
        # Calculate gaps as MAB - Qatar (negative = behind, positive = ahead)
        mab_performance = df[df['airline'] == 'malaysia_airlines'][available_cols].mean()
        qatar_performance = df[df['airline'] == 'qatar_airways'][available_cols].mean()
        qatar_gaps = (mab_performance - qatar_performance).sort_values(ascending=True)

        colors = ['red' if x < 0 else 'green' for x in qatar_gaps.values]

        plt.barh(range(len(qatar_gaps)), qatar_gaps.values, color=colors, alpha=0.7)
        plt.yticks(range(len(qatar_gaps)), [service.replace('_', ' ').title() for service in qatar_gaps.index])
        plt.xlabel('Gap vs Qatar Airways')
        plt.title('Malaysia Airlines Service Gaps vs Qatar Airways\n(Negative = Behind, Positive = Ahead)', fontweight='bold', size=14)
        plt.axvline(x=0, color='black', linestyle='--', alpha=0.5)

        # Add value labels just outside the end of each bar
        for i, value in enumerate(qatar_gaps.values):
            plt.text(value + 0.02 if value >= 0 else value - 0.02, i, f'{value:+.2f}', va='center', ha='left' if value >= 0 else 'right', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # MAB vs Emirates Head-to-Head
    plt.figure(figsize=(10, 6))
    if len(gap_analysis) > 0:
        mab_performance = df[df['airline'] == 'malaysia_airlines'][available_cols].mean()
        emirates_performance = df[df['airline'] == 'emirates'][available_cols].mean()

        x = np.arange(len(available_cols))
        width = 0.35

        bars1 = plt.bar(x - width/2, mab_performance.values, width, label='Malaysia Airlines', color='blue', alpha=0.7)
        bars2 = plt.bar(x + width/2, emirates_performance.values, width, label='Emirates', color='red', alpha=0.7)

        plt.xlabel('Service Dimensions')
        plt.ylabel('Average Rating')
        plt.title('Malaysia Airlines vs Emirates Head-to-Head Comparison\n(Note: Different scales - Overall Rating 1-10, Others 1-5)', fontweight='bold', size=14)
        plt.xticks(x, axis_labels, rotation=45)
        plt.legend()

        # Add value labels
        for bar1, bar2, val1, val2 in zip(bars1, bars2, mab_performance.values, emirates_performance.values):
            plt.text(bar1.get_x() + bar1.get_width()/2, bar1.get_height() + 0.05, f'{val1:.2f}', ha='center', va='bottom', fontweight='bold')
            plt.text(bar2.get_x() + bar2.get_width()/2, bar2.get_height() + 0.05, f'{val2:.2f}', ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Business vs Economy Class Performance
    plt.figure(figsize=(10, 6))
    mab_data = df[df['airline'] == 'malaysia_airlines']
    if 'travel_class' in mab_data.columns and 'overall_rating' in mab_data.columns:
        class_performance = mab_data.groupby('travel_class')[available_cols].mean()

        if 'Business Class' in class_performance.index and 'Economy Class' in class_performance.index:
            x = np.arange(len(available_cols))
            width = 0.35

            business_scores = class_performance.loc['Business Class']
            economy_scores = class_performance.loc['Economy Class']

            # Passenger shares come from the data being plotted, not from a
            # fixed figure baked into the title.
            class_share = mab_data['travel_class'].value_counts() / len(mab_data) * 100
            business_share = class_share.get('Business Class', 0.0)
            economy_share = class_share.get('Economy Class', 0.0)

            bars1 = plt.bar(x - width/2, business_scores.values, width, label='Business Class', color='gold', alpha=0.7)
            bars2 = plt.bar(x + width/2, economy_scores.values, width, label='Economy Class', color='lightblue', alpha=0.7)

            plt.xlabel('Service Dimensions')
            plt.ylabel('Average Rating')
            plt.title(
                'Malaysia Airlines: Business vs Economy Class Performance\n'
                f'(Business: {business_share:.1f}% passengers, Economy: {economy_share:.1f}% passengers)',
                fontweight='bold', size=14,
            )
            plt.xticks(x, axis_labels, rotation=45)
            plt.legend()

            # Add value labels
            for bar1, bar2, val1, val2 in zip(bars1, bars2, business_scores.values, economy_scores.values):
                plt.text(bar1.get_x() + bar1.get_width()/2, bar1.get_height() + 0.05, f'{val1:.2f}', ha='center', va='bottom', fontweight='bold')
                plt.text(bar2.get_x() + bar2.get_width()/2, bar2.get_height() + 0.05, f'{val2:.2f}', ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Service Impact Regression Coefficients
    plt.figure(figsize=(10, 6))
    if importance_ranking is not None and len(importance_ranking) > 0:
        importance_sorted = importance_ranking.sort_values('Abs_Coefficient', ascending=True)
        coefficient_values = importance_sorted['Coefficient'].values

        colors = ['green' if coef > 0 else 'red' for coef in coefficient_values]
        plt.barh(range(len(importance_sorted)), coefficient_values, color=colors, alpha=0.7)

        plt.yticks(range(len(importance_sorted)), [service.replace('_', ' ').title() for service in importance_sorted['Service']])
        plt.xlabel('Standardized Coefficient (β)')
        regression_title = 'Service Impact on Overall Satisfaction\nMalaysia Airlines Regression Analysis'
        if r_squared is not None:
            # Show the fit quality only when the caller supplies the real value.
            regression_title += f' (R² = {r_squared:.3f})'
        plt.title(regression_title, fontweight='bold', size=14)
        plt.axvline(x=0, color='black', linestyle='--', alpha=0.5)

        # Add value labels just outside the end of each bar
        for i, value in enumerate(coefficient_values):
            plt.text(value + 0.02 if value >= 0 else value - 0.02, i, f'{value:+.3f}', va='center', ha='left' if value >= 0 else 'right', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Temporal Performance Trends
    plt.figure(figsize=(12, 6))
    if 'analysis_period' in df.columns:
        period_performance = df.groupby(['airline', 'analysis_period'])['overall_rating'].mean().unstack()

        if len(period_performance.columns) > 1:
            trend_colors = {
                'malaysia_airlines': 'red',
                'qatar_airways': 'green',
                'singapore_airlines': 'blue',
                'emirates': 'orange',
            }
            trend_airlines = [a for a in airline_labels if a in period_performance.index]

            for airline in trend_airlines:
                plt.plot(period_performance.columns, period_performance.loc[airline],
                         marker='o', label=airline_labels[airline], linewidth=3, markersize=8, color=trend_colors[airline])

            plt.ylabel('Overall Rating (1-10 scale)')
            plt.xlabel('Analysis Period')
            plt.title('Performance Trends Over Time\nMalaysia Airlines vs Top-3 Competitors', fontweight='bold', size=14)
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.xticks(rotation=45)

            # Add value labels for latest period.
            # NOTE(review): the x position is the column's ordinal index, which
            # lines up with the plotted points only when the period labels are
            # categorical (strings) - confirm against the data.
            latest_period = period_performance.columns[-1]
            for airline in trend_airlines:
                latest_value = period_performance.loc[airline, latest_period]
                plt.annotate(f'{latest_value:.1f}', xy=(len(period_performance.columns)-1, latest_value), xytext=(5, 5), textcoords='offset points', fontweight='bold', color=trend_colors[airline])

    plt.tight_layout()
    plt.show()

    return None

In [None]:
## Business Intelligence Summary
def generate_business_intelligence_summary(df, focus_airlines, gap_analysis, anova_results, effect_results):
    """Print an executive summary for Malaysia Airlines and return its headline metrics.

    Args:
        df: Review-level DataFrame. Must contain ``airline`` and
            ``overall_rating``. ``staff_service``, ``value_for_money``,
            ``travel_class`` and ``recommended`` are used only when present.
        focus_airlines: Airlines in scope; only its length is used, as the
            denominator of the printed rank.
        gap_analysis: DataFrame indexed by service name with an ``avg_gap``
            column. The three smallest ``avg_gap`` values are reported as
            improvement priorities.
        anova_results: Mapping of service name to a dict holding ``p_value``.
        effect_results: Accepted for interface compatibility; not read here.

    Returns:
        dict with keys ``overall_gap``, ``market_rank``, ``top_priorities``,
        ``significant_gaps`` and ``recommendation_rate``. The rate is ``0``
        when ``df`` has no ``recommended`` column.

    Raises:
        ValueError: If ``df`` has no 'malaysia_airlines' group to rank.
    """
    print(f"\n=== BUSINESS INTELLIGENCE SUMMARY ===")

    # Executive Summary
    print(f"EXECUTIVE SUMMARY: Malaysia Airlines Competitive Position")

    # Overall positioning
    mab_data = df[df['airline'] == 'malaysia_airlines']
    mab_overall = mab_data['overall_rating'].mean()
    top3_overall = df[df['airline'].isin(['qatar_airways', 'singapore_airlines', 'emirates'])]['overall_rating'].mean()
    overall_gap = mab_overall - top3_overall

    print(f"\n=== OVERALL PERFORMANCE ===")
    print(f"  • Malaysia Airlines Average Rating: {mab_overall:.2f}/10")
    print(f"  • Top-3 Airlines Average Rating: {top3_overall:.2f}/10")
    print(f"  • Performance Gap: {overall_gap:+.2f} points ({'AHEAD' if overall_gap > 0 else 'BEHIND'})")

    # Market position assessment
    all_airlines_rating = df.groupby('airline')['overall_rating'].mean().sort_values(ascending=False)
    try:
        mab_rank = list(all_airlines_rating.index).index('malaysia_airlines') + 1
    except ValueError:
        # Same exception type as before, but with a message that names the cause.
        raise ValueError(
            "No 'malaysia_airlines' rows found in df; cannot compute market rank"
        ) from None

    print(f"\n=== MARKET POSITIONING ===")
    print(f"  • Current Rank: #{mab_rank} out of {len(focus_airlines)} premium carriers (Skytrax 2024)")
    print(f"  • Competitive Status: {'COMPETITIVE' if mab_rank <= 2 else 'IMPROVEMENT NEEDED'}")

    # Top improvement priorities. The same sorted selection feeds both the
    # printed list and the returned 'top_priorities', so they cannot disagree.
    top_priorities = []
    if len(gap_analysis) > 0:
        top_gaps = gap_analysis.sort_values('avg_gap', ascending=True).head(3)
        top_priorities = top_gaps.index.tolist()

        print(f"\n=== TOP 3 IMPROVEMENT PRIORITIES")
        for i, (service, gap_data) in enumerate(top_gaps.iterrows(), 1):
            gap = gap_data['avg_gap']
            urgency = "CRITICAL" if gap < -0.5 else "HIGH" if gap < -0.3 else "MODERATE"
            print(f"  {i}. {service.replace('_', ' ').title()}: {gap:+.2f} points [{urgency} PRIORITY]")

    # Statistical significance insights
    significant_gaps = [service for service, result in anova_results.items() if result['p_value'] < 0.05]

    print(f"\n=== STATISTICAL VALIDATION ===")
    print(f"  • Statistically Significant Gaps: {len(significant_gaps)} out of {len(anova_results)} services tested")
    # Only claim "all significant" when the tests actually support it.
    if anova_results and len(significant_gaps) == len(anova_results):
        print(f"  • All service dimensions show significant competitive differences")
    elif significant_gaps:
        print(f"  • Significant competitive differences in: {', '.join(str(s) for s in significant_gaps)}")
    else:
        print(f"  • No service dimension shows a statistically significant difference")

    # Emirates vs MAB insight (surprising finding)
    # Restrict to columns that exist so a missing service column does not abort the summary.
    comparison_metrics = [
        metric for metric in ('overall_rating', 'staff_service', 'value_for_money')
        if metric in df.columns
    ]
    emirates_mab_comparison = df.groupby('airline')[comparison_metrics].mean()
    if 'emirates' in emirates_mab_comparison.index and 'malaysia_airlines' in emirates_mab_comparison.index:
        print(f"\n=== KEY FINDING - EMIRATES VS MALAYSIA AIRLINES ===")
        for service in comparison_metrics:
            mab_score = emirates_mab_comparison.loc['malaysia_airlines', service]
            emirates_score = emirates_mab_comparison.loc['emirates', service]
            if mab_score > emirates_score:
                print(f"  • MAB outperforms Emirates in {service}: {mab_score:.2f} vs {emirates_score:.2f}")

    # Business class vs economy analysis
    if 'travel_class' in mab_data.columns:
        class_performance = mab_data.groupby('travel_class')['overall_rating'].mean()

        if 'Business Class' in class_performance.index and 'Economy Class' in class_performance.index:
            business_rating = class_performance['Business Class']
            economy_rating = class_performance['Economy Class']
            class_gap = business_rating - economy_rating

            print(f"\n=== TRAVEL CLASS ANALYSIS ===")
            print(f"  • Business Class Performance: {business_rating:.2f}/10")
            print(f"  • Economy Class Performance: {economy_rating:.2f}/10")
            print(f"  • Class Premium: {class_gap:+.2f} points")

            # Strategic recommendation
            business_pct = (mab_data['travel_class'] == 'Business Class').mean() * 100
            print(f"  • Business Class Share: {business_pct:.1f}% of passengers")
            print(f"  • Strategic Focus: {'BUSINESS CLASS DIFFERENTIATION' if class_gap > 1.0 else 'BALANCED IMPROVEMENT'}")

    # Recommendation rate. The returned value stays 0 when the column is
    # absent (unchanged contract), but that placeholder must not be reported
    # as a real "low recommendation" risk below.
    has_recommendation = 'recommended' in mab_data.columns
    recommendation_rate = (mab_data['recommended'] == 'yes').mean() * 100 if has_recommendation else 0

    # Risk assessment
    print(f"\n=== STRATEGIC RISKS ===")
    if has_recommendation and recommendation_rate < 60:
        print(f"  • LOW RECOMMENDATION RATE: {recommendation_rate:.1f}% (Industry target: >70%)")

    if overall_gap < -0.5:
        print(f"  • SIGNIFICANT COMPETITIVE GAP: Risk of market share loss")

    return {
        'overall_gap': overall_gap,
        'market_rank': mab_rank,
        'top_priorities': top_priorities,
        'significant_gaps': significant_gaps,
        'recommendation_rate': recommendation_rate
    }

In [None]:
## Execute complete statistical analysis
def execute_statistical_analysis(df):
    """Run every analysis stage in sequence and collect the key outputs.

    The stages are invoked in a fixed order: scope setup, descriptive
    comparison, travel-class analysis, ANOVA, effect sizes, A380 analysis,
    regression, visualizations, and finally the business summary.

    Args:
        df: DataFrame handed to ``setup_competitive_analysis``.

    Returns:
        dict with keys ``cleaned_data``, ``competitive_summary``,
        ``gap_analysis``, ``anova_results``, ``effect_results``,
        ``regression_model`` and ``business_summary``.
    """
    # Stage 1: scope the data, then describe it.
    focus_df, focus_airlines, _top_3 = setup_competitive_analysis(df)
    summary_table, gaps = competitive_descriptive_analysis(focus_df, focus_airlines)
    # Called for its side effects only; its result is not part of the output.
    travel_class_analysis(focus_df)

    # Stage 2: hypothesis tests and effect sizes.
    anova = competitive_anova_analysis(focus_df, focus_airlines)
    effects = effect_size_analysis(focus_df, focus_airlines)

    # Stage 3: A380 analysis (result not part of the output).
    a380_retirement_analysis(focus_df)

    # Stage 4: regression; the importance ranking feeds the charts below.
    model, importance = service_priority_regression(focus_df, focus_airlines)

    # Stage 5: charts (result not part of the output).
    create_competitive_visualizations(focus_df, focus_airlines, gaps, importance)

    # Stage 6: narrative summary built from the earlier results.
    bi_summary = generate_business_intelligence_summary(
        focus_df, focus_airlines, gaps, anova, effects
    )

    results = dict(
        cleaned_data=focus_df,
        competitive_summary=summary_table,
        gap_analysis=gaps,
        anova_results=anova,
        effect_results=effects,
        regression_model=model,
        business_summary=bi_summary,
    )
    return results

In [None]:
## Run statistical analysis
# Runs the full pipeline on the DataFrame loaded from the cleaned CSV earlier in
# the file; `results` holds the dict returned by execute_statistical_analysis.
results = execute_statistical_analysis(df)