In [None]:
import pandas as pd
import numpy as np
import glob
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report
import warnings
import xgboost as xgb
from tqdm import tqdm
from xgboost.callback import TrainingCallback
from scipy.stats import chi2_contingency
from scipy import stats

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Columns that preprocess_data label-encodes whenever they are present.
categorical_features = ['UniqueCarrier', 'TailNum', 'Status']
# One LabelEncoder per categorical column, created lazily by preprocess_data
# and reused by later calls (and by the reporting code for 'Status').
label_encoders = {}
# Shared scaler that preprocess_data applies to every column except 'Status'.
scaler = StandardScaler()

def preprocess_data(file_pattern, include_status=True):
  """Load every CSV matching *file_pattern* and return a model-ready frame.

  The function keeps a fixed subset of flight columns, label-encodes the
  categorical ones, coerces ``FlightNum`` to numeric, drops rows that contain
  missing values and standardises every column except ``Status``.

  The module-level ``label_encoders`` and ``scaler`` are fitted by the first
  call that finds them unfitted and are only *applied* by later calls, so a
  test set processed after the training set is transformed with the encoders
  and scaling statistics learned from the training set.

  Args:
    file_pattern: Glob pattern of the CSV files to load.
    include_status: Whether to keep (and label-encode) the ``Status`` column.

  Returns:
    The processed ``DataFrame``. Its index still carries the row positions of
    the concatenated input, so rows removed by ``dropna`` remain identifiable.

  Raises:
    ValueError: If no file matches *file_pattern*.
  """
  # Sort so that row order does not depend on the file system's listing order.
  csv_files = sorted(glob.glob(file_pattern))
  if not csv_files:
    raise ValueError(f"No files found for pattern {file_pattern}")
  data = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

  columns_to_keep = [
    'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier',
    'TailNum', 'FlightNum', 'OriginAirportID', 'OriginCityMarketID',
    'OriginStateFips', 'OriginWac', 'DestAirportID', 'DestCityMarketID',
    'DestStateFips', 'DestWac', 'CRSDepTime', 'CRSArrTime', 'Flights',
    'Distance']
  if include_status:
    columns_to_keep.append('Status')
  # Explicit copy: the assignments below must never target a view of the
  # concatenated frame.
  data = data[columns_to_keep].copy()

  for feature in categorical_features:
    if feature not in data.columns:
      continue
    if feature not in label_encoders:
      label_encoders[feature] = LabelEncoder()
      data[feature] = label_encoders[feature].fit_transform(data[feature])
    else:
      # NOTE(review): LabelEncoder.transform raises ValueError for values that
      # were not present when the encoder was fitted (likely for 'TailNum' in
      # unseen data) - decide how such rows should be handled.
      data[feature] = label_encoders[feature].transform(data[feature])

  data['FlightNum'] = pd.to_numeric(data['FlightNum'], errors='coerce')
  data = data.dropna()

  features_to_scale = [col for col in data.columns if col != 'Status']
  if hasattr(scaler, 'scale_'):
    # Scaler already fitted by an earlier call: reuse its statistics instead
    # of re-fitting, otherwise each data set would be scaled differently.
    data[features_to_scale] = scaler.transform(data[features_to_scale])
  else:
    data[features_to_scale] = scaler.fit_transform(data[features_to_scale])

  return data

# Load the training files and separate the target column from the features
print("Start reading training data...")
train_data = preprocess_data('flights-data/*.csv')
y = train_data['Status']
X = train_data.drop(columns='Status')
print(f"Final processed training data size: {train_data.shape}")

# Load the test files without the 'Status' column; every remaining column is
# used as a feature
test_data = preprocess_data('test-data/*.csv', include_status=False)
X_test = test_data.copy(deep=True)
print(f"Final processed test data size: {test_data.shape}")

# Cross-validated model training starts below
print("\nStart training model...")

class TqdmCallback(TrainingCallback):
  """Training callback that mirrors boosting progress in a tqdm bar.

  The bar is created when the callback is constructed, advanced by one step
  after every iteration and closed when training ends.
  """

  def __init__(self, total):
    """Create the progress bar; *total* is the step count of the full bar."""
    super().__init__()
    self.pbar = tqdm(desc='Training', total=total)

  def after_iteration(self, model, epoch, evals_log):
    """Advance the bar by one step and return False.

    NOTE(review): False is understood to mean "do not stop training" in
    XGBoost's callback contract - confirm against the installed version.
    """
    self.pbar.update(1)
    stop_training = False
    return stop_training

  def after_training(self, model):
    """Close the bar and hand the model back unchanged."""
    self.pbar.close()
    return model

# Initialize
# Number of boosting rounds. Shared by the classifier and its progress bar so
# the two values cannot drift apart.
N_ESTIMATORS = 10000
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3407)
f1_scores = []
models = []
test_predictions = []

# Full set of encoded target labels and their display names. Passing both to
# classification_report keeps the report valid even when a fold's true and
# predicted labels together do not cover every class.
status_classes = label_encoders['Status'].classes_
status_labels = np.arange(len(status_classes))
status_names = [str(name) for name in status_classes]

# Cross-validation training: one model per stratified fold
for fold, (train_index, valid_index) in enumerate(skf.split(X, y), 1):
  print(f"\nFold {fold}:")
  X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
  y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

  # A fresh callback per fold, because each one owns its own progress bar.
  clf = xgb.XGBClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=20,
    learning_rate=0.5,
    subsample=0.85,
    colsample_bytree=0.75,
    objective='multi:softmax',
    random_state=3407,
    n_jobs=-1,
    callbacks=[TqdmCallback(total=N_ESTIMATORS)]
  )

  # Train model
  clf.fit(X_train, y_train)
  models.append(clf)

  # Validation set evaluation
  y_valid_pred = clf.predict(X_valid)
  f1 = f1_score(y_valid, y_valid_pred, average='macro')
  f1_scores.append(f1)
  print(f"Fold {fold} F1 score: {f1:.4f}")
  print("Classification report:")
  print(classification_report(y_valid, y_valid_pred,
                labels=status_labels,
                target_names=status_names))

  # Test set prediction; the per-fold predictions are combined later
  test_pred = clf.predict(X_test)
  test_predictions.append(test_pred)

print(f"\nCross-validation average F1 score: {np.mean(f1_scores):.4f}")
print(f"F1 score standard deviation: {np.std(f1_scores):.4f}")


In [None]:
# Ensemble prediction
print("\nStarting ensemble prediction...")
# Stack the per-fold predictions (one row per fold) and take the most frequent
# label for each test row.
test_predictions = np.array(test_predictions)
y_test_pred = stats.mode(test_predictions, axis=0).mode.flatten()

# Save prediction results
status_encoder = label_encoders['Status']
predicted_status = status_encoder.inverse_transform(y_test_pred)
original_test_data = pd.read_csv('test-data/test_data.csv')
if len(predicted_status) == len(original_test_data):
  original_test_data['Status'] = predicted_status
else:
  # preprocess_data drops rows with missing values but keeps their original
  # row positions in the index, so predictions are aligned on that index
  # rather than written back positionally.
  if (len(predicted_status) != len(test_data)
      or test_data.index.max() >= len(original_test_data)):
    raise ValueError(
      f"Cannot align {len(predicted_status)} predictions with "
      f"{len(original_test_data)} rows of test-data/test_data.csv")
  original_test_data['Status'] = pd.Series(predicted_status, index=test_data.index)
  n_missing = len(original_test_data) - len(predicted_status)
  print(f"Warning: {n_missing} test rows were dropped during preprocessing and have no prediction.")
output_file_with_original = 'test_data_with_predictions.csv'
original_test_data.to_csv(output_file_with_original, index=False)
print(f"Prediction results have been inserted into the original test data and saved to {output_file_with_original}")

# Label distribution analysis
print("\nLabel distribution analysis...")
# Size both count vectors by the number of encoded classes so they always
# line up, even if the highest-numbered class is absent from one of them.
n_classes = len(status_encoder.classes_)
train_label_counts = np.bincount(y.to_numpy(), minlength=n_classes)
test_label_counts = np.bincount(y_test_pred, minlength=n_classes)

print("\nTraining set label distribution:")
for label, count in enumerate(train_label_counts):
  label_name = status_encoder.classes_[label]
  percentage = (count / len(y)) * 100
  print(f"Label {label_name}: {count} ({percentage:.2f}%)")

print("\nTest set predicted label distribution:")
for label, count in enumerate(test_label_counts):
  label_name = status_encoder.classes_[label]
  percentage = (count / len(y_test_pred)) * 100
  print(f"Label {label_name}: {count} ({percentage:.2f}%)")

# Distribution difference test
observed = np.array([train_label_counts, test_label_counts])
# A class that occurs in neither row would give an expected frequency of zero,
# which chi2_contingency rejects; such columns carry no information anyway.
observed = observed[:, observed.sum(axis=0) > 0]
chi2, p, dof, expected = chi2_contingency(observed)
print(f"\nDistribution difference test results:")
print(f"Chi-squared statistic: {chi2:.4f}")
print(f"p-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

if p > 0.05:
  print("There is no significant difference in label distribution between the training set and the test set.")
else:
  print("There is a significant difference in label distribution between the training set and the test set.")
