In [14]:
import random
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier

In [15]:
# Set random seed for reproducibility
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [16]:
# Load data directly from CSV
print('Loading data...')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
print('Data loaded successfully')

Loading data...
Data loaded successfully


In [17]:
# Handle missing values
print('Handling missing values...')
NaN_col = ['Origin_State', 'Destination_State', 'Airline', 
           'Estimated_Departure_Time', 'Estimated_Arrival_Time',
           'Carrier_Code(IATA)', 'Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Missing values handled')

Handling missing values...
Missing values handled


In [18]:
# Encode categorical variables
print('Encoding categorical variables...')
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 
            'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for col in qual_col:
    le = LabelEncoder()
    le = le.fit(train[col])
    train[col] = le.transform(train[col])
    
    # Handle new categories in test data
    for label in np.unique(test[col]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[col] = le.transform(test[col])
print('Categorical variables encoded')

Encoding categorical variables...
Categorical variables encoded


In [19]:
# Remove rows with missing target variable
print('Preparing target variable...')
train = train.dropna(subset=['Delay'])

Preparing target variable...


In [20]:
# Encode target variable
le = LabelEncoder()
train['Delay_num'] = le.fit_transform(train['Delay'])

In [21]:
# Prepare features and target
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

In [22]:
# Split data
print('Splitting training data...')
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, 
                                                 test_size=0.2, 
                                                 random_state=42)

Splitting training data...


In [23]:
# Scale features
print('Scaling features...')
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)
test_x = scaler.transform(test_x)

Scaling features...


In [24]:
# Setup cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define model and hyperparameters
print('Training model...')
model = XGBClassifier(random_state=42)
param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'n_estimators': [100, 200]
}

Training model...


In [25]:
# Perform grid search
grid = GridSearchCV(
    model,
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

In [26]:
# Train model
grid.fit(train_x, train_y)
best_model = grid.best_estimator_

# Evaluate model
print('\nEvaluating model performance...')
val_y_pred = best_model.predict(val_x)
metrics = {
    'Accuracy': accuracy_score(val_y, val_y_pred),
    'F1 Score': f1_score(val_y, val_y_pred, average='weighted'),
    'Precision': precision_score(val_y, val_y_pred, average='weighted'),
    'Recall': recall_score(val_y, val_y_pred, average='weighted')
}

for metric_name, score in metrics.items():
    print(f'{metric_name}: {score:.4f}')

Fitting 5 folds for each of 8 candidates, totalling 40 fits

Evaluating model performance...
Accuracy: 0.8215
F1 Score: 0.7418
Precision: 0.7795
Recall: 0.8215


In [27]:
# Make predictions
print('\nMaking predictions on test set...')
y_pred = best_model.predict_proba(test_x)

# Create submission file
print('Creating submission file...')
submission = pd.DataFrame(
    data=y_pred, 
    columns=sample_submission.columns, 
    index=sample_submission.index
)
submission.to_csv('optimized_submission.csv', index=True)
print('Submission file created successfully')


Making predictions on test set...
Creating submission file...
Submission file created successfully


In [28]:
# Print best parameters
print('\nBest parameters found:')
for param, value in grid.best_params_.items():
    print(f'{param}: {value}')


Best parameters found:
learning_rate: 0.1
max_depth: 5
n_estimators: 100
