In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read the benchmark feature table from its relative CSV path.
data = pd.read_csv(filepath_or_buffer='../data/benchmark_features.csv')

# Quick sanity check: (rows, columns), followed by a preview of the first five rows.
print(data.shape)
display(data.head(n=5))


## Import Required Libraries

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, log_loss

# Separate the target column ('dropout') from the feature columns.
y = data['dropout']
X = data.drop(columns='dropout')

# Show the shapes of the feature matrix and the target vector.
X.shape, y.shape

## Split the Data
First split into dev and test.  The test set is 16.5% of the data (`test_size=0.165`)<br>
Then split the dev data into train and validation.  Validation is 20% of the dev data (about 16.7% of all data), so it is roughly the same size as the test set<br>

In [None]:
# Hold out the test set first; everything else becomes the development set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, test_size=0.165, random_state=0
)

# Carve the validation set out of the development set (20% of dev).
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.20, random_state=0
)

# Report the feature/target shapes of every split.
for split_name, split_X, split_y in (
    ('Training set:', X_train, y_train),
    ('Validation set:', X_val, y_val),
    ('Test set:', X_test, y_test),
):
    print(split_name, split_X.shape, split_y.shape)


## Assign columns to dtype for pipelines

In [None]:
# Look up every training column's dtype once, then bucket the columns by exact dtype.
column_dtypes = X_train.dtypes

# Integer (int64) and floating-point (float64) feature names.
int_columns = column_dtypes[column_dtypes == 'int64'].index.tolist()
float_columns = column_dtypes[column_dtypes == 'float64'].index.tolist()

# All numeric features: integer columns first, then float columns.
numeric_columns = [*int_columns, *float_columns]

# Object-dtype columns are the ones treated as categorical.
categorical_columns = column_dtypes[column_dtypes == 'object'].index.tolist()

### Using C-Index as classification metric

## Implement Pipeline Objects
`make_scorer()` - Create a scorer object<br>
`ColumnTransformer()` - Create a column transformer object<br>
`Pipeline()` - Create a pipeline object<br>
`IterativeImputer()` - Impute missing values<br>
`StandardScaler()` - Scale features<br>
`SimpleImputer()` - Impute missing values<br>
`OneHotEncoder()` - Encode categorical features<br>
`LogisticRegression()` - Logistic Regression model<br>
`RandomForestClassifier()` - Random Forest model<br>
`XGBClassifier()` - XGBoost model<br>
`GridSearchCV()` - Grid Search object<br>
<br>
## Param Grid Configurations

```python
'LogisticRegression': {'classifier__C': [0.1, 1, 10]},
'RandomForest': {'classifier__n_estimators': [200, 300, 400],
                'classifier__max_depth': [3, 5, 10]},
'XGBoost': {'classifier__learning_rate': [0.01, 0.1, 0.3],
                'classifier__n_estimators': [200, 300, 400]}
```

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import make_scorer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
import helper

# Define the custom scoring function
def cindex_scorer(y_true, y_pred):
    """Compute the concordance index (C-index) by delegating to ``helper.cindex``.

    Args:
        y_true: Observed outcome labels.
        y_pred: Predicted scores used to rank the samples. Continuous scores
            (e.g. positive-class probabilities) are preferable to hard labels,
            because the C-index measures how well the samples are ordered.

    Returns:
        The value returned by ``helper.cindex`` (presumably a float in
        [0, 1] -- TODO confirm against the helper module).
    """
    return helper.cindex(y_true, y_pred)


# Create a scorer object
def cindex(estimator, X, y):
    """Scorer for ``GridSearchCV``: C-index of the predicted positive-class probability.

    Written as a plain ``scorer(estimator, X, y)`` callable so that it ranks
    samples by ``predict_proba`` rather than by the hard labels from
    ``predict``. A callable scorer is treated as "greater is better", which is
    what the C-index needs.

    Args:
        estimator: Fitted classifier (or pipeline) exposing ``predict_proba``.
        X: Feature matrix to score.
        y: True labels for ``X``.

    Returns:
        The C-index of the estimator's scores on ``X``.
    """
    # Column 1 is the probability of the second class in ``classes_``
    # (assumes a binary target whose positive class sorts last -- TODO confirm).
    positive_proba = estimator.predict_proba(X)[:, 1]
    return cindex_scorer(y, positive_proba)

# Numeric branch: iterative (model-based) imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=0)),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the constant 'missing', then one-hot encode;
# categories unseen during fitting are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# Define the classifiers
# Every learner is seeded (0, matching the seed used for the data splits and
# the imputer) so that a fresh Restart & Run All reproduces the same
# grid-search winners and scores; an unseeded random forest varies run to run.
classifiers = {
    'LogisticRegression': LogisticRegression(random_state=0),
    'RandomForest': RandomForestClassifier(random_state=0),
    'XGBoost': xgb.XGBClassifier(random_state=0)
}

# Define pipelines for each classifier
# Each pipeline is "preprocess, then classify"; the step names 'preprocessor'
# and 'classifier' are the prefixes used by the parameter-grid keys.
# NOTE: every pipeline references the same `preprocessor` instance (not a copy).
pipelines = {
    name: Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', classifier)])
    for name, classifier in classifiers.items()
}

# Hyper-parameter search space per model. Keys use the
# '<pipeline step>__<parameter>' form, addressing the 'classifier' step.
param_grids = {}

# Logistic regression: regularisation setting C.
param_grids['LogisticRegression'] = {'classifier__C': [0.1, 1, 10]}

# Random forest: number of trees and maximum tree depth.
param_grids['RandomForest'] = {
    'classifier__n_estimators': [200, 300, 400],
    'classifier__max_depth': [3, 5, 10],
}

# XGBoost: learning rate and number of boosting rounds.
param_grids['XGBoost'] = {
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__n_estimators': [200, 300, 400],
}


# Initialize results storage
results = []

# Run GridSearchCV for each pipeline
for name, pipeline in pipelines.items():
    print(f"Training {name}...")
    # 5-fold cross-validated search over this model's grid, scored with `cindex`.
    grid_search = GridSearchCV(pipeline, param_grids[name], cv=5, scoring=cindex, return_train_score=True)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # The C-index measures how well samples are ranked, so it is computed from
    # the predicted positive-class probability (column 1) rather than from the
    # hard 0/1 labels returned by predict(), which collapse the ranking into ties.
    y_train_pred = best_model.predict_proba(X_train)[:, 1]
    y_val_pred = best_model.predict_proba(X_val)[:, 1]

    # NOTE: despite the "accuracy" naming (kept so the result keys and printed
    # labels stay unchanged), these values are C-indices.
    train_accuracy = helper.cindex(y_train, y_train_pred)
    val_accuracy = helper.cindex(y_val, y_val_pred)

    print(f"{name} - Train Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # One record per model; collected into a DataFrame after the loop.
    results.append({
        'model': name,
        'train_accuracy': train_accuracy,
        'validation_accuracy': val_accuracy,
        'best_params': grid_search.best_params_
    })

# Tabulate the per-model tuning records.
results_df = pd.DataFrame(results)

# Report each model's winning hyper-parameters, padded with blank lines
# before, between and after the two text lines.
for record in results_df.itertuples(index=False):
    print(
        "",
        f"Model: {record.model}",
        "",
        f"Best Parameters: {record.best_params}",
        "",
        sep="\n",
    )


## Now Run the Models on the Unseen Test Data

In [None]:
# Refit every model on the full development set (train + validation) with its
# tuned hyper-parameters, then score it on the held-out test set.

# Look up the tuned parameters once, before `results_df` is rebuilt below.
best_params_by_model = dict(zip(results_df['model'], results_df['best_params']))

results = []

for name, pipeline in pipelines.items():
    best_params = best_params_by_model[name]

    # set_params returns the same pipeline object, now configured with the best parameters.
    best_model = pipeline.set_params(**best_params)
    best_model.fit(X_dev, y_dev)

    # The C-index is a ranking metric: score the positive-class probability
    # (column 1) instead of the hard labels from predict().
    y_test_pred = best_model.predict_proba(X_test)[:, 1]
    test_accuracy = helper.cindex(y_test, y_test_pred)

    # 'best_params' is carried along so that the rebuilt `results_df` still has
    # the columns this cell reads, which keeps the cell re-runnable.
    results.append({
        'model': name,
        'best_params': best_params,
        'test_accuracy': test_accuracy
    })

    print(f"{name} - Test Accuracy: {test_accuracy:.4f}")

# Convert results to DataFrame
results_df = pd.DataFrame(results)

    

In [None]:
# Bar chart of each model's test-set score, drawn with the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(results_df['model'], results_df['test_accuracy'])
ax.set_title('Test Accuracy')
ax.set_xlabel('Model')
ax.set_ylabel('C-Index')
# Dashed orange reference line at 0.8669 (presumably a benchmark score -- TODO confirm).
ax.axhline(y=0.8669, color='orange', linestyle='--')
plt.show()
