In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [67]:
df = pd.read_csv("train_data.csv")
df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,z217,z218,z219,z220,z221,z222,subject,phase,state,output
0,0,0,0,1,-323.106605,2.694366,-1.98752,261.092784,0.013704,0.0001,...,-0.00493,-0.005554,5.246375,-7.534092,3.530736,-0.539045,K,3,C,1
1,0,0,0,1,-376.084691,0.969696,-6.933765,355.311648,0.030292,-0.000153,...,0.022757,0.052506,-3.727741,-2.854443,-0.699268,-0.054074,A,4,C,1
2,0,0,0,0,91.955425,2.621643,-2.581162,51.357206,0.036668,-0.000104,...,-0.086813,-0.101497,-7.510594,19.564182,-17.00813,4.945392,D,3,C,1
3,0,0,0,1,-391.814586,1.866914,-2.510799,382.900317,0.007947,-2.8e-05,...,0.030856,-0.161398,-6.435819,2.174453,-0.153956,-0.003958,G,2,C,0
4,0,0,0,0,-363.823732,2.951346,-3.726368,330.527539,0.010074,-4e-06,...,-0.017226,-0.016454,-2.581403,3.011932,-1.281361,0.192647,C,2,C,1


In [68]:
# find the value counts for the target variable, phase, and state
print(df['output'].value_counts())
print(df['phase'].value_counts())
print(df['state'].value_counts())   

output
1    3873
0     711
Name: count, dtype: int64
phase
4    1291
1    1215
2    1063
3    1015
Name: count, dtype: int64
state
C    2106
B    1502
D     576
A     398
E       2
Name: count, dtype: int64


In [69]:
# check for missing data
for col in df.columns:
    if df[col].isna().any():
        print(col)

print('No Missing Data')

No Missing Data


### Quick Summary About the Data Types

In [70]:
import pandas as pd

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nData Types:\n", df.dtypes.value_counts())

# Identifying categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove the target variable from features lists if it's included
if 'output' in categorical_cols:
    categorical_cols.remove('output')
elif 'output' in numerical_cols:
    numerical_cols.remove('output')

print(f"\nNumber of Numerical Features: {len(numerical_cols)}")
print(f"Number of Categorical Features: {len(categorical_cols)}")
print(f"Categorical Features: {categorical_cols}")
print(f"Numerical Features: {numerical_cols}")


Dataset Shape: (4584, 670)

Data Types:
 float64    474
int64      194
object       2
Name: count, dtype: int64

Number of Numerical Features: 667
Number of Categorical Features: 2
Categorical Features: ['subject', 'state']
Numerical Features: ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40', 'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49', 'x50', 'x51', 'x52', 'x53', 'x54', 'x55', 'x56', 'x57', 'x58', 'x59', 'x60', 'x61', 'x62', 'x63', 'x64', 'x65', 'x66', 'x67', 'x68', 'x69', 'x70', 'x71', 'x72', 'x73', 'x74', 'x75', 'x76', 'x77', 'x78', 'x79', 'x80', 'x81', 'x82', 'x83', 'x84', 'x85', 'x86', 'x87', 'x88', 'x89', 'x90', 'x91', 'x92', 'x93', 'x94', 'x95', 'x96', 'x97', 'x98', 'x99', 'x100', 'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x107', 'x108',

In [71]:
# Checking for missing values
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_percentage = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})
print("Missing Values in Each Column:\n", missing_df)


Missing Values in Each Column:
 Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [72]:
# # Convert categorical columns to numeric 
# df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True).astype(int)

# # add the target variable
# df_encoded['output'] = df['output']

# # Verify encoding
# df_encoded.head()


In [73]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# One-Hot Encoding for low cardinality columns using ColumnTransformer
# We'll drop the first category to avoid multicollinearity
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), categorical_cols)
    ],
    remainder='passthrough'  # Keep the other columns as is
)

# Apply transformations
df_encoded = preprocessor.fit_transform(df)

# Get new feature names after One-Hot Encoding
onehot_features = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_cols)
remaining_features = [col for col in df.columns if col not in categorical_cols]
new_feature_names = list(onehot_features) + remaining_features

# Convert to DataFrame
df_encoded = pd.DataFrame(df_encoded, columns=new_feature_names)

print(f"\nEncoded DataFrame Shape: {df_encoded.shape}")



Encoded DataFrame Shape: (4584, 682)


In [74]:
df_encoded.head()

Unnamed: 0,subject_B,subject_C,subject_D,subject_F,subject_G,subject_H,subject_I,subject_K,subject_L,subject_M,...,z215,z216,z217,z218,z219,z220,z221,z222,phase,output
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.021831,-0.006764,-0.00493,-0.005554,5.246375,-7.534092,3.530736,-0.539045,3.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025202,0.034347,0.022757,0.052506,-3.727741,-2.854443,-0.699268,-0.054074,4.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.116249,-0.10126,-0.086813,-0.101497,-7.510594,19.564182,-17.00813,4.945392,3.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.005236,0.018056,0.030856,-0.161398,-6.435819,2.174453,-0.153956,-0.003958,2.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.010804,-0.016153,-0.017226,-0.016454,-2.581403,3.011932,-1.281361,0.192647,2.0,1.0


In [76]:
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Fit and transform the numerical features
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

print("\nNumerical Features Scaled:")

df_encoded_std = df_encoded.copy()
df_encoded_std.head()



Numerical Features Scaled:


Unnamed: 0,subject_B,subject_C,subject_D,subject_F,subject_G,subject_H,subject_I,subject_K,subject_L,subject_M,...,z215,z216,z217,z218,z219,z220,z221,z222,phase,output
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.236024,-0.121422,-0.080378,-0.076299,0.259165,-0.174105,0.093737,-0.134397,0.414512,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.269934,0.464628,0.27262,0.560323,0.222343,-0.166277,0.086677,-0.13217,1.277421,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.153199,-1.468509,-1.124348,-1.128294,0.206821,-0.128778,0.059456,-0.109218,0.414512,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.069058,0.232393,0.375878,-1.7851,0.211231,-0.157866,0.087587,-0.13194,-0.448396,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.092322,-0.255271,-0.237144,-0.195813,0.227046,-0.156465,0.085705,-0.131038,-0.448396,1.0


## Split the Data

In [79]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Separate features and target variable
X = df_encoded.drop('output', axis=1)
y = df_encoded['output']

# Initialize StratifiedKFold for 5-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# print number of samples in each fold
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i+1}: {len(train_index)} training samples, {len(test_index)} testing samples")

Fold 1: 3667 training samples, 917 testing samples
Fold 2: 3667 training samples, 917 testing samples
Fold 3: 3667 training samples, 917 testing samples
Fold 4: 3667 training samples, 917 testing samples
Fold 5: 3668 training samples, 916 testing samples


### Baseline Logistic Regression

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
# silence the warning
import warnings
warnings.filterwarnings('ignore')

# Initialize Logistic Regression
log_reg = LogisticRegression(
    max_iter=1000, 
    random_state=42, 
    n_jobs=-1,
    solver='liblinear'
)

# Define scoring metric
scoring = make_scorer(accuracy_score)

# Perform 5-Fold Cross-Validation
logreg_cv_scores = cross_val_score(log_reg, X, y, cv=skf, scoring=scoring)

# Display Cross-Validation Scores
print(f"Logistic Regression CV Accuracy Scores: {logreg_cv_scores}")
print(f"Mean CV Accuracy: {logreg_cv_scores.mean():.4f}")
print(f"Standard Deviation: {logreg_cv_scores.std():.4f}")

Logistic Regression CV Accuracy Scores: [0.84514722 0.81897492 0.84950927 0.83424209 0.85371179]
Mean CV Accuracy: 0.8403
Standard Deviation: 0.0125


## Simple Random Forest

In [85]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Perform 5-Fold Cross-Validation
rf_cv_scores = cross_val_score(rf, X, y, cv=skf, scoring=scoring, n_jobs=-1)

# Display Cross-Validation Scores
print(f"Random Forest CV Accuracy Scores: {rf_cv_scores}")
print(f"Mean CV Accuracy: {rf_cv_scores.mean():.4f}")
print(f"Standard Deviation: {rf_cv_scores.std():.4f}")


Random Forest CV Accuracy Scores: [0.85932388 0.85278081 0.86041439 0.85387132 0.85371179]
Mean CV Accuracy: 0.8560
Standard Deviation: 0.0032


## Support Vector Machine (SVM) with 5-Fold Cross-Validation

In [87]:
from sklearn.svm import SVC

# Initialize SVM with RBF kernel
svm = SVC(kernel='rbf', random_state=42, probability=True)

# Perform 5-Fold Cross-Validation
svm_cv_scores = cross_val_score(svm, X, y, cv=skf, scoring=scoring, n_jobs=-1)

# Display Cross-Validation Scores
print(f"SVM CV Accuracy Scores: {svm_cv_scores}")
print(f"Mean CV Accuracy: {svm_cv_scores.mean():.4f}")
print(f"Standard Deviation: {svm_cv_scores.std():.4f}")


SVM CV Accuracy Scores: [0.84841876 0.84841876 0.84732824 0.84732824 0.84934498]
Mean CV Accuracy: 0.8482
Standard Deviation: 0.0008


### Gradient Boosting Classifier with 5-Fold Cross-Validation

In [88]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state=42)

# Perform 5-Fold Cross-Validation
gbc_cv_scores = cross_val_score(gbc, X, y, cv=skf, scoring=scoring, n_jobs=-1)

# Display Cross-Validation Scores
print(f"Gradient Boosting CV Accuracy Scores: {gbc_cv_scores}")
print(f"Mean CV Accuracy: {gbc_cv_scores.mean():.4f}")
print(f"Standard Deviation: {gbc_cv_scores.std():.4f}")


  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


Gradient Boosting CV Accuracy Scores: [0.85605234 0.85496183 0.87131952 0.86041439 0.86026201]
Mean CV Accuracy: 0.8606
Standard Deviation: 0.0058


### XGBoost Classifier with 5-Fold Cross-Validation

In [91]:
import xgboost as xgb

# silence the warning
import warnings
warnings.filterwarnings('ignore')

# Initialize XGBoost Classifier
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

# Perform 5-Fold Cross-Validation
xgb_cv_scores = cross_val_score(xgb_clf, X, y, cv=skf, scoring=scoring, n_jobs=-1)

# Display Cross-Validation Scores
print(f"XGBoost CV Accuracy Scores: {xgb_cv_scores}")
print(f"Mean CV Accuracy: {xgb_cv_scores.mean():.4f}")
print(f"Standard Deviation: {xgb_cv_scores.std():.4f}")


XGBoost CV Accuracy Scores: [0.86259542 0.84841876 0.87241003 0.85496183 0.86572052]
Mean CV Accuracy: 0.8608
Standard Deviation: 0.0084


### Hyperparameter Tuning with GridSearchCV for XGBoost

In [94]:
from sklearn.model_selection import GridSearchCV

# silence the warning
import warnings
warnings.filterwarnings('ignore')

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# Initialize XGBoost Classifier
xgb_tuned = xgb.XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1)

# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(
    estimator=xgb_tuned,
    param_grid=param_grid_xgb,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Perform Grid Search
grid_search_xgb.fit(X, y)

# Best Parameters and Best Score
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_

print(f"Best Parameters for XGBoost: {best_params_xgb}")
print(f"Best Cross-Validation Accuracy: {best_score_xgb:.4f}")


Fitting 5 folds for each of 768 candidates, totalling 3840 fits


  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0; total time=   1.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0; total time=   1.4s
[CV] END

KeyboardInterrupt: 

In [96]:
t = pd.read_csv("test_data.csv")
t['phase'].value_counts()

phase
4    559
1    488
3    358
2    327
Name: count, dtype: int64