### Import libraries and load the dataset

In [1]:
import pandas as pd

df = pd.read_csv("C:/Users/User/Downloads/StudentsPerformance.csv")
print("First 5 rows:")
print(df.head())


First 5 rows:
   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  


### Step : 2 Define target and features 

In [2]:
df['math_pass'] = (df['math score'] >= 70).astype(int)
X = df.drop(['math score', 'math_pass'], axis=1)
y = df['math_pass']

print("Target distribution:")
print(y.value_counts())

Target distribution:
math_pass
0    591
1    409
Name: count, dtype: int64


### Step 3 : Preprocessing pipeline

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_columns = X.select_dtypes(include='object').columns
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
])

print("Categorical columns being encoded:")
print(categorical_columns.tolist())

Categorical columns being encoded:
['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']


### Step 4 : Train and test split

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Training size:", X_train.shape)
print("Test size:", X_test.shape)


Training size: (800, 7)
Test size: (200, 7)


### Step 5 : Train and evaluate multiple models

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(probability=True)
}

results = []

for name, model in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]

    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_proba)
    })

results_df = pd.DataFrame(results).sort_values(by='F1 Score', ascending=False)
print("Model Comparison Results:")
print(results_df)


Model Comparison Results:
                    Model  Accuracy  F1 Score   ROC-AUC
0     Logistic Regression     0.685  0.519084  0.714810
1           Random Forest     0.610  0.518519  0.614820
2  Support Vector Machine     0.630  0.493151  0.658743


### Step 6 : Hyperparameter tuning with gridsearchCV

In [6]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_estimators': [50, 100],
    'model__max_depth': [None, 10, 20]
}

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)
print("Best Parameters:", grid.best_params_)
print("Best F1 Score from GridSearchCV:", grid.best_score_)

Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 50}
Best F1 Score from GridSearchCV: 0.49838438181108247
