In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the raw student exam-performance CSV into `df` and preview the first rows.
# NOTE(review): the path below is an absolute, machine-specific Windows path, so this
# cell only runs on the original author's machine; consider a path relative to a
# configurable data directory.
file = r"C:\Users\AKIN-JOHNSON\Desktop\Student performance data\student_exam_performance_dataset.csv"
df = pd.read_csv(file)
df.head()

Unnamed: 0,student_id,gender,age,class,math_score,english_score,class_attendance,study_hours_per_week,preferred_study_method,parent_education,internet_access,students_jamb_score,parental_income,health_issues,extracurricular_participation
0,1,Male,16,SS3,87,51,80,20,Tutorials,Primary,No,298,Medium,Yes,No
1,2,Male,18,SS3,89,78,86,14,Group Study,Secondary,No,343,Medium,Yes,Yes
2,3,Female,16,SS3,95,91,72,13,Tutorials,Primary,No,317,High,No,No
3,4,Male,18,SS3,57,40,100,6,Tutorials,Tertiary,No,153,Low,No,No
4,5,Male,18,SS3,86,54,73,17,Individual,Tertiary,No,270,High,No,Yes


In [5]:
# Keep an untouched copy of the raw frame: `df` is modified in later cells (a target
# column is added and categorical columns are overwritten with integer codes), while
# `df1` retains the original string categories for later inspection.
df1 = df.copy()

In [7]:
# Derive the binary target column: a JAMB score strictly above 199 is labelled
# 'pass', anything else 'fail'.
scored_above_cutoff = df['students_jamb_score'].gt(199)
df['outcome'] = np.where(scored_above_cutoff, 'pass', 'fail')
df.head()

Unnamed: 0,student_id,gender,age,class,math_score,english_score,class_attendance,study_hours_per_week,preferred_study_method,parent_education,internet_access,students_jamb_score,parental_income,health_issues,extracurricular_participation,outcome
0,1,Male,16,SS3,87,51,80,20,Tutorials,Primary,No,298,Medium,Yes,No,pass
1,2,Male,18,SS3,89,78,86,14,Group Study,Secondary,No,343,Medium,Yes,Yes,pass
2,3,Female,16,SS3,95,91,72,13,Tutorials,Primary,No,317,High,No,No,pass
3,4,Male,18,SS3,57,40,100,6,Tutorials,Tertiary,No,153,Low,No,No,fail
4,5,Male,18,SS3,86,54,73,17,Individual,Tertiary,No,270,High,No,Yes,pass


### Machine Learning

In [9]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [33]:
# Preview the first ten rows of `df`.
# NOTE(review): the recorded output shows integer-coded categorical columns, i.e. this
# cell (execution count 33) was run after the label-encoding cell that appears below it;
# on a fresh top-to-bottom run it would display the original string categories instead.
df.head(10)

Unnamed: 0,student_id,gender,age,class,math_score,english_score,class_attendance,study_hours_per_week,preferred_study_method,parent_education,internet_access,students_jamb_score,parental_income,health_issues,extracurricular_participation,outcome
0,1,Male,16,SS3,87,51,80,20,2,0,0,298,2,1,0,1
1,2,Male,18,SS3,89,78,86,14,0,1,0,343,2,1,1,1
2,3,Female,16,SS3,95,91,72,13,2,0,0,317,0,0,0,1
3,4,Male,18,SS3,57,40,100,6,2,2,0,153,1,0,0,0
4,5,Male,18,SS3,86,54,73,17,1,2,0,270,0,0,1,1
5,6,Male,18,SS3,96,56,75,8,0,0,0,317,0,0,1,1
6,7,Male,16,SS3,47,78,100,10,1,2,0,281,2,1,0,1
7,8,Male,16,SS3,89,91,94,6,0,2,0,344,1,0,0,1
8,9,Female,17,SS3,97,100,87,11,1,0,1,188,1,1,0,0
9,10,Male,16,SS3,41,59,87,13,2,1,0,251,2,0,0,1


In [13]:
# 1. Preprocessing
# Label-encode every categorical column that is kept as a feature, plus the target.
# A separate LabelEncoder is fitted per column and kept in `encoders`, so the
# string -> integer mapping of each column remains available afterwards (e.g. to encode
# raw inputs at prediction time or to inverse_transform predictions). Previously a
# single encoder was refitted for every column, which discarded all mappings except the
# last one. The resulting integer codes are identical to before.
categorical_columns = [
    'preferred_study_method',
    'parent_education',
    'internet_access',
    'extracurricular_participation',
    'parental_income',
    'health_issues',
    'outcome',
]
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Backward compatibility: as before, `le` ends up being the encoder fitted on 'outcome'.
le = encoders['outcome']

# Separate Features (X) and Target (y)
# NOTE(review): 'students_jamb_score' is kept in X although 'outcome' is computed
# directly from it (score > 199). This is target leakage and most likely explains the
# near-perfect accuracies reported below. It is left in place here because the
# hand-written prediction samples later in the notebook pass 11 feature values in this
# exact column order; removing the column requires updating those cells as well.
X = df.drop(columns=['student_id','outcome','age','class','gender'])  # important feature variables
y = df['outcome']  # Target variable

X.head(5)

Unnamed: 0,math_score,english_score,class_attendance,study_hours_per_week,preferred_study_method,parent_education,internet_access,students_jamb_score,parental_income,health_issues,extracurricular_participation
0,87,51,80,20,2,0,0,298,2,1,0


In [35]:
# Preview the first ten rows of the feature matrix to check the encoded values and the
# column order that the models are trained on.
X.head(10)

Unnamed: 0,math_score,english_score,class_attendance,study_hours_per_week,preferred_study_method,parent_education,internet_access,students_jamb_score,parental_income,health_issues,extracurricular_participation
0,87,51,80,20,2,0,0,298,2,1,0
1,89,78,86,14,0,1,0,343,2,1,1
2,95,91,72,13,2,0,0,317,0,0,0
3,57,40,100,6,2,2,0,153,1,0,0
4,86,54,73,17,1,2,0,270,0,0,1
5,96,56,75,8,0,0,0,317,0,0,1
6,47,78,100,10,1,2,0,281,2,1,0
7,89,91,94,6,0,2,0,344,1,0,0
8,97,100,87,11,1,0,1,188,1,1,0
9,41,59,87,13,2,1,0,251,2,0,0


In [9]:
# Feature scaling is intentionally NOT applied: the models in this notebook are trained
# on, and queried with, the raw (unscaled) feature values.
#
# A previously disabled scaling snippet lived here as a bare triple-quoted string, which
# Jupyter evaluated and echoed as cell output. It could not have run as written: it
# selected 'age', which is not a column of X, and assigned seven transformed columns to
# six target columns.
#
# If scaling is wanted (it usually helps LogisticRegression, SVC and K-Neighbors), fit
# the StandardScaler on the training split only -- for example by wrapping each
# estimator in a Pipeline with a StandardScaler step -- so that no test-set statistics
# leak into training.

"# Scale features that are numerical and need scaling\nscaler = StandardScaler()\nX[['math_score', 'english_score', 'class_attendance','study_hours_per_week', 'students_jamb_score', 'parental_income']] = scaler.fit_transform(X[['age', 'math_score', 'english_score', 'class_attendance', \n                                                                                                'study_hours_per_week', 'students_jamb_score', 'parental_income']])"

In [15]:
# 2. Train-test Split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((800, 11), (200, 11))

In [17]:
# 3. Model Initialization
# Candidate classifiers, keyed by display name.
# Every estimator with internal randomness (tree-based models and ensembles) is seeded
# with random_state=42 -- the same seed used for the train/test split -- so that the
# cross-validation and test scores are reproducible from run to run.
models = {
    'Logistic Regression': LogisticRegression(C=100, solver='liblinear'),
    'Support Vector Classifier': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Extra Tree': ExtraTreeClassifier(random_state=42),
    'K-Neighbors': KNeighborsClassifier(metric='manhattan', n_neighbors=9),
    'Naive Bayes': GaussianNB()
}

In [19]:
# 4. Cross-validation and Training
# For every candidate: 4-fold cross-validate on the training split, then refit on the
# whole training split and score once on the held-out test split.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    cv_scores = cross_val_score(model, X_train, y_train, cv=4)
    # fit() returns the fitted estimator, so training and test prediction are chained
    y_test_pred = model.fit(X_train, y_train).predict(X_test)

    # Collect the metrics consumed by the reporting cell
    results[name] = dict(
        accuracy=accuracy_score(y_test, y_test_pred),
        cv_score_mean=cv_scores.mean(),
        classification_report=classification_report(y_test, y_test_pred),
    )

Training Logistic Regression...
Training Support Vector Classifier...
Training Random Forest...
Training Gradient Boosting...
Training AdaBoost...
Training Decision Tree...
Training Extra Tree...
Training K-Neighbors...
Training Naive Bayes...


In [21]:
# 5. Display Results
# One report per model: CV mean accuracy, test accuracy, the full classification
# report, then a 50-dash separator line.
for model_name, result in results.items():
    print(
        f"Model: {model_name}",
        f"Cross-Validation Mean Accuracy: {result['cv_score_mean']}",
        f"Test Accuracy: {result['accuracy']}",
        f"Classification Report:\n{result['classification_report']}",
        '-' * 50,
        sep="\n",
    )

Model: Logistic Regression
Cross-Validation Mean Accuracy: 0.9825
Test Accuracy: 0.985
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        53
           1       0.99      0.99      0.99       147

    accuracy                           0.98       200
   macro avg       0.98      0.98      0.98       200
weighted avg       0.98      0.98      0.98       200

--------------------------------------------------
Model: Support Vector Classifier
Cross-Validation Mean Accuracy: 0.9887499999999999
Test Accuracy: 0.99
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        53
           1       1.00      0.99      0.99       147

    accuracy                           0.99       200
   macro avg       0.98      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200

--------------------------------------------------
Model: 

In [23]:
import pickle

# Persist the trained Logistic Regression and K-Neighbors estimators, one pickle each
pickle_targets = (
    ('Jamb_pred_lr.sav', 'Logistic Regression'),
    ('Jamb_pred_knn.sav', 'K-Neighbors'),
)
for sav_path, model_key in pickle_targets:
    with open(sav_path, 'wb') as sav_file:
        pickle.dump(models[model_key], sav_file)

print("Models saved successfully!")

Models saved successfully!


In [25]:
# Read both pickled estimators back from disk (Logistic Regression first, then
# K-Neighbors) and bind them to the names used by the prediction cells.
restored = {}
for sav_path in ('Jamb_pred_lr.sav', 'Jamb_pred_knn.sav'):
    with open(sav_path, 'rb') as sav_file:
        restored[sav_path] = pickle.load(sav_file)

logistic_model = restored['Jamb_pred_lr.sav']
knn_model = restored['Jamb_pred_knn.sav']

print("Models loaded successfully!")


Models loaded successfully!


In [31]:
# testing the models
# Sanity check on one hand-written sample. The values follow X's column order
# (math_score, english_score, class_attendance, study_hours_per_week,
# preferred_study_method, parent_education, internet_access, students_jamb_score,
# parental_income, health_issues, extracurricular_participation) and are the same as
# row 3 of X, whose encoded target is 0 -- matching the recorded prediction.
logistic_model.predict([[57,40,100,6,2,2,0,153,1,0,0]])

array([0])

In [17]:
# Inspect the encoded target Series (in the recorded data 'fail' is coded 0 and 'pass' 1).
y

0      1
1      1
2      1
3      0
4      1
      ..
995    1
996    1
997    0
998    1
999    1
Name: outcome, Length: 1000, dtype: int32

### Performing hyperparameter tuning

In [19]:
# 1. Define hyperparameter grids for each model
# Keys match the keys of `models`; each value is the parameter grid handed to
# GridSearchCV for that estimator.
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['liblinear', 'lbfgs']
    },
    'Support Vector Classifier': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 10],
        'subsample': [0.8, 1.0]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    },
    'Decision Tree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'Extra Tree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'K-Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        # 'minkowski' was dropped from this list: with the default p=2 (p is not tuned
        # here) it is the same distance as 'euclidean', so it only repeated a third of
        # the K-Neighbors fits without widening the search space.
        'metric': ['euclidean', 'manhattan']
    },
    'Naive Bayes': {
        # Naive Bayes doesn't have many hyperparameters to tune
        'var_smoothing': [1e-09, 1e-08, 1e-07]
    }
}

# 2. Model Initialization
# Fresh, default-configured estimators for the grid search (this replaces the earlier
# `models` dict). Estimators with internal randomness are seeded with random_state=42,
# the same seed used for the train/test split, so that the search results are
# reproducible from run to run.
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Classifier': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Extra Tree': ExtraTreeClassifier(random_state=42),
    'K-Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB()
}

In [20]:
# 3. Perform GridSearchCV for each model
# Exhaustive, accuracy-scored 5-fold search over each model's grid, using all CPU cores.
best_models, best_params = {}, {}
for name, model in models.items():
    print(f"Performing GridSearchCV for {name}...")
    param_grid = param_grids[name]  # grid registered under the same key as the model
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)  # search on the training split only

    # Remember the winning estimator and its parameter combination
    best_models[name], best_params[name] = (
        grid_search.best_estimator_,
        grid_search.best_params_,
    )

    # Report the winner for this model
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best accuracy for {name}: {grid_search.best_score_}")
    print('-' * 50)


Performing GridSearchCV for Logistic Regression...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters for Logistic Regression: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best accuracy for Logistic Regression: 0.9800000000000001
--------------------------------------------------
Performing GridSearchCV for Support Vector Classifier...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for Support Vector Classifier: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy for Support Vector Classifier: 0.9974999999999999
--------------------------------------------------
Performing GridSearchCV for Random Forest...
Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

In [None]:
# Show the best estimator found for each model by the grid search above.
best_models

In [None]:
# Show the winning hyperparameter combination for each model.
best_params

In [None]:
# 4. Evaluate all best models on the test set
# Each tuned estimator is scored once on the held-out split: accuracy plus the
# per-class classification report.
print("Evaluating the best models on the test set...")
for name in best_models:
    model = best_models[name]
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"Test Accuracy for {name}: {accuracy}")
    print(f"Classification Report for {name}:\n{report}")
    print('-' * 50)


In [None]:
# Rebuild the two retained estimators with fixed hyperparameters and train each on the
# training split (fit() returns the fitted estimator).
lr = LogisticRegression(solver='liblinear', C=100).fit(X_train, y_train)
knn = KNeighborsClassifier(n_neighbors=9, metric='manhattan').fit(X_train, y_train)

# Spot-check K-Neighbors on one hand-written sample given in X's column order
sample_features = [[87,51,80,20,2,0,0,280,2,1,0]]
knn.predict(sample_features)

In [None]:
import pickle
# Save the LogisticRegression model
# The handle is named `lr_file` rather than `file` so that it does not overwrite the
# module-level `file` variable holding the CSV path used by the data-loading cell
# (re-running that cell afterwards would otherwise receive a closed file handle).
with open('Jamb_pred_lr.sav', 'wb') as lr_file:
    pickle.dump(lr, lr_file)

# Load the model back to verify that the pickle round-trips
with open('Jamb_pred_lr.sav', 'rb') as lr_file:
    lr_model = pickle.load(lr_file)

In [None]:
# Save the KNN model
# The handle is named `knn_file` rather than `file` so that it does not overwrite the
# module-level `file` variable holding the CSV path used by the data-loading cell.
with open('Jamb_pred_knn.sav', 'wb') as knn_file:
    pickle.dump(knn, knn_file)

# Load the model back to verify that the pickle round-trips
with open('Jamb_pred_knn.sav', 'rb') as knn_file:
    knn_model = pickle.load(knn_file)

In [None]:
# 4. Save Logistic Regression and K-Neighbors models using pickle
# Overwrites the two .sav files with the tuned estimators from the grid search.
# NOTE(review): in the recorded run the grid search was interrupted during the
# Random Forest step, so `best_models` would not yet contain 'K-Neighbors' and the
# second dump would raise KeyError; run the search to completion first.
# The handles are named `lr_file` / `knn_file` rather than `file` so that they do not
# overwrite the module-level `file` variable holding the CSV path.
with open('Jamb_pred_lr.sav', 'wb') as lr_file:
    pickle.dump(best_models['Logistic Regression'], lr_file)

with open('Jamb_pred_knn.sav', 'wb') as knn_file:
    pickle.dump(best_models['K-Neighbors'], knn_file)

print("Models saved successfully.")

In [None]:
# Reload the persisted K-Neighbors model and spot-check it on one hand-written sample
# given in X's column order.
# The handle is named `knn_file` rather than `file` so that it does not overwrite the
# module-level `file` variable holding the CSV path used by the data-loading cell.
with open('Jamb_pred_knn.sav', 'rb') as knn_file:
    knn_model = pickle.load(knn_file)

knn_model.predict([[87,51,80,20,2,0,0,280,2,1,0]])

In [None]:
# List the feature names in the order the models expect them at prediction time.
X.columns

In [None]:
# Distinct integer codes of this feature after label encoding.
X['extracurricular_participation'].unique()

In [None]:
# Distinct original (un-encoded) values of the same column, taken from the raw copy
# `df1`, for comparison with the integer codes in X.
df1['extracurricular_participation'].unique()