Information about the Dataset:


## **1. Data Preprocessing and Analysis**

In [3]:
# Import every library the notebook uses in one place
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [4]:
# Show the kernel's working directory; the relative CSV paths used when
# loading the data are resolved against it.
pwd

'/content'

In [6]:
# Step 1: Load the dataset
# Each CSV is read from the current working directory, in the order listed,
# and bound to the frame of the same name.
students, teachers, parents, scores = map(
    pd.read_csv,
    ['students.csv', 'teachers.csv', 'parents.csv', 'scores.csv'],
)

In [None]:
# Step 2: Filter students and scores datasets for SS3 students only
# Uses the `students` / `scores` frames loaded in Step 1.
# assumes SS3 students are identified by a string StudentID beginning with
# 'SS3' -- TODO confirm against the data.
# na=False treats a missing StudentID as "not SS3" so the mask stays purely
# boolean; .copy() yields independent frames so later cleaning steps do not
# operate on a view of the raw data.
ss3_students_df = students[students['StudentID'].str.startswith('SS3', na=False)].copy()
ss3_scores_df = scores[scores['StudentID'].str.startswith('SS3', na=False)].copy()

In [None]:
# Step 3: Data cleaning
# Check for missing values (null count per column)
print(ss3_students_df.isnull().sum())
print(ss3_scores_df.isnull().sum())

# Handle missing values: fill each numeric column with its own median.
# numeric_only=True is needed because the frames contain non-numeric columns
# (StudentID is a string), and the median of those is undefined. Missing
# values in non-numeric columns are left untouched.
# Reassigning instead of inplace=True keeps the cell safe to re-run and
# avoids mutating a filtered frame in place.
ss3_students_df = ss3_students_df.fillna(ss3_students_df.median(numeric_only=True))
ss3_scores_df = ss3_scores_df.fillna(ss3_scores_df.median(numeric_only=True))

In [None]:
# Some analysis:
# Combine student data with scores data for analysis (inner join on StudentID,
# so only students present in both frames are kept)
merged_df = pd.merge(ss3_students_df, ss3_scores_df, on='StudentID')

# Find correlation between numeric variables.
# numeric_only=True skips string columns such as StudentID, which cannot be
# correlated.
correlation_matrix = merged_df.corr(numeric_only=True)

# Plot heatmap of correlation
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation between numeric student and score columns')
plt.show()

In [None]:
# Feature engineering:
# Row-wise mean of the four subject scores, once for the internal exams and
# once for the mock exams.
averaged_score_columns = {
    'Average_Internal_Score': ['English Language', 'Physics', 'Chemistry', 'Biology'],
    'Average_Mock_Score': ['Mock English Language', 'Mock Physics', 'Mock Chemistry', 'Mock Biology'],
}
for average_name, subject_names in averaged_score_columns.items():
    merged_df[average_name] = merged_df[subject_names].mean(axis=1)

# Attendance feature: the attendance record divided by 100.
# NOTE(review): presumably 'Attendance Record' is on a 0-100 scale, making
# this a 0-1 fraction rather than a percentage -- confirm.
merged_df['Attendance_Percentage'] = merged_df['Attendance Record'].div(100)

## **2. Model Training & Evaluation**

In [None]:
# Tunable constants for this section
PASS_MARK = 200     # minimum 'Internal Exam Total(JAMB)' that counts as a pass
RANDOM_STATE = 42   # shared seed so the split and the forest are reproducible

# Define target (pass/fail based on internal total score): 1 = pass, 0 = fail
merged_df['Pass/Fail'] = np.where(merged_df['Internal Exam Total(JAMB)'] >= PASS_MARK, 1, 0)

# Features and target
# NOTE(review): the forest needs numeric inputs -- confirm that
# 'Extra Curricular Activities' is numeric or encode it first.
# NOTE(review): 'Average_Internal_Score' is built from internal subject scores;
# if those also make up 'Internal Exam Total(JAMB)' this leaks the target --
# confirm.
feature_columns = [
    'Attendance_Percentage',
    'Extra Curricular Activities',
    'Library Hours',
    'Average_Internal_Score',
    'Average_Mock_Score',
]
X = merged_df[feature_columns]
y = merged_df['Pass/Fail']

# Split data into train and test (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train a RandomForest model; seeding it makes every re-run give the same model
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model.
# labels=[0, 1] keeps the matrix 2x2 even if the test fold contains only one
# class, so the Fail/Pass tick labels below always line up.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

print("Accuracy:", accuracy)
# Per-class precision/recall; accuracy alone can hide poor performance on a
# minority class.
print(classification_report(y_test, y_pred, zero_division=0))

# Rows are the actual class, columns the predicted class
class_names = ['Fail', 'Pass']
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt="d",
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion matrix (test set)')
plt.show()

## **3. Model Deployment**

In [None]:
# Save the trained model (joblib is imported with the other libraries at the
# top of the notebook). The path is named once so saving and loading cannot
# drift apart.
MODEL_PATH = 'student_performance_model.pkl'
joblib.dump(model, MODEL_PATH)

# To load the model later.
# joblib files are pickle-based: only load files from a source you trust.
# loaded_model = joblib.load(MODEL_PATH)