In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load data
data = pd.read_csv('student_performance_dataset.csv')

# Step 1: Remove duplicates (drop_duplicates keeps the first row per Student_ID)
data = data.drop_duplicates(subset='Student_ID')

# Step 2: Drop columns that must not be used as features.
# NOTE(review): Student_ID looks like a pure row identifier, and Final_Exam_Score
# is presumably what Pass_Fail is derived from (keeping it would leak the target)
# -- confirm against the dataset description.
data = data.drop(['Student_ID', 'Final_Exam_Score'], axis=1)

# Define features (X) and target (y)
X = data.drop('Pass_Fail', axis=1)
y = data['Pass_Fail']

# Split the RAW features before fitting any preprocessing, so the encoder never
# sees the held-out rows. test_size and random_state are unchanged from the
# original cell, which split the encoded matrix with the same row count.
# NOTE(review): the recorded report shows a 75/25 class mix in the test set;
# consider stratify=y. Not applied here so the split stays comparable with the
# recorded output.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Encode categorical variables
categorical_features = ['Gender', 'Parental_Education_Level', 'Internet_Access_at_Home', 'Extracurricular_Activities']
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': the encoder is now fitted on training rows only,
        # so a category that appears only in the test rows must not raise at
        # transform time (it is encoded as all zeros instead).
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ], remainder='passthrough')

# Fit on the training rows only, then apply the fitted transformer to the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept so any later cell that still references X_processed keeps working; it is
# the full feature matrix encoded with the train-fitted preprocessor.
X_processed = preprocessor.transform(X)

# Initialize and train Gradient Boosting model
gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42  # fixed seed so repeated runs give the same model
)
gb_clf.fit(X_train, y_train)

# Predict and evaluate on the held-out test rows
y_pred = gb_clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.80

Classification Report:
               precision    recall  f1-score   support

        Fail       0.88      0.85      0.86        75
        Pass       0.59      0.64      0.62        25

    accuracy                           0.80       100
   macro avg       0.73      0.75      0.74       100
weighted avg       0.81      0.80      0.80       100

