In [160]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

In [161]:
# Read the raw student-mat.csv file into a DataFrame.
student_data=pd.read_csv("data/student_dataset/student-mat.csv")
# Bare last expression: the frame is rendered as this cell's output.
student_data

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [162]:
# Remove the columns that must not be used as features and turn the final
# grade G3 into the prediction target.
# NOTE(review): G1/G2 are presumably earlier-period grades that would leak the
# target -- confirm against the dataset documentation.
student_data = (
    student_data
    .drop(columns=["G1", "G2", "school", "address"])
    .rename(columns={"G3": "target"})
)

# Binarise the target: 1 when the final grade is at least 10, otherwise 0.
student_data["target"] = (student_data["target"] >= 10).astype("int64")

# Every numeric column (any integer or float width) is treated as numeric;
# everything else is categorical and is encoded later. The previous check
# listed np.int64 twice, so a float column would have been misclassified as
# categorical. "target" is numeric and therefore never in categorical_features.
numeric_features = student_data.select_dtypes(include="number").columns.to_list()
categorical_features = [col for col in student_data.columns if col not in numeric_features]

# Final column layout expected after encoding; the target comes last.
feature_order = [
    'sex', 'age', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'traveltime',
    'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime',
    'goout', 'Dalc', 'Walc', 'health', 'absences', 'guardian_father',
    'guardian_mother', 'guardian_other', 'Fjob_at_home', 'Fjob_health',
    'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course',
    'reason_home', 'reason_other', 'reason_reputation', 'Mjob_at_home',
    'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'target',
]

In [163]:
# Seed passed to both train_test_split and RandomForestClassifier in this cell.
randomstate=123

def encode_features(df, categorical_features):
    """Encode categorical columns as integers without modifying ``df``.

    A column with exactly two distinct values is replaced in place (same
    position) by integer codes 0/1, assigned in the sorted order of its
    categories. Any other column is removed and replaced by one-hot indicator
    columns named ``<feature>_<value>``, which are appended at the right-hand
    side of the frame in the order the features are listed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``categorical_features``.
        It is copied first, so the caller's frame is left untouched.
    categorical_features : iterable of str
        Names of the columns to encode.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The encoded frame and, for each binary feature, the mapping from the
        original value to its integer code.

    Notes
    -----
    ``nunique`` ignores missing values, so a two-valued column that also
    contains NaN takes the binary branch and the integer cast then raises.
    """
    # Work on a copy: the binary branch assigns into the frame, which would
    # otherwise silently rewrite the caller's DataFrame.
    encoded = df.copy()
    binary_mappings = {}
    one_hot_frames = []

    for feature in categorical_features:
        if encoded[feature].nunique() == 2:
            # Binary encoding: sorted categories -> 0 / 1.
            categories = encoded[feature].astype("category").cat.categories
            mapping = {category: code for code, category in enumerate(categories)}
            binary_mappings[feature] = mapping
            encoded[feature] = encoded[feature].map(mapping).astype("int")
        else:
            # One-hot encoding: collect the indicator columns and drop the source.
            one_hot_frames.append(
                pd.get_dummies(encoded[feature], prefix=feature, dtype="int")
            )
            encoded = encoded.drop(columns=feature)

    # A single concat at the end gives the same column order as appending
    # inside the loop, without rebuilding the frame once per feature.
    if one_hot_frames:
        encoded = pd.concat([encoded, *one_hot_frames], axis=1)

    return encoded, binary_mappings

# Encode the categorical columns; keep the value -> code maps of the binary ones.
student_data, binary_mappings = encode_features(student_data, categorical_features)

# Pull the label out and re-append it so it becomes the right-most column.
target_col = student_data.pop("target")
student_data["target"] = target_col

# Pin the column layout (target last), then hold out 20% of the rows for testing.
student_data = student_data.loc[:, feature_order]
student_train, student_test = train_test_split(
    student_data,
    test_size=0.2,
    random_state=randomstate,
)

# Persist both splits, label column included.
student_train.to_parquet("data/student_dataset/train_cleaned.parquet")
student_test.to_parquet("data/student_dataset/test_cleaned.parquet")

# Separate features from the label for each split.
x_train, y_train = student_train.drop(columns="target"), student_train["target"]
x_test, y_test = student_test.drop(columns="target"), student_test["target"]

# Fit a 100-tree random forest with the shared seed.
model = RandomForestClassifier(n_estimators=100, random_state=randomstate).fit(x_train, y_train)

# Score the held-out split.
target_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, target_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 78.48%


In [164]:
# Serialise the fitted classifier to RF.pkl.
with open('data/student_dataset/RF.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [165]:
# Display the feature columns, in the order the model was fitted on.
x_train.columns 

Index(['sex', 'age', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'traveltime',
       'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities',
       'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime',
       'goout', 'Dalc', 'Walc', 'health', 'absences', 'guardian_father',
       'guardian_mother', 'guardian_other', 'Fjob_at_home', 'Fjob_health',
       'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course',
       'reason_home', 'reason_other', 'reason_reputation', 'Mjob_at_home',
       'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher'],
      dtype='object')

In [166]:
# Human-readable description of every model feature, keyed by column name so
# each text is visibly paired with its feature instead of relying on list
# position alone. Keys are listed in the same order as the training columns.
# Binary codes follow the sorted category order used by the encoder
# (e.g. famsize: GT3 -> 0, LE3 -> 1).
feature_desc_by_name = {
    "sex": "The sex of the student as a binary variable (0: female, 1: male)",
    "age": "The age of the student in years",
    "famsize": "Size of the family of the student (0: greater than 3, 1: less than or equal to 3)",
    "Pstatus": "Parents cohabitation status (0: living apart, 1: living together)",
    "Medu": "Mother's education (0: none, 1: primary education (4th grade), 2: 5th to 9th grade, 3: secondary education or 4: higher education)",
    "Fedu": "Father's education (0: none, 1: primary education (4th grade), 2: 5th to 9th grade, 3: secondary education or 4: higher education)",
    "traveltime": "Home to school travel time (1: <15 min, 2: 15 to 30 min., 3: 30 min. to 1 hour, or 4: >1 hour)",
    "studytime": "Weekly study time (1: <2 hours, 2: 2 to 5 hours, 3: 5 to 10 hours, or 4: >10 hours)",
    "failures": "Number of past class failures (from 0 to 3)",
    "schoolsup": "Student receiving extra educational support (0: no, 1:yes)",
    "famsup": "Student receiving family educational support (0: no, 1: yes)",
    "paid": "Student taking extra paid classes within the course subject (0: no, 1:yes)",
    "activities": "Student taking part in extra-curricular activities (0:no, 1:yes)",
    "nursery": "Student has attended nursery school (0:no, 1:yes)",
    "higher": "Student wants to get higher education (0:no, 1:yes)",
    "internet": "Student has internet access at home (0:no, 1:yes)",
    "romantic": "Student is in a romantic relationship (0:no, 1:yes)",
    "famrel": "Quality of family relationships (numeric: from 1 - very bad to 5 - excellent)",
    "freetime": "Free time after school (from 1 - very low to 5 - very high)",
    "goout": "Going out with friends (from 1 - very low to 5 - very high)",
    "Dalc": "Workday alcohol consumption (from 1 - very low to 5 - very high)",
    "Walc": "Weekend alcohol consumption (from 1 - very low to 5 - very high)",
    "health": "Current health status (from 1 - very bad to 5 - very good)",
    "absences": "Number of school absences (actual number of absences)",
    "guardian_father": "One-hot variable for student's guardian -- father is guardian",
    "guardian_mother": "One-hot variable for student's guardian -- mother is guardian",
    "guardian_other": "One-hot variable for student's guardian -- neither mother or father but someone else is guardian",
    "Fjob_at_home": "One-hot variable for father's job -- at home",
    "Fjob_health": "One-hot variable for father's job -- care related",
    "Fjob_other": "One-hot variable for father's job -- other",
    "Fjob_services": "One-hot variable for father's job -- civil services",
    "Fjob_teacher": "One-hot variable for father's job -- teacher",
    "reason_course": "One-hot variable for reason to choose this school -- chosen for course offer",
    "reason_home": "One-hot variable for reason to choose this school -- chosen due to proximity to home",
    "reason_other": "One-hot variable for reason to choose this school -- chosen for some other reason",
    "reason_reputation": "One-hot variable for reason to choose this school -- chosen for school reputation",
    "Mjob_at_home": "One-hot variable for mother's job -- at home",
    "Mjob_health": "One-hot variable for mother's job -- care related",
    "Mjob_other": "One-hot variable for mother's job -- other",
    "Mjob_services": "One-hot variable for mother's job -- civil services",
    "Mjob_teacher": "One-hot variable for mother's job -- teacher",
}

# Positional list of the descriptions, in the key order above.
feature_desc = list(feature_desc_by_name.values())

# Take the names and the averages from the same Series so the two columns
# cannot drift apart (names previously came from x_test, averages from x_train).
feature_means = x_train.mean()

# One row per feature: name, training-set mean, and its description.
# feature_desc is matched by position, so it must follow the column order.
feature_desc_df = pd.DataFrame({
    "feature_name": feature_means.index.to_list(),
    "feature_average": feature_means.to_list(),
    "feature_desc": feature_desc,
})

# Free-text metadata stored alongside the feature table ("Portuguese" spelling corrected).
dataset_description = "The dataset contains information about students from two Portuguese high schools and in particular their family situation and other habits"
target_description = "The target variable represents the final year grade, transformed into whether the student passed (1) or not (0) at the end of the year"
task_description = "Predict whether a student will pass"

# Bundle the free-text descriptions and the per-feature table into one object.
dataset_info = dict(
    dataset_description=dataset_description,
    target_description=target_description,
    task_description=task_description,
    feature_description=feature_desc_df,
)

# Pickle the bundle next to the other dataset artefacts.
with open('data/student_dataset/dataset_info', 'wb') as info_file:
    pickle.dump(dataset_info, info_file)

In [167]:
# Display the per-feature table (name, training mean, description).
feature_desc_df 

Unnamed: 0,feature_name,feature_average,feature_desc
0,sex,0.458861,The sex of the student as a binary variable (0...
1,age,16.756329,The age of the student in years
2,famsize,0.294304,Size of the family of the student (0: greater ...
3,Pstatus,0.892405,"Parents cohabitation status (0: living apart, ..."
4,Medu,2.75,"Mother's education (0: none, 1: primary educat..."
5,Fedu,2.515823,"Father's education (0: none, 1: primary educat..."
6,traveltime,1.458861,"Home to school travel time (1: <15 min, 2: 15 ..."
7,studytime,2.050633,"Weekly study time (1: <2 hours, 2: 2 to 5 hour..."
8,failures,0.360759,Number of past class failures (from 0 to 3)
9,schoolsup,0.139241,Student receiving extra educational support (0...
