In [1]:
import pandas as pd
import numpy as np
np.random.seed(17313) # fix a random seed to ensure reproducible results

df = pd.read_csv('data/student-mat.csv', sep=';')

The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Becasue we are trying to find quality students. In this model we define a quality student as one who achieves a final grade of 15 or higher. 

In [2]:
include = ['health', 'absences','age','G2','G3']
df.drop(columns=df.columns.difference(include), inplace=True)
df['qual_student'] = np.where(df['G3']>=15, 1, 0)
df.drop(columns='G3', inplace=True) # drop the G3 score

In [3]:
from sklearn.ensemble import RandomForestClassifier as rf
import sklearn
from sklearn.model_selection import train_test_split
dependent_variable = 'qual_student'
x = df[df.columns.difference([dependent_variable])]
y = df[dependent_variable]
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2)
clf = rf(n_estimators = 1000)
clf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=1000)

In [4]:
pred = clf.predict(x_test)
print(f"Improved model F1 score on the test set: {sklearn.metrics.f1_score(y_test, pred, average='binary')}")
print(f"Improved model accuracy on the test set: {sklearn.metrics.accuracy_score(y_test, pred)}")
print(f"Improved model log loss on the test set: {sklearn.metrics.log_loss(y_test, pred)}")

Improved model F1 score on the test set: 0.9285714285714286
Improved model accuracy on the test set: 0.9746835443037974
Improved model log loss on the test set: 0.874409523889261


We get a much better f1 score!

In [5]:
import joblib
joblib.dump(clf, 'dockerfile/apps/improved_model.pkl')

['dockerfile/apps/improved_model.pkl']

In [6]:
query_df = pd.DataFrame({ 'age' : pd.Series(20) ,'health' : pd.Series(12) ,'absences' : pd.Series(10),'G2' : pd.Series(11)})
pred = clf.predict(query_df)
pred

Feature names must be in the same order as they were in fit.



array([1])