In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Data Set 2: Student Performance: Portuguese Course

Attributes for student-por.csv (Portuguese language course) datasets:

1 school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)

2 sex - student's sex (binary: "F" - female or "M" - male)

3 age - student's age (numeric: from 15 to 22)

4 address - student's home address type (binary: "U" - urban or "R" - rural)

5 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)

6 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)

7 Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)

8 Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)

9 Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")

10 Fjob - father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")

11 reason - reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")

12  guardian - student's guardian (nominal: "mother", "father" or "other")

13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)

14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)

15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)

16 schoolsup - extra educational support (binary: yes or no)

17 famsup - family educational support (binary: yes or no)

18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)

19 activities - extra-curricular activities (binary: yes or no)

20 nursery - attended nursery school (binary: yes or no)

21 higher - wants to take higher education (binary: yes or no)

22 internet - Internet access at home (binary: yes or no)

23 romantic - with a romantic relationship (binary: yes or no)

24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)

25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)

26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)

27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)

28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)

29 health - current health status (numeric: from 1 - very bad to 5 - very good)

30 absences - number of school absences (numeric: from 0 to 93)

31 G1 - first period grade (numeric: from 0 to 20)

31 G2 - second period grade (numeric: from 0 to 20)

32 G3 - final grade (numeric: from 0 to 20, output target)

# Task
Our task is to find out the final grades of students by the features above.

In [3]:
ds2 = pd.read_csv('student-por.csv', sep = ";")
ds2

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,...,5,4,2,1,2,5,4,10,11,10
645,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
646,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
647,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


In [4]:
ds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    object
 20  higher    

In [5]:
str_cols = ds2.select_dtypes(['object']).columns
str_cols

Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
       'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities',
       'nursery', 'higher', 'internet', 'romantic'],
      dtype='object')

### Binary Transformation

Binary Transformation --> Yes - 1 & No - 0

In [6]:
YN_list = ['schoolsup', 'famsup', 'paid', 'activities',
       'nursery', 'higher', 'internet', 'romantic']
for cols in YN_list:
    ds2[cols] = pd.Series(np.where(ds2[cols] == 'yes', 1, 0), ds2.index)

In [7]:
ds2[YN_list].head()

Unnamed: 0,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,1,0,0,0,1,1,0,0
1,0,1,0,0,0,1,1,0
2,1,0,0,0,1,1,1,0
3,0,1,0,1,1,1,1,1
4,0,1,0,0,1,1,0,0


In [8]:
ds2['school'] = pd.Series(np.where(ds2['school'] == 'GP', 1, 0), ds2.index)
ds2['address'] = pd.Series(np.where(ds2['address'] == 'U', 1, 0), ds2.index)
ds2['famsize'] = pd.Series(np.where(ds2['famsize'] == 'GE3', 1, 0), ds2.index)
ds2['Pstatus'] = pd.Series(np.where(ds2['Pstatus'] == 'T', 1, 0), ds2.index)

In [9]:
ds2['sex'] = pd.Series(np.where(ds2['sex'] == 'F', 1, 0), ds2.index)

In [10]:
ds2[['school','sex','address','famsize','Pstatus']].head()

Unnamed: 0,school,sex,address,famsize,Pstatus
0,1,1,1,0,0
1,1,1,1,0,1
2,1,1,1,0,1
3,1,1,1,0,1
4,1,1,1,0,1


### Dummy Variable Transformation

In [11]:
#Creating a new dataframe to concat new numerical columns on. 
categoricColumns = ['Mjob', 'Fjob','reason', 'guardian']
dummy_ds2 = pd.DataFrame()
#By using a loop concating all columns in a df
for var in categoricColumns:
    dummy_ds2 = pd.concat([dummy_ds2, pd.get_dummies(ds2[var], prefix=var)], axis=1)
dummy_ds2.head()

Unnamed: 0,Mjob_at_home,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other
0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0
2,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
3,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0
4,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0


In [12]:
all_cols = ds2.columns
all_cols

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [13]:
bin_list = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
            'traveltime', 'studytime','failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
           'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
           'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

In [14]:
new_ds2 = pd.concat([dummy_ds2, ds2[bin_list]], axis=1)
new_ds2

Unnamed: 0,Mjob_at_home,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,1,0,0,0,0,0,0,0,0,1,...,4,3,4,1,1,3,4,0,11,11
1,1,0,0,0,0,0,0,1,0,0,...,5,3,3,1,1,3,2,9,11,11
2,1,0,0,0,0,0,0,1,0,0,...,4,3,2,2,3,3,6,12,13,12
3,0,1,0,0,0,0,0,0,1,0,...,3,2,2,1,1,5,0,14,14,14
4,0,0,1,0,0,0,0,1,0,0,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,0,0,0,1,0,0,0,1,0,0,...,5,4,2,1,2,5,4,10,11,10
645,0,0,0,0,1,0,0,0,1,0,...,4,3,4,1,1,1,4,15,15,16
646,0,0,1,0,0,0,0,1,0,0,...,1,1,1,1,1,5,6,11,12,9
647,0,0,0,1,0,0,0,0,1,0,...,2,4,5,3,4,2,6,10,10,10


## Train and Test Data

In [15]:
X_train, X_test, y_train, y_test = train_test_split(new_ds2.drop('G3', axis=1), new_ds2['G3'], test_size=0.35, random_state=42)

## Selected Parameters

In [48]:
param_grid_sgb = dict(max_depth = [2,3,5,7,10],
                  learning_rate = [0.1, 0.2, 0.5, 0.7],
                  n_estimators = [10,30,50,100,200])
param_grid_tree = dict(ccp_alpha = [0.001, 0.002, 0.003, 0.004], 
                       min_samples_leaf = [2,3,5,7,10])
param_grid_rf = dict(n_estimators = [500],
                     min_samples_leaf = [5],
                     max_features = [5,10,15,20,25,30])

In [17]:
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)

# Classifiers and Regressors

#### Lasso

In [18]:
#Lasso
lasso_reg = LassoCV(cv = kfold)
lasso_reg = lasso_reg.fit(np.array(X_train), np.array(y_train))
lasso_reg.alpha_

0.20390804641051083

#### Decision Tree

In [20]:
#tree_clf = DecisionTreeClassifier()
#grid_search_tree_clf = GridSearchCV(tree_clf, param_grid_tree, cv = kfold )
#results_tree_clf = grid_search_tree_clf.fit(np.array(X_train), np.array(y_train))

tree_reg = DecisionTreeRegressor()
grid_search_tree_reg = GridSearchCV(tree_reg, param_grid_tree, cv = kfold)
results_tree_reg = grid_search_tree_reg.fit(np.array(X_train), np.array(y_train))


#### Random Forest

In [49]:
#rf_clf = RandomForestClassifier()
#grid_search_rf_clf = GridSearchCV(rf_clf, param_grid_rf, cv = kfold )
#results_rf_clf = grid_search_rf_clf.fit(np.array(X_train), np.array((y_train)))

rf_reg = RandomForestRegressor()
grid_search_rf_reg = GridSearchCV(rf_reg, param_grid_rf, cv = kfold)
results_rf_reg = grid_search_rf_reg.fit(np.array(X_train), np.array(y_train))


#### Stochastic Gradient Boosting

In [22]:
#sgb_classifier = xgboost.XGBClassifier(min_child_weight=10, verbosity = 0)
#grid_search_clf = GridSearchCV(sgb_classifier, param_grid_sgb, cv = kfold)
#results_sgb_clf = grid_search_clf.fit(np.array(X_train), np.array(y_train))

sgb_regressor = xgboost.XGBRegressor(min_child_weight=10)
grid_search = GridSearchCV(sgb_regressor, param_grid_sgb, cv = kfold )
results_sgb_reg = grid_search.fit(np.array(X_train), np.array(y_train))

### Best Parameters

In [50]:
#Classification
print("Best parameters of \n")
#print("Alpha Value of Lasso: \n{}\n".format(lasso_reg.alpha_))
#print("Best parameters of Decision Tree: \n{}\n".format(results_tree_clf.best_params_))
#print("Best parameters of Random Tree: \n{}\n".format(results_rf_clf.best_params_))
#print("Best parameters of Stochastic Gradient Boosting: \n{}\n".format(results_sgb_clf.best_params_))

#Regression
print("Alpha Value of Lasso: \n{}\n".format(lasso_reg.alpha_))
print("Best parameters of Decision Tree: \n{}\n".format(results_tree_reg.best_params_))
print("Best parameters of Random Tree: \n{}\n".format(results_rf_reg.best_params_))
print("Best parameters of Stochastic Gradient Boosting: \n{}\n".format(results_sgb_reg.best_params_))

Best parameters of 

Alpha Value of Lasso: 
0.20390804641051083

Best parameters of Decision Tree: 
{'ccp_alpha': 0.002, 'min_samples_leaf': 7}

Best parameters of Random Tree: 
{'max_features': 30, 'min_samples_leaf': 5, 'n_estimators': 500}

Best parameters of Stochastic Gradient Boosting: 
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 50}



### Mean Absolute Errors, R2 Score and Best Score

In [25]:
from sklearn.metrics import mean_absolute_error

In [51]:
print("Lasso: Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, lasso_reg.predict(np.array(X_test)))))
print("DT: Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, results_tree_reg.best_estimator_.predict(np.array(X_test)))))
print("RF: Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, results_rf_reg.best_estimator_.predict(np.array(X_test)))))
print("SGB: Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, results_sgb_reg.best_estimator_.predict(np.array(X_test)))))


Lasso: Mean absolute error of the prediction is: 0.72767090793991
DT: Mean absolute error of the prediction is: 0.8166004317320106
RF: Mean absolute error of the prediction is: 0.7503522937273416
SGB: Mean absolute error of the prediction is: 0.7771657407283783


In [53]:
print("Lasso")
print("R2 Score: {}\n".format(r2_score(lasso_reg.predict(np.array(X_test)), y_test)))
#print("Best Score: {} \n".format(lasso_reg.best_score_))

print("Decision Tree")
print("R2 Score: {}".format(r2_score(results_tree_reg.best_estimator_.predict(np.array(X_test)), y_test)))
print("Best Score: {} \n".format(results_tree_reg.best_score_))

print("Random Forest")
print("R2 Score: {}".format(r2_score(results_rf_reg.best_estimator_.predict(np.array(X_test)), y_test)))
print("Best Score: {} \n".format(results_rf_reg.best_score_))

print("Stochastic Gradient Boosting")
print("R2 Score: {}".format(r2_score(results_sgb_reg.best_estimator_.predict(np.array(X_test)), y_test)))
print("Best Score: {} \n".format(results_sgb_reg.best_score_))

Lasso
R2 Score: 0.8742293821073173

Decision Tree
R2 Score: 0.827589278357343
Best Score: 0.7959080296350035 

Random Forest
R2 Score: 0.8411498013498049
Best Score: 0.8273011496719818 

Stochastic Gradient Boosting
R2 Score: 0.8286900864995175
Best Score: 0.8171186356063839 



# Comment Section

In the regression data set, to predict the grade of the student, I chose Mean Absolute Error as the performance metric of the model. Since the all of grades are between 0 and 20, the error is very small. However, Lasso is the best regressor with 0.72 value for this particular dataset.

There is not underfitting or overfitting in all of the models as there is not any R2 scores less than %75 and greater than %95. Lasso is the best regressor which explains the data better than others.

Second best model is the Random forest compared to Decision Tree and SGB. However, all of them can be tuned by the parameter to give better results.