In [106]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

MAX_ROWS = 10
pd.set_option('display.max_rows', MAX_ROWS)
pd.set_option('display.max_columns', 200)
 
sns.set_style("whitegrid")
sns.set_context("paper")

plt.rcParams['figure.figsize'] = (20,10)

In [107]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
path_dataset = 'StudentsPerformance.csv'
df = pd.read_csv(path_dataset)

In [108]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [109]:
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])

df.gender.unique()

array([0, 1], dtype=int64)

In [110]:
le = LabelEncoder()
df['race/ethnicity'] = le.fit_transform(df['race/ethnicity'])

df['race/ethnicity'].unique()

array([1, 2, 0, 3, 4], dtype=int64)

In [111]:
le = LabelEncoder()
df['parental level of education'] = le.fit_transform(df['parental level of education'])

df['parental level of education'].unique()

array([1, 4, 3, 0, 2, 5], dtype=int64)

In [112]:
le = LabelEncoder()
df['lunch'] = le.fit_transform(df['lunch'])

df['lunch'].unique()

array([1, 0], dtype=int64)

In [113]:
le = LabelEncoder()
df['test preparation course'] = le.fit_transform(df['test preparation course'])

df['test preparation course'].unique()

array([1, 0], dtype=int64)

In [114]:
#df['meanScore'] =  (df['math score'] + df['reading score'] + df['writing score']) / 3

In [115]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,1,1,1,72,72,74
1,0,2,4,1,0,69,90,88
2,0,1,3,1,1,90,95,93
3,1,0,0,0,1,47,57,44
4,1,2,4,1,1,76,78,75


In [116]:
df.corr()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
gender,1.0,-0.002,0.002,0.021,-0.006,0.168,-0.244,-0.301
race/ethnicity,-0.002,1.0,-0.032,0.047,-0.018,0.216,0.145,0.166
parental level of education,0.002,-0.032,1.0,0.006,-0.024,-0.068,-0.072,-0.084
lunch,0.021,0.047,0.006,1.0,0.017,0.351,0.23,0.246
test preparation course,-0.006,-0.018,-0.024,0.017,1.0,-0.178,-0.242,-0.313
math score,0.168,0.216,-0.068,0.351,-0.178,1.0,0.818,0.803
reading score,-0.244,0.145,-0.072,0.23,-0.242,0.818,1.0,0.955
writing score,-0.301,0.166,-0.084,0.246,-0.313,0.803,0.955,1.0


In [117]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [118]:
x = df.drop(['parental level of education'], axis=1)
y = df['parental level of education']

# Realizá la separación a continuación en esta celda
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.20, stratify=y)

### Decision Tree

In [119]:
regressor = DecisionTreeClassifier(random_state=0, max_depth=12)
regressor.fit(x_test, y_test)
    
y_pred = regressor.predict(x_test)

In [120]:
accuracy_score(y_test, y_pred)

0.825

In [121]:
f1_score(y_test, y_pred, average='macro')

0.8455046081202983

In [122]:
input = [[1,2,1,1,76,78,75]]
pred = regressor.predict(input)
pred

array([4], dtype=int64)

## Random Forest

In [123]:
clf = RandomForestClassifier(n_estimators=10, max_depth=7, random_state=0)
clf.fit(x_test, y_test)

print(clf.feature_importances_)

[0.02965579 0.12310665 0.03922781 0.03594473 0.26672252 0.26368828
 0.24165423]


In [124]:
y_pred = clf.predict(x_test)

In [125]:
accuracy_score(y_test, y_pred)

0.755

In [126]:
f1_score(y_test, y_pred, average='macro')

0.7240433540431175