In [60]:
# import the libraries
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer

import warnings
warnings.filterwarnings('ignore')

In [66]:
# Load the CEE dataset from the local data directory.
# index_col=False tells pandas not to use the first column as the index.
df = pd.read_csv(
    filepath_or_buffer="./data/CEE_DATA.csv",
    index_col=False,
)

In [67]:
# Predictor columns, selected by name from the loaded frame.
feature_columns = [
    'Gender', 'Caste', 'coaching', 'time',
    'Class_ten_education', 'twelve_education', 'medium',
    'Class_X_Percentage', 'Class_XII_Percentage',
    'Father_occupation', 'Mother_occupation',
]
X = df[feature_columns]

# Target column; the list selector keeps Y as a one-column DataFrame.
target_columns = ['Performance']
Y = df[target_columns]

In [68]:
# Build a descriptive profiling report over the full dataset (features and
# target) and export it as a standalone HTML file in the working directory.
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
)
profile.to_file(output_file="profiling_report.html")

Summarize dataset: 100%|██████████| 25/25 [00:05<00:00,  4.30it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.38s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.13it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 207.14it/s]


## Train a model with one-hot encoding and ordinal encoding

In [69]:
# Ordinal encoder for the percentage bands. The order of the category list
# defines the integer codes: 'Average' -> 0, 'Good' -> 1, 'Vg' -> 2,
# 'Excellent' -> 3.
# NOTE(review): the labels contain literal single quotes; presumably that is
# how the values are stored in the CSV -- confirm against the data.
ode1 = OrdinalEncoder(categories=[["'Average'","'Good'","'Vg'", "'Excellent'"]])

# Ordinal encoder for a two-level column: 'female' -> 0, 'male' -> 1.
ode2 = OrdinalEncoder(categories=[["'female'","'male'"]])

# Dense (non-sparse) one-hot encoder.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so prefer the new name and fall back to the old one only on
# releases that do not know it yet (unknown keyword -> TypeError).
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

In [80]:
# Columns that are one-hot encoded.
nominal_columns = [
    'Caste', 'coaching', 'time', 'Class_ten_education',
    'twelve_education', 'medium', 'Father_occupation', 'Mother_occupation',
]

# Output columns follow the order of the entries below: the one-hot block
# first, then the two percentage bands, then Gender.
# ode1 appears once per percentage column because it declares a single
# category list, i.e. it encodes exactly one column at a time.
column_transform = make_column_transformer(
    (ohe, nominal_columns),
    (ode1, ['Class_X_Percentage']),
    (ode1, ['Class_XII_Percentage']),
    (ode2, ['Gender']),
)

# Fit the encoders on X and produce the encoded feature matrix.
X_transformed = column_transform.fit_transform(X)

array([[1., 0., 0., ..., 3., 3., 1.],
       [0., 1., 0., ..., 3., 3., 1.],
       [0., 1., 0., ..., 3., 3., 1.],
       ...,
       [0., 0., 0., ..., 1., 2., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 2., 1., 0.]])

## Train a model with one-hot encoding (get_dummies)

In [5]:
# One-hot encode the categorical predictors with pandas.
# NOTE: this rebinds X to the encoded frame, so any cell that expects the raw
# column names in X (e.g. the column-transformer cell) must run before this one.
X = pd.get_dummies(X)

# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
print("Train Size Instances: ", X_train.shape[0])
print("Test Size Instances:", X_test.shape[0])

# Shallow entropy-based tree. random_state is fixed because scikit-learn
# permutes the candidate features at every split, so an unseeded tree can
# differ from run to run when several splits tie.
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1)
clf.fit(X_train, Y_train)

# Score the fitted tree on the held-out rows.
Y_pred = clf.predict(X_test)
print("Accuracy: ", metrics.accuracy_score(Y_test, Y_pred))

In [None]:
# Draw the fitted decision tree.
fig, ax = plt.subplots(figsize=(20, 12))
tree.plot_tree(
    clf,
    ax=ax,
    fontsize=10,
    feature_names=list(X.columns),
    # plot_tree expects class names in ascending class order, which is the
    # order of clf.classes_. Building the list from a set() gives an arbitrary
    # order and can attach the wrong label to a leaf.
    class_names=[str(label) for label in clf.classes_],
)
plt.show()