In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix


In [42]:
data = pd.read_csv('Exam_Score_Prediction.csv')

In [43]:
data.head()

Unnamed: 0,student_id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,1,17,male,diploma,2.78,92.9,yes,7.4,poor,coaching,low,hard,58.9
1,2,23,other,bca,3.37,64.8,yes,4.6,average,online videos,medium,moderate,54.8
2,3,22,male,b.sc,7.88,76.8,yes,8.5,poor,coaching,high,moderate,90.3
3,4,20,other,diploma,0.67,48.4,yes,5.8,average,online videos,low,moderate,29.7
4,5,20,female,diploma,0.89,71.6,yes,9.8,poor,coaching,low,moderate,43.7


In [44]:
data['pass_fail'] = data['exam_score'].apply(lambda x: 'pass' if x>= 70 else 'fail')
data = data.drop(columns=['exam_score'])

In [45]:
data

Unnamed: 0,student_id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,pass_fail
0,1,17,male,diploma,2.78,92.9,yes,7.4,poor,coaching,low,hard,fail
1,2,23,other,bca,3.37,64.8,yes,4.6,average,online videos,medium,moderate,fail
2,3,22,male,b.sc,7.88,76.8,yes,8.5,poor,coaching,high,moderate,pass
3,4,20,other,diploma,0.67,48.4,yes,5.8,average,online videos,low,moderate,fail
4,5,20,female,diploma,0.89,71.6,yes,9.8,poor,coaching,low,moderate,fail
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19997,18,other,bba,6.50,71.3,yes,5.0,good,self-study,low,easy,pass
19996,19998,18,male,b.com,3.71,41.6,no,5.9,average,coaching,medium,moderate,fail
19997,19999,19,other,diploma,7.88,68.2,yes,4.6,poor,group study,low,easy,fail
19998,20000,19,male,bba,4.60,76.3,no,6.1,good,self-study,medium,moderate,pass


In [46]:
X = data.iloc[:, 0:12]
y = data.iloc[:, 12]

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.2)

In [48]:
X_train.shape

(16000, 12)

In [49]:
X_test.shape

(4000, 12)

In [50]:
y_train.shape

(16000,)

In [51]:
y_test.shape

(4000,)

In [69]:
categorical_columns = ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']
numeric_columns = ['student_id', 'age', 'study_hours', 'class_attendance', 'sleep_hours']

In [70]:
preprocess = ColumnTransformer([
    ('cc', OneHotEncoder(drop='first'), categorical_columns),
    ('nc', 'passthrough', numeric_columns)
])

In [71]:
new_X_train = preprocess.fit_transform(X_train)
new_X_test = preprocess.transform(X_test)

In [62]:
dt = DecisionTreeClassifier()

In [27]:
dt.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [72]:
dt.fit(new_X_train, y_train)

In [73]:
y_pred = dt.predict(new_X_test)

In [76]:
print(confusion_matrix(y_test, y_pred))

[[2038  514]
 [ 454  994]]
