In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MaxAbsScaler

In [None]:
# Load the raw dataset from a CSV file located in the working directory.
df = pd.read_csv("dataset.csv")
# Show the first rows as the cell output for a quick look at the columns.
df.head()

Unnamed: 0,es/nes,ci,course,sem_type,class_size,score
0,1,23,3,1,19,3
1,2,15,3,1,17,3
2,1,23,3,2,49,3
3,1,5,2,2,33,3
4,2,7,11,2,55,3


In [None]:
# Impute missing values in place on `df`.
#
# The discrete columns (including the `score` target) are filled with their
# most frequent value; `mode()` returns a Series, so `[0]` picks the first mode.
for col in ['es/nes', 'ci', 'course', 'sem_type', 'score']:
    df[col] = df[col].fillna(df[col].mode()[0])

# `class_size` is filled with its mean. The mean must be *called*: passing the
# bound method `df['class_size'].mean` (no parentheses) hands `fillna` a method
# object rather than a number.
df['class_size'] = df['class_size'].fillna(df['class_size'].mean())

In [None]:
# Separate the predictors from the target: X is every column except `score`,
# Y is the `score` column itself. `df` is left unmodified.
X = df.drop(columns=['score'])
Y = df['score']
# Display the feature frame as the cell output.
X

Unnamed: 0,es/nes,ci,course,sem_type,class_size
0,1,23,3,1,19
1,2,15,3,1,17
2,1,23,3,2,49
3,1,5,2,2,33
4,2,7,11,2,55
...,...,...,...,...,...
146,2,3,2,2,26
147,2,10,3,2,12
148,1,18,7,2,48
149,2,22,1,2,51


In [None]:
# One-hot encode the categorical predictors. `drop_first=True` omits the first
# level of each column so the indicator columns are not redundant.
course_dummies, ci_dummies, speaker_dummies = (
    pd.get_dummies(X[name], prefix=name, drop_first=True)
    for name in ('course', 'ci', 'es/nes')
)

# Swap the raw categorical columns for their indicator columns, keeping the
# remaining features first, followed by course, ci and es/nes indicators.
X = pd.concat(
    [X.drop(columns=['course', 'es/nes', 'ci']), course_dummies, ci_dummies, speaker_dummies],
    axis=1,
)

# Rescale class_size with MaxAbsScaler (division by the column's largest
# absolute value).
# NOTE(review): the scaler is fit on every row of `df`, i.e. before the
# train/test split performed in a later cell.
scaler = MaxAbsScaler()
X['class_size'] = scaler.fit_transform(df[['class_size']])
# Display the transformed feature frame as the cell output.
X

Unnamed: 0,sem_type,class_size,course_2,course_3,course_4,course_5,course_6,course_7,course_8,course_9,...,ci_17,ci_18,ci_19,ci_20,ci_21,ci_22,ci_23,ci_24,ci_25,es/nes_2
0,1,0.287879,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0.257576,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.742424,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2,0.500000,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0.833333,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,2,0.393939,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
147,2,0.181818,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
148,2,0.727273,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
149,2,0.772727,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [None]:
# Hold out 10% of the rows as a test set; random_state=42 fixes the shuffle so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [None]:
# Hyperparameter search space for the random forest. RandomizedSearchCV draws
# n_iter candidate combinations from these lists.
param_dist = [{
    'n_estimators': [20, 50, 100, 200, 300, 400, 500],
    'criterion': ['entropy', 'gini'],
    'max_depth': [1, 2, 5, 10, 50, 100, 200, 300],
    'max_features': ['log2', 'sqrt', None]
}]

# Base estimator. The four hyperparameters given here are also keys of
# `param_dist`, so each sampled candidate overrides them during the search.
# random_state is not searched over; it is fixed so the bootstrap sampling and
# feature selection inside the forest are repeatable.
rf = RandomForestClassifier(criterion='entropy', max_depth=200, max_features=None,
                            n_estimators=20, random_state=42)

# 5-fold cross-validated random search over 40 sampled candidates.
# random_state seeds which candidates are drawn; together with the seeded
# forest and the seeded train/test split, a fresh "Restart & Run All" now
# selects the same model and reports the same metrics.
clf = RandomizedSearchCV(rf, param_distributions=param_dist, cv=5, n_iter=40,
                         random_state=42)

# fit() returns the fitted search object itself, so `best_clf` refers to the
# same object as `clf`.
best_clf = clf.fit(x_train, y_train)

In [None]:
# Display the best estimator selected by the randomized search.
best_clf.best_estimator_

In [None]:
# Predict labels for the held-out test features using the fitted search object.
y_pred = best_clf.predict(x_test)

In [None]:
# Score the test-set predictions. Accuracy is computed directly; precision,
# recall and F1 are each computed with average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)


In [None]:
# Report the four test-set metrics, one per line, rounded to three decimals.
print('\n'.join(
    template.format(score)
    for template, score in (
        ('Accuracy: {:.3f}', accuracy),
        ('Precision: {:.3f}', precision),
        ('Recall: {:.3f}', recall),
        ('F1-score: {:.3f}', f1),
    )
))

Accuracy: 0.812
Precision: 0.861
Recall: 0.812
F1-score: 0.801


In [None]:
# Keep a handle to the best estimator found by the search; this is the object
# written to disk in the next cell.
model = best_clf.best_estimator_

In [None]:
# Persist the selected estimator to the file 'model_joblib' in the working
# directory. `joblib` is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
# NOTE(review): only `model` is written here; the fitted `scaler` and the
# dummy-column layout used to build X are not saved by this cell.
joblib.dump(model, 'model_joblib')

['model_joblib']