In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('../Data/CleanedData.csv')

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=5000)

In [4]:
x= cv.fit_transform(df['transformed text']).toarray()
y= df['subject'].values

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state = 42)

In [7]:
decision_tree = DecisionTreeClassifier()
param_grid = {
    'criterion': ['gini', 'entropy'],          # Splitting criterion
    'max_depth': [None, 10, 20],             # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],       # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2],            # Minimum samples required to be at a leaf node
}
grid_search = GridSearchCV(decision_tree, param_grid, cv=2, scoring='accuracy', verbose = 3)
grid_search.fit(x_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)
# Use the best estimator to make predictions on the test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)


Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV 1/2] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.658 total time=  23.5s
[CV 2/2] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.666 total time=  20.1s
[CV 1/2] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5;, score=0.662 total time=  22.8s
[CV 2/2] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5;, score=0.670 total time=  20.3s
[CV 1/2] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10;, score=0.666 total time=  22.0s
[CV 2/2] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10;, score=0.673 total time=  21.1s
[CV 1/2] END criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=2;, score=0.661 total time=  20.3s
[CV 2/2] END criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=2;, score=0.669 total time= 

[CV 2/2] END criterion=entropy, max_depth=20, min_samples_leaf=2, min_samples_split=5;, score=0.689 total time=   8.3s
[CV 1/2] END criterion=entropy, max_depth=20, min_samples_leaf=2, min_samples_split=10;, score=0.695 total time=   7.5s
[CV 2/2] END criterion=entropy, max_depth=20, min_samples_leaf=2, min_samples_split=10;, score=0.693 total time=   8.2s
Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best Accuracy Score: 0.717072463768116


In [8]:
# Evaluate the model's performance on the test data
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Test Accuracy: 0.7254492753623188
Confusion Matrix:
 [[   0   41    0    3  242    1]
 [   0 1384    0   12  477    3]
 [   2   18  119    0   26    0]
 [   1  169    0    6  658    1]
 [   4  273    0   25 2986  187]
 [   1    1    0    0  223 1762]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       287
           1       0.73      0.74      0.74      1876
           2       1.00      0.72      0.84       165
           3       0.13      0.01      0.01       835
           4       0.65      0.86      0.74      3475
           5       0.90      0.89      0.89      1987

    accuracy                           0.73      8625
   macro avg       0.57      0.54      0.54      8625
weighted avg       0.66      0.73      0.68      8625

