In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import gensim
from gensim.models import Word2Vec

In [2]:
df = pd.read_csv('../Data/CleanedData.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer(max_features=5000)

In [4]:
X = tfidf.fit_transform(df['transformed text'])
y = df['subject'].values

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state = 42)

In [6]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [10]:
decision_tree = DecisionTreeClassifier()
param_grid = {
    'criterion': ['gini', 'entropy'],          # Splitting criterion
    'max_depth': [40, 50, 60 , 70],             # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],       # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2],            # Minimum samples required to be at a leaf node
}
grid_search = GridSearchCV(decision_tree, param_grid, cv=2, scoring='accuracy', verbose = 3)
grid_search.fit(x_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)
# Use the best estimator to make predictions on the test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)


NameError: name 'x_train' is not defined

In [8]:
# Evaluate the model's performance on the test data
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Test Accuracy: 0.7254492753623188
Confusion Matrix:
 [[   0   41    0    3  242    1]
 [   0 1384    0   12  477    3]
 [   2   18  119    0   26    0]
 [   1  169    0    6  658    1]
 [   4  273    0   25 2986  187]
 [   1    1    0    0  223 1762]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       287
           1       0.73      0.74      0.74      1876
           2       1.00      0.72      0.84       165
           3       0.13      0.01      0.01       835
           4       0.65      0.86      0.74      3475
           5       0.90      0.89      0.89      1987

    accuracy                           0.73      8625
   macro avg       0.57      0.54      0.54      8625
weighted avg       0.66      0.73      0.68      8625



# HyperParameter Tuning using HyperOpt

In [11]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval


In [13]:
space = {
    'max_depth': hp.quniform('max_depth', 1, 100, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 50, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 50, 1),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', None, 0.5, 0.75,1,2,3,4]),
    'criterion': hp.choice('criterion', ['gini', 'entropy'])
}


In [14]:
def objective(params):
    # Define hyperparameters
    max_depth = int(params['max_depth'])
    min_samples_split = int(params['min_samples_split'])
    min_samples_leaf = int(params['min_samples_leaf'])
    max_features = params['max_features']
    criterion = params['criterion']
    
    # Create a Decision Tree Classifier with the specified hyperparameters
    clf = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf, max_features=max_features,
                                 criterion=criterion, random_state=42)
    
    # Train the classifier
    clf.fit(X_resampled, y_resampled)
    
    # Make predictions on the test set
    y_pred = clf.predict(X_test)
    
    # Calculate F1 score
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return {'loss': -f1, 'status': STATUS_OK}

In [15]:
import time
trials = Trials()
start  =time.time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,  # Number of optimization iterations
            trials=trials,
            rstate=np.random.default_rng(42))

# Get the best hyperparameters and the corresponding loss
best_hyperparams = space_eval(space, best)
best_loss = -trials.best_trial['result']['loss']
end = time.time()
print("The time of execution of the above program is:", (end - start) * 1000, "ms")
# Print the best hyperparameters and corresponding loss
print("Best Hyperparameters:")
print(best_hyperparams)
print("Best Weighted F1 Score (Loss):", best_loss)

100%|████████| 50/50 [18:39<00:00, 22.39s/trial, best loss: -0.7157264824553061]
The time of execution of the above program is: 1119761.221408844 ms
Best Hyperparameters:
{'criterion': 'gini', 'max_depth': 87.0, 'max_features': None, 'min_samples_leaf': 50.0, 'min_samples_split': 36.0}
Best Weighted F1 Score (Loss): 0.7157264824553061


In [7]:
decision_tree_classifier = DecisionTreeClassifier(max_depth = 87)
decision_tree_classifier.fit(X_train, y_train)
y_pred = decision_tree_classifier.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average='weighted')

print(f"Weighted F1 Score: {f1_weighted:.2f}")

Weighted F1 Score: 0.62


In [16]:
decision_tree_classifier = DecisionTreeClassifier(criterion = 'gini',max_depth = 87,max_features= None,min_samples_leaf=50,min_samples_split=36)
decision_tree_classifier.fit(X_resampled, y_resampled)
y_pred = decision_tree_classifier.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average='weighted')

print(f"Weighted F1 Score after OverSampling: {f1_weighted:.2f}")

Weighted F1 Score after OverSampling: 0.72
