In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split


In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [20]:
dataframe = pd.read_csv('./data/labelled_dataset.csv')
dataframe = dataframe.dropna(how='all')
# can be features other than the text column
features = dataframe["Text"]
labels = dataframe["Best Topic"]
targets = ["1", "2", "3", "4", "5", "6"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

<h2> ngram = (1,1)

In [22]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 1)),
    OneVsRestClassifier(RandomForestClassifier(min_samples_split=15, criterion='gini'))
)
randomforest_mode = ovr_classifier.fit(X=X_train, y=y_train)
randomforest_mode

In [23]:
predictions = randomforest_mode.predict(X_test)
t1_score = accuracy_score(predictions, y_test)
print("Random Forest Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions, target_names=targets))

Random Forest Accuracy Score ->  0.4648936170212766
              precision    recall  f1-score   support

           1       0.42      0.60      0.50       193
           2       0.49      0.29      0.37       158
           3       0.53      0.52      0.53       194
           4       0.48      0.55      0.51       201
           5       0.33      0.15      0.21        52
           6       0.43      0.39      0.41       142

    accuracy                           0.46       940
   macro avg       0.45      0.42      0.42       940
weighted avg       0.47      0.46      0.46       940



<h2> ngram = (1,2)

In [24]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    OneVsRestClassifier(RandomForestClassifier(min_samples_split=15, criterion='gini'))
)
randomforest_mode = ovr_classifier.fit(X=X_train, y=y_train)
randomforest_mode

In [25]:
predictions = randomforest_mode.predict(X_test)
t1_score = accuracy_score(predictions, y_test)
print("Random Forest Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions, target_names=targets))

Random Forest Accuracy Score ->  0.45
              precision    recall  f1-score   support

           1       0.43      0.56      0.49       193
           2       0.55      0.27      0.36       158
           3       0.46      0.50      0.48       194
           4       0.44      0.54      0.49       201
           5       0.48      0.19      0.27        52
           6       0.41      0.39      0.40       142

    accuracy                           0.45       940
   macro avg       0.46      0.41      0.42       940
weighted avg       0.46      0.45      0.44       940



<h2> ngram = (1,3)

In [26]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    # CountVectorizer(),
    TfidfVectorizer(ngram_range=(1, 3)),
    OneVsRestClassifier(RandomForestClassifier(min_samples_split=15, criterion='gini'))
)
randomforest_mode = ovr_classifier.fit(X=X_train, y=y_train)
randomforest_mode

In [27]:
predictions = randomforest_mode.predict(X_test)
t1_score = accuracy_score(predictions, y_test)
print("Random Forest Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions, target_names=targets))

Random Forest Accuracy Score ->  0.4478723404255319
              precision    recall  f1-score   support

           1       0.43      0.56      0.48       193
           2       0.56      0.25      0.34       158
           3       0.46      0.50      0.48       194
           4       0.44      0.55      0.49       201
           5       0.37      0.19      0.25        52
           6       0.45      0.39      0.42       142

    accuracy                           0.45       940
   macro avg       0.45      0.41      0.41       940
weighted avg       0.46      0.45      0.44       940

