In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [3]:
dataframe = pd.read_csv('../data/labelled_dataset.csv')
dataframe = dataframe.dropna(how='all')
# can be features other than the text column
features = dataframe["Text"]
labels = dataframe["Best Topic"]
targets = ["1", "2", "3"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

<h2> ngram = (1,1)

In [5]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 1)),
    OneVsRestClassifier(RandomForestClassifier(min_samples_split=15, criterion='gini'))
)
randomforest_mode = ovr_classifier.fit(X=X_train, y=y_train)
randomforest_mode

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=RandomForestClassifier(min_samples_split=15)))])

In [6]:
predictions = randomforest_mode.predict(X_test)
t1_score = accuracy_score(predictions, y_test)
print("Random Forest Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions, target_names=targets))

Random Forest Accuracy Score ->  0.8127659574468085
              precision    recall  f1-score   support

           1       0.76      0.78      0.77       316
           2       0.84      0.94      0.89       521
           3       0.90      0.26      0.41       103

    accuracy                           0.81       940
   macro avg       0.83      0.66      0.69       940
weighted avg       0.82      0.81      0.79       940



<h2> ngram = (1,2)

In [7]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    OneVsRestClassifier(RandomForestClassifier(min_samples_split=15, criterion='gini'))
)
randomforest_mode = ovr_classifier.fit(X=X_train, y=y_train)
randomforest_mode

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(ngram_range=(1, 2))),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=RandomForestClassifier(min_samples_split=15)))])

In [8]:
predictions = randomforest_mode.predict(X_test)
t1_score = accuracy_score(predictions, y_test)
print("Random Forest Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions, target_names=targets))

Random Forest Accuracy Score ->  0.8
              precision    recall  f1-score   support

           1       0.76      0.75      0.75       316
           2       0.82      0.94      0.88       521
           3       0.86      0.24      0.38       103

    accuracy                           0.80       940
   macro avg       0.81      0.64      0.67       940
weighted avg       0.80      0.80      0.78       940



<h2> ngram = (1,3)

In [9]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    # CountVectorizer(),
    TfidfVectorizer(ngram_range=(1, 3)),
    OneVsRestClassifier(RandomForestClassifier(min_samples_split=15, criterion='gini'))
)
randomforest_mode = ovr_classifier.fit(X=X_train, y=y_train)
randomforest_mode

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(ngram_range=(1, 3))),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=RandomForestClassifier(min_samples_split=15)))])

In [10]:
predictions = randomforest_mode.predict(X_test)
t1_score = accuracy_score(predictions, y_test)
print("Random Forest Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions, target_names=targets))

Random Forest Accuracy Score ->  0.7946808510638298
              precision    recall  f1-score   support

           1       0.76      0.73      0.75       316
           2       0.81      0.95      0.87       521
           3       0.88      0.22      0.36       103

    accuracy                           0.79       940
   macro avg       0.82      0.63      0.66       940
weighted avg       0.80      0.79      0.77       940

