In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split


In [49]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [66]:
dataframe = pd.read_csv('./data/labelled_dataset.csv')
dataframe = dataframe.dropna(how='all')
# can be features other than the text column
features = dataframe["Text"]
columns = [f"Topic {i+1}" for i in range(6)]
labels = dataframe[columns]
targets = ["1", "2", "3", "4", "5", "6"]

In [68]:
labels

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6
0,0.872072,0.000000,0.000000,0.090702,0.000000,0.000000
1,0.056015,0.056168,0.055559,0.720877,0.055559,0.055822
2,0.010428,0.010424,0.010568,0.418144,0.010423,0.540014
3,0.131990,0.018559,0.018721,0.573137,0.018559,0.239034
4,0.041692,0.041692,0.041819,0.542127,0.290956,0.041715
...,...,...,...,...,...,...
4691,0.055780,0.055779,0.055781,0.055782,0.390796,0.386081
4692,0.028045,0.185259,0.028051,0.028246,0.193463,0.536937
4693,0.055651,0.055651,0.055652,0.055652,0.388764,0.388631
4694,0.055619,0.055618,0.055619,0.388053,0.055620,0.389472


In [69]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

<h2> ngram = (1,1)

In [72]:
from sklearn.multioutput import ClassifierChain

# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 1)),
    ClassifierChain(RandomForestClassifier(min_samples_split=15, criterion='gini'))
)
randomforest_mode = ovr_classifier.fit(X=X_train, y=y_train)
randomforest_mode

ValueError: Unknown label type: 'continuous'

In [53]:
predictions = randomforest_mode.predict(X_test)
t1_score = accuracy_score(predictions, y_test)
print("SVM Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions, target_names=targets))

SVM Accuracy Score ->  0.46702127659574466
              precision    recall  f1-score   support

           1       0.43      0.61      0.50       193
           2       0.50      0.30      0.37       158
           3       0.52      0.52      0.52       194
           4       0.48      0.54      0.51       201
           5       0.36      0.17      0.23        52
           6       0.44      0.39      0.42       142

    accuracy                           0.47       940
   macro avg       0.46      0.42      0.43       940
weighted avg       0.47      0.47      0.46       940



<h2> ngram = (1,2)

In [54]:
# Instantiate Classifier
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    OneVsRestClassifier(RandomForestClassifier(min_samples_split=15, criterion='gini'))
)
randomforest_mode = ovr_classifier.fit(X=X_train, y=y_train)
randomforest_mode

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(ngram_range=(1, 2))),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=RandomForestClassifier(min_samples_split=15)))])

In [55]:
predictions = randomforest_mode.predict(X_test)
t1_score = accuracy_score(predictions, y_test)
print("SVM Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions, target_names=targets))

SVM Accuracy Score ->  0.4595744680851064
              precision    recall  f1-score   support

           1       0.43      0.55      0.48       193
           2       0.53      0.29      0.38       158
           3       0.48      0.50      0.49       194
           4       0.45      0.57      0.50       201
           5       0.44      0.21      0.29        52
           6       0.46      0.41      0.43       142

    accuracy                           0.46       940
   macro avg       0.46      0.42      0.43       940
weighted avg       0.47      0.46      0.45       940



<h2> ngram = (1,3)

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate Classifier
ovr_classifier = make_pipeline(
    CountVectorizer(),
    TfidfVectorizer(ngram_range=(1, 3)),
    # OneVsRestClassifier(RandomForestClassifier(min_samples_split=15, criterion='gini'))
)
randomforest_mode = ovr_classifier.fit(X=X_train, y=y_train)
randomforest_mode

AttributeError: lower not found

In [57]:
predictions = randomforest_mode.predict(X_test)
t1_score = accuracy_score(predictions, y_test)
print("SVM Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions, target_names=targets))

SVM Accuracy Score ->  0.451063829787234
              precision    recall  f1-score   support

           1       0.45      0.57      0.50       193
           2       0.56      0.28      0.37       158
           3       0.45      0.49      0.47       194
           4       0.43      0.54      0.48       201
           5       0.33      0.15      0.21        52
           6       0.44      0.41      0.42       142

    accuracy                           0.45       940
   macro avg       0.44      0.41      0.41       940
weighted avg       0.46      0.45      0.44       940

