In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split


In [47]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [48]:
# Load the labelled dataset from the CSV file.
dataframe = pd.read_csv('./data/labelled_dataset.csv')
# Drop every row that lacks the text or its label. `how='all'` only removed
# rows that were entirely empty, so a row with a missing "Text" or
# "Best Topic" value would still have reached the vectorizer/classifier.
dataframe = dataframe.dropna(subset=["Text", "Best Topic"])
# Model input: the "Text" column (other feature columns could be used instead).
features = dataframe["Text"]
# Prediction target: the "Best Topic" column.
labels = dataframe["Best Topic"]
# Display names handed to classification_report as target_names.
targets = ["1", "2", "3", "4", "5", "6"]

In [49]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

<h2>ngram_range = (1, 1)</h2>

In [50]:
# Build and fit a TF-IDF -> one-vs-rest linear SVM pipeline on unigram features.
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 1)),
    # probability=True makes SVC fit an internally cross-validated probability
    # model whose data shuffling is random; random_state pins it so that
    # predict_proba output is reproducible from run to run.
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=42)),
)
# fit() returns the fitted pipeline itself, so both names refer to the same object.
t1_classifier_ngram = ovr_classifier.fit(X=X_train, y=y_train)
# Last expression: show the fitted pipeline as the cell output.
t1_classifier_ngram

In [51]:
# Predict a label for every held-out text, then report overall accuracy and
# the per-class precision/recall/F1 table.
predictions_t1 = t1_classifier_ngram.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, so the value is unchanged, but the call now matches the
# classification_report call below.
t1_score = accuracy_score(y_test, predictions_t1)
print("SVM Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions_t1, target_names=targets))

SVM Accuracy Score ->  0.5164021164021164
              precision    recall  f1-score   support

           1       0.53      0.72      0.61       257
           2       0.43      0.16      0.24       136
           3       0.53      0.60      0.56       202
           4       0.52      0.53      0.52       162
           5       0.25      0.05      0.09        37
           6       0.51      0.47      0.49       151

    accuracy                           0.52       945
   macro avg       0.46      0.42      0.42       945
weighted avg       0.50      0.52      0.49       945



<h2>ngram_range = (1, 2)</h2>

In [52]:
# Build and fit a TF-IDF -> one-vs-rest linear SVM pipeline on unigram and
# bigram features.
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    # probability=True makes SVC fit an internally cross-validated probability
    # model whose data shuffling is random; random_state pins it so that
    # predict_proba output is reproducible from run to run.
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=42)),
)
# fit() returns the fitted pipeline itself, so both names refer to the same object.
t1_classifier_ngram = ovr_classifier.fit(X=X_train, y=y_train)
# Last expression: show the fitted pipeline as the cell output.
t1_classifier_ngram

In [53]:
# Predict a label for every held-out text, then report overall accuracy and
# the per-class precision/recall/F1 table.
predictions_t1 = t1_classifier_ngram.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, so the value is unchanged, but the call now matches the
# classification_report call below.
t1_score = accuracy_score(y_test, predictions_t1)
print("SVM Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions_t1, target_names=targets))

SVM Accuracy Score ->  0.49947089947089945
              precision    recall  f1-score   support

           1       0.49      0.74      0.59       257
           2       0.44      0.14      0.21       136
           3       0.54      0.56      0.55       202
           4       0.51      0.51      0.51       162
           5       0.33      0.05      0.09        37
           6       0.49      0.42      0.45       151

    accuracy                           0.50       945
   macro avg       0.47      0.41      0.40       945
weighted avg       0.49      0.50      0.47       945



<h2>ngram_range = (1, 3)</h2>

In [54]:
# Build and fit a TF-IDF -> one-vs-rest linear SVM pipeline on unigram, bigram
# and trigram features.
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 3)),
    # probability=True makes SVC fit an internally cross-validated probability
    # model whose data shuffling is random; random_state pins it so that
    # predict_proba output is reproducible from run to run.
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=42)),
)
# fit() returns the fitted pipeline itself, so both names refer to the same object.
t1_classifier_ngram = ovr_classifier.fit(X=X_train, y=y_train)
# Last expression: show the fitted pipeline as the cell output.
t1_classifier_ngram

In [55]:
# Predict a label for every held-out text, then report overall accuracy and
# the per-class precision/recall/F1 table.
predictions_t1 = t1_classifier_ngram.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, so the value is unchanged, but the call now matches the
# classification_report call below.
t1_score = accuracy_score(y_test, predictions_t1)
print("SVM Accuracy Score -> ", t1_score)
print(classification_report(y_test, predictions_t1, target_names=targets))

SVM Accuracy Score ->  0.49417989417989416
              precision    recall  f1-score   support

           1       0.47      0.75      0.58       257
           2       0.44      0.12      0.19       136
           3       0.54      0.54      0.54       202
           4       0.51      0.52      0.51       162
           5       0.25      0.03      0.05        37
           6       0.50      0.41      0.45       151

    accuracy                           0.49       945
   macro avg       0.45      0.40      0.39       945
weighted avg       0.48      0.49      0.46       945

