In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split


In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [9]:
# Read the labelled dataset and discard rows in which every column is empty.
dataframe = pd.read_csv('./data/labelled_dataset.csv').dropna(how='all')

# Model input: the "Text" column (other columns could be used as features instead).
features = dataframe["Text"]
# Prediction target: the "Best Topic" column.
labels = dataframe["Best Topic"]
# Class display names handed to classification_report via target_names.
targets = [str(topic) for topic in range(1, 7)]

In [10]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

<h2>TF-IDF ngram_range = (1, 1)</h2>

In [11]:
# Pipeline: TF-IDF over unigrams feeding a one-vs-rest linear SVM.
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 1)),
    OneVsRestClassifier(estimator=svm.SVC(kernel='linear', probability=True)),
)
# Train on the training split.
ovr_classifier.fit(X_train, y_train)
# fit() returns the pipeline itself, so this name is an alias for the fitted model.
t1_classifier_ngram = ovr_classifier
# Last expression of the cell: show the fitted pipeline.
t1_classifier_ngram

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [12]:
# Score the pipeline currently bound to t1_classifier_ngram on the held-out split.
predictions_t1 = t1_classifier_ngram.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); use that order here so the call
# matches classification_report below and stays correct if an asymmetric
# metric is swapped in later.
t1_score = accuracy_score(y_test, predictions_t1)
print("SVM Accuracy Score -> ", t1_score)
# Per-class precision / recall / F1, labelled with the names in `targets`.
print(classification_report(y_test, predictions_t1, target_names=targets))

SVM Accuracy Score ->  0.6606382978723404
              precision    recall  f1-score   support

           1       0.63      0.74      0.68       163
           2       0.71      0.59      0.65       138
           3       0.67      0.69      0.68       179
           4       0.68      0.80      0.74       214
           5       0.76      0.32      0.45        82
           6       0.60      0.59      0.60       164

    accuracy                           0.66       940
   macro avg       0.68      0.62      0.63       940
weighted avg       0.67      0.66      0.65       940



<h2>TF-IDF ngram_range = (1, 2)</h2>

In [13]:
# Pipeline: TF-IDF over unigrams and bigrams feeding a one-vs-rest linear SVM.
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    OneVsRestClassifier(estimator=svm.SVC(kernel='linear', probability=True)),
)
# Train on the training split.
ovr_classifier.fit(X_train, y_train)
# fit() returns the pipeline itself, so this name is an alias for the fitted model.
t1_classifier_ngram = ovr_classifier
# Last expression of the cell: show the fitted pipeline.
t1_classifier_ngram

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(ngram_range=(1, 2))),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [14]:
# Score the pipeline currently bound to t1_classifier_ngram on the held-out split.
predictions_t1 = t1_classifier_ngram.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); use that order here so the call
# matches classification_report below and stays correct if an asymmetric
# metric is swapped in later.
t1_score = accuracy_score(y_test, predictions_t1)
print("SVM Accuracy Score -> ", t1_score)
# Per-class precision / recall / F1, labelled with the names in `targets`.
print(classification_report(y_test, predictions_t1, target_names=targets))

SVM Accuracy Score ->  0.6574468085106383
              precision    recall  f1-score   support

           1       0.64      0.74      0.69       163
           2       0.71      0.57      0.63       138
           3       0.65      0.70      0.67       179
           4       0.66      0.82      0.73       214
           5       0.80      0.34      0.48        82
           6       0.61      0.55      0.58       164

    accuracy                           0.66       940
   macro avg       0.68      0.62      0.63       940
weighted avg       0.67      0.66      0.65       940



<h2>TF-IDF ngram_range = (1, 3)</h2>

In [15]:
# Pipeline: TF-IDF over unigrams, bigrams and trigrams feeding a one-vs-rest linear SVM.
ovr_classifier = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 3)),
    OneVsRestClassifier(estimator=svm.SVC(kernel='linear', probability=True)),
)
# Train on the training split.
ovr_classifier.fit(X_train, y_train)
# fit() returns the pipeline itself, so this name is an alias for the fitted model.
t1_classifier_ngram = ovr_classifier
# Last expression of the cell: show the fitted pipeline.
t1_classifier_ngram

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(ngram_range=(1, 3))),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [16]:
# Score the pipeline currently bound to t1_classifier_ngram on the held-out split.
predictions_t1 = t1_classifier_ngram.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); use that order here so the call
# matches classification_report below and stays correct if an asymmetric
# metric is swapped in later.
t1_score = accuracy_score(y_test, predictions_t1)
print("SVM Accuracy Score -> ", t1_score)
# Per-class precision / recall / F1, labelled with the names in `targets`.
print(classification_report(y_test, predictions_t1, target_names=targets))

SVM Accuracy Score ->  0.65
              precision    recall  f1-score   support

           1       0.62      0.74      0.68       163
           2       0.70      0.57      0.63       138
           3       0.62      0.70      0.66       179
           4       0.66      0.81      0.73       214
           5       0.79      0.32      0.45        82
           6       0.62      0.54      0.58       164

    accuracy                           0.65       940
   macro avg       0.67      0.61      0.62       940
weighted avg       0.66      0.65      0.64       940

