In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier

In [2]:
train = pd.read_csv('train.tsv',sep='\t',header=None,dtype={0:str,1:str})
dev = pd.read_csv('dev.tsv',sep='\t',header=None,dtype={0:str,1:str})
test = pd.read_csv('test.tsv',sep='\t',header=None,dtype={0:str,1:str})

In [3]:
def vectorize_labels(df):
    y = []

    for label in df[0]:
        label_vec = []

        for cat in label:
            label_vec.append(int(cat))


        y.append(np.array(label_vec))
        
    return y

### Support Vector Machine

In [4]:
def train_svm(train_file):
    train = pd.read_csv(train_file,sep='\t',header=None,dtype={0:str,1:str})
    dev = pd.read_csv('dev.tsv',sep='\t',header=None,dtype={0:str,1:str})
    test = pd.read_csv('test.tsv',sep='\t',header=None,dtype={0:str,1:str})

    tfidf_vectorizer = TfidfVectorizer()

    train_X = tfidf_vectorizer.fit_transform(train[1])
    train_y = vectorize_labels(train)

    dev_X = tfidf_vectorizer.transform(dev[1])
    dev_y = vectorize_labels(dev)

    test_X = tfidf_vectorizer.transform(test[1])
    test_y = vectorize_labels(test)

    clf = OneVsRestClassifier(SVC(probability=True, kernel='linear'))
    clf.fit(train_X, train_y)

    dev_pred = clf.predict(dev_X)
    test_pred = clf.predict(test_X)
    
    return clf, dev_y, test_y, dev_pred, test_pred

In [5]:
clf, dev_y, test_y, dev_pred, test_pred = train_svm('train.tsv')

In [6]:
def eval_model_on_file(clsf, train_file, test_file):
    train = pd.read_csv(train_file,sep='\t',header=None,dtype={0:str,1:str})
    dev = pd.read_csv(test_file,sep='\t',header=None,dtype={0:str,1:str})
    
    tfidf_vectorizer = TfidfVectorizer()

    train_X = tfidf_vectorizer.fit_transform(train[1])
    
    dev_X = tfidf_vectorizer.transform(dev[1])
    dev_y = vectorize_labels(dev)
    
    dev_pred = clsf.predict(dev_X)

    print(metrics.accuracy_score(dev_y, dev_pred),metrics.f1_score(dev_y, dev_pred, average='micro'))
    
    return dev_pred

In [7]:
dev_pred = eval_model_on_file(clf, 'train.tsv','dev.tsv')

0.7119305856832971 0.8335969073976454


In [32]:
from sklearn.metrics import multilabel_confusion_matrix
print(multilabel_confusion_matrix(dev_y,dev_pred))

[[[1751   57]
  [ 149  348]]

 [[1541   94]
  [ 133  537]]

 [[1946   35]
  [  98  226]]

 [[2189    6]
  [  87   23]]

 [[1085  117]
  [  82 1021]]

 [[2098   18]
  [  53  136]]

 [[2257    5]
  [  25   18]]

 [[2264    0]
  [  35    6]]]


In [33]:
import sklearn.metrics as skm
print(skm.classification_report(test_y,test_pred))

              precision    recall  f1-score   support

           0       0.85      0.69      0.77       929
           1       0.85      0.80      0.82      1401
           2       0.85      0.70      0.77       734
           3       0.81      0.21      0.34       202
           4       0.90      0.90      0.90      2190
           5       0.89      0.71      0.79       387
           6       0.86      0.46      0.60        79
           7       0.91      0.17      0.29        58

   micro avg       0.87      0.77      0.82      5980
   macro avg       0.87      0.58      0.66      5980
weighted avg       0.87      0.77      0.81      5980
 samples avg       0.83      0.81      0.81      5980



In [9]:
from sklearn import metrics

In [10]:
tfidf_vectorizer = TfidfVectorizer()

train_X = tfidf_vectorizer.fit_transform(train[1])
train_y = vectorize_labels(train)

dev_X = tfidf_vectorizer.transform(dev[1])
dev_y = vectorize_labels(dev)

test_X = tfidf_vectorizer.transform(test[1])
test_y = vectorize_labels(test)

### Logistic Regression

In [11]:
lr = OneVsRestClassifier(LogisticRegression())
lr.fit(train_X, train_y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [15]:
dev_pred0 = lr.predict(dev_X)
test_pred0 = lr.predict(test_X)
print("Accuracy:")
metrics.accuracy_score(dev_y, dev_pred0),metrics.f1_score(dev_y, dev_pred0, average='micro')

Accuracy:


(0.6854663774403471, 0.8142493638676844)

In [21]:
from sklearn.metrics import multilabel_confusion_matrix
print(multilabel_confusion_matrix(dev_y,dev_pred0))

[[[1764   44]
  [ 169  328]]

 [[1552   83]
  [ 154  516]]

 [[1946   35]
  [ 106  218]]

 [[2192    3]
  [  84   26]]

 [[1097  105]
  [  81 1022]]

 [[2107    9]
  [  77  112]]

 [[2256    6]
  [  29   14]]

 [[2264    0]
  [  37    4]]]


In [27]:
import sklearn.metrics as skm
print(skm.classification_report(test_y,test_pred0))

              precision    recall  f1-score   support

           0       0.86      0.67      0.75       929
           1       0.87      0.77      0.82      1401
           2       0.86      0.63      0.73       734
           3       0.78      0.24      0.37       202
           4       0.92      0.91      0.91      2190
           5       0.91      0.63      0.74       387
           6       0.81      0.43      0.56        79
           7       1.00      0.12      0.22        58

   micro avg       0.89      0.75      0.81      5980
   macro avg       0.88      0.55      0.64      5980
weighted avg       0.89      0.75      0.80      5980
 samples avg       0.82      0.79      0.79      5980



### Decision Tree

In [14]:
dt=DecisionTreeClassifier()
dt.fit(train_X,train_y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [17]:
dev_pred2 = dt.predict(dev_X)
test_pred2 = dt.predict(test_X)
print("Accuracy:")
metrics.accuracy_score(dev_y, dev_pred2),metrics.f1_score(dev_y, dev_pred2, average='micro')

Accuracy:


(0.5262472885032538, 0.654007729793312)

In [24]:
from sklearn.metrics import multilabel_confusion_matrix
print(multilabel_confusion_matrix(dev_y,dev_pred2))

[[[1615  193]
  [ 225  272]]

 [[1405  230]
  [ 224  446]]

 [[1833  148]
  [ 144  180]]

 [[2126   69]
  [  77   33]]

 [[ 938  264]
  [ 212  891]]

 [[2032   84]
  [  79  110]]

 [[2243   19]
  [  35    8]]

 [[2243   21]
  [  35    6]]]


In [26]:
import sklearn.metrics as skm
print(skm.classification_report(test_y,test_pred2))

              precision    recall  f1-score   support

           0       0.55      0.53      0.54       929
           1       0.66      0.63      0.65      1401
           2       0.56      0.54      0.55       734
           3       0.32      0.28      0.30       202
           4       0.77      0.78      0.78      2190
           5       0.55      0.54      0.54       387
           6       0.35      0.32      0.33        79
           7       0.18      0.22      0.20        58

   micro avg       0.65      0.64      0.64      5980
   macro avg       0.49      0.48      0.49      5980
weighted avg       0.64      0.64      0.64      5980
 samples avg       0.67      0.67      0.65      5980



### Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(train_X,train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
dev_pred3 = dt.predict(dev_X)
test_pred3 = dt.predict(test_X)
print("Accuracy:")
metrics.accuracy_score(dev_y, dev_pred3),metrics.f1_score(dev_y, dev_pred3, average='micro')

Accuracy:


(0.5262472885032538, 0.654007729793312)

In [36]:
from sklearn.metrics import multilabel_confusion_matrix
print(multilabel_confusion_matrix(dev_y,dev_pred3))

[[[1615  193]
  [ 225  272]]

 [[1405  230]
  [ 224  446]]

 [[1833  148]
  [ 144  180]]

 [[2126   69]
  [  77   33]]

 [[ 938  264]
  [ 212  891]]

 [[2032   84]
  [  79  110]]

 [[2243   19]
  [  35    8]]

 [[2243   21]
  [  35    6]]]


In [37]:
import sklearn.metrics as skm
print(skm.classification_report(test_y,test_pred3))

              precision    recall  f1-score   support

           0       0.55      0.53      0.54       929
           1       0.66      0.63      0.65      1401
           2       0.56      0.54      0.55       734
           3       0.32      0.28      0.30       202
           4       0.77      0.78      0.78      2190
           5       0.55      0.54      0.54       387
           6       0.35      0.32      0.33        79
           7       0.18      0.22      0.20        58

   micro avg       0.65      0.64      0.64      5980
   macro avg       0.49      0.48      0.49      5980
weighted avg       0.64      0.64      0.64      5980
 samples avg       0.67      0.67      0.65      5980

