# Sheet 08

In [3]:
import pandas as pd
import numpy as np
from stop_words import get_stop_words
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

## Exercise 1: Multi-label classification

In [4]:
df = pd.read_csv('../data/JanataHack.csv', index_col='ID')
df

Unnamed: 0_level_0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
20968,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0
20969,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0
20970,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0
20971,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0


In [5]:
df.rename(columns={"TITLE": "title", "ABSTRACT" : "abstract"}, inplace=True)

In [6]:
# labels for the categories
categories = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]

In [7]:
def add_category_as_single_column(df):
    df['category'] = df[categories[0]].astype(str) + df[categories[1]].astype(str) + df[categories[2]].astype(str) + df[categories[3]].astype(str) + df[categories[4]].astype(str) + df[categories[5]].astype(str)
    return df

In [8]:
df = add_category_as_single_column(df)
df

Unnamed: 0_level_0,title,abstract,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,100000
2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,100000
3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0,001000
4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0,001000
5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0,100100
...,...,...,...,...,...,...,...,...,...
20968,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0,110000
20969,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0,010000
20970,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0,100000
20971,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0,001100


In [9]:
labels = df['category'].unique()
print(labels)
print(f'Length of unique categories: {len(labels)}')

['100000' '001000' '100100' '010000' '000010' '000100' '011000' '001100'
 '101000' '000001' '110000' '101100' '010100' '110100' '100010' '000110'
 '000101' '011100' '100001' '000011' '100110' '111000' '100101' '001101']
Length of unique categories: 24


First we need to do some cleaning.

In [10]:
STOPWORDS = set(get_stop_words('english'))

def text_cleaning(text):
    # transform to lowercase for later use of 'stopwords'
    text = text.lower()
    
    # remove numbers
    text = re.sub(r'\w*\d+\w*', '', text)

    # remove formulas
    text = re.sub(r'\$[^$$]*\$', '', text)

    # remove stop words
    text = ' '.join([word for word in text.split(' ') if word not in STOPWORDS])

    # remove symbols
    text = re.compile('[/(){}\[\]\|@,;~]').sub(' ', text)

    # replace over spaces
    text = re.sub('\s{2,}', " ", text)

    return text

In [11]:
df['abstract_cleaned'] = df['abstract'].apply(text_cleaning)

## b) Apply Classification Algorithm

Prepare the train and test data for the classification algorithms.

In [12]:
X = df.abstract_cleaned
y = df.category

In [13]:
# prepare training and test data from dataframe
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Naives Bayes Classifier
description: https://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes

In [14]:
pipe_nb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
pipe_nb.fit(X_train, y_train)

y_pred = pipe_nb.predict(X_test)

In [15]:
# evaluation
print(f'accuracy of naives bayes classifier {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred, zero_division = 0))

accuracy of naives bayes classifier 0.6052443384982121
              precision    recall  f1-score   support

      000001       0.00      0.00      0.00        39
      000010       0.00      0.00      0.00        97
      000011       0.00      0.00      0.00         1
      000100       0.00      0.00      0.00       318
      000101       0.00      0.00      0.00         6
      000110       0.00      0.00      0.00        25
      001000       0.80      0.87      0.83       669
      001100       0.00      0.00      0.00       176
      001101       0.00      0.00      0.00         1
      010000       0.85      0.94      0.89      1077
      010100       0.00      0.00      0.00        20
      011000       0.00      0.00      0.00        60
      011100       0.00      0.00      0.00         2
      100000       0.42      0.95      0.58       992
      100010       0.00      0.00      0.00         6
      100100       0.46      0.03      0.05       454
      100110       0.00   

### Linear Support Vector Machine
description: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier

In [16]:
pipe_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))])
pipe_svm.fit(X_train, y_train)

y_pred = pipe_svm.predict(X_test)

In [17]:
# evaluation
print(f'accuracy of naives bayes classifier {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred, zero_division = 0))

accuracy of naives bayes classifier 0.6424314660309892
              precision    recall  f1-score   support

      000001       0.44      0.21      0.28        39
      000010       0.23      0.05      0.08        97
      000011       0.00      0.00      0.00         1
      000100       0.49      0.11      0.18       318
      000101       0.00      0.00      0.00         6
      000110       0.00      0.00      0.00        25
      001000       0.65      0.91      0.76       669
      001100       0.56      0.44      0.49       176
      001101       0.00      0.00      0.00         1
      010000       0.73      0.96      0.83      1077
      010100       0.00      0.00      0.00        20
      011000       0.00      0.00      0.00        60
      011100       0.00      0.00      0.00         2
      100000       0.60      0.81      0.69       992
      100010       0.00      0.00      0.00         6
      100100       0.51      0.28      0.36       454
      100101       0.00   

### k Nearest Neighbor

In [18]:
pipe_rfc = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier())])
pipe_rfc.fit(X_train, y_train)

y_pred = pipe_rfc.predict(X_test)

In [19]:
# evaluation
print(f'accuracy of naives bayes classifier {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred, zero_division = 0))

accuracy of naives bayes classifier 0.6240762812872467
              precision    recall  f1-score   support

      000001       0.00      0.00      0.00        39
      000010       0.00      0.00      0.00        97
      000011       0.00      0.00      0.00         1
      000100       0.52      0.10      0.16       318
      000101       0.00      0.00      0.00         6
      000110       0.00      0.00      0.00        25
      001000       0.67      0.91      0.77       669
      001100       0.82      0.23      0.36       176
      001101       0.00      0.00      0.00         1
      010000       0.81      0.92      0.86      1077
      010100       0.00      0.00      0.00        20
      011000       0.00      0.00      0.00        60
      011100       0.00      0.00      0.00         2
      100000       0.48      0.84      0.62       992
      100010       0.00      0.00      0.00         6
      100100       0.49      0.23      0.31       454
      100110       0.00   