In [1]:
import numpy as np
import pandas as pd 
from nltk.corpus import stopwords

from sklearn.model_selection import ShuffleSplit, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, \
                            accuracy_score, top_k_accuracy_score
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
names = ['doi', 'text_id', 'text', 'sdg', 'labels_negative', 
         'labels_positive', 'agreement']

# laod data
df = pd.read_csv('osdg.csv', error_bad_lines=False,
                sep="\t", header= None, names=names)
df = df.iloc[1:, :] 
df = df.drop(['doi', 'text_id', 'labels_negative', 
              'labels_positive'], axis = 1)

priors = list(df.sdg.value_counts().values)

# define discretization
bins = [-1, 0.2, 0.4, 0.6, 0.8, 2]
labels = np.linspace(1,10,5)

# develop agreement variable
df.agreement = df.agreement.astype(np.float16)
df['cat_agreement'] = pd.cut(df.agreement, bins=bins, 
       labels = labels)

X_train, X_test, \
y_train, y_test  = train_test_split(df[['text', 'agreement', 'cat_agreement']], df.sdg,
                               test_size=0.15, random_state=0, 
                               stratify=df[['sdg', 'cat_agreement']])

In [3]:
del df

In [4]:
def evaluate(model, x_test, y_test, k):
    # transform test set
    features = tfidf.transform(list(x_test.text)).toarray()
    x_test_ = np.concatenate((features, x_test.agreement.values.reshape(-1,1)), axis=1)
    del features
    # evaluate
    probs = model.predict_proba(x_test_)
    print(f'Top {k} accuracy %s' % top_k_accuracy_score(y_test, probs, k=k), '\n\n')
    
    y_pred = model.predict(x_test_)
    print(classification_report(y_test, y_pred))

# Multinomial NB

In [5]:
# Optimal parameters
alpha =  0.01
sublinear_df = False
min_df = 2
k=3

# vectorizer
tfidf = TfidfVectorizer(sublinear_tf=sublinear_df, min_df=min_df, max_features = 72400,
                                stop_words='english', ngram_range=(1, 2), dtype = np.float32)
features = tfidf.fit_transform(list(X_train.text)).toarray()
X_t = np.concatenate((features, X_train.agreement.values.reshape(-1,1)), axis=1)

# delete for memory
del features

# Train model with optimal parameters
clf = MultinomialNB(alpha = alpha, 
                        fit_prior=True, 
                        class_prior=priors).fit(X_t, y_train,
                        sample_weight = None)
# delete for memory
del X_t

# Evaluate top 3 accuracy
evaluate(clf, X_test, y_test, 3)

Top 3 accuracy 0.9082606890826069 


              precision    recall  f1-score   support

           1       0.64      0.77      0.70       412
          10       0.42      0.66      0.51       155
          11       0.68      0.80      0.73       344
          12       0.49      0.71      0.58        69
          13       0.73      0.84      0.78       314
          14       0.84      0.81      0.82       163
          15       0.62      0.78      0.69       144
           2       0.76      0.77      0.77       369
           3       0.89      0.85      0.87       403
           4       0.87      0.83      0.85       561
           5       0.88      0.75      0.81       649
           6       0.86      0.71      0.78       423
           7       0.83      0.77      0.80       422
           8       0.54      0.39      0.46       227
           9       0.74      0.52      0.61       163

    accuracy                           0.76      4818
   macro avg       0.72      0.73      0.72

In [6]:
del tfidf, clf

# SVM

In [7]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2, max_features = 72400,
                                stop_words='english', ngram_range=(1, 2), dtype = np.float32)

features = tfidf.fit_transform(list(X_train.text)).toarray()
X_t = np.concatenate((features, X_train.agreement.values.reshape(-1,1)), axis=1)

In [8]:
del features

In [9]:
test_values = list(y_train.value_counts().values)
priors = {}
for key in y_train.value_counts().index:
    for value in test_values:
        priors[key] = value
        test_values.remove(value)
        break

In [10]:
clf = SGDClassifier(loss="hinge", class_weight=priors, penalty="l2").fit(X_t, y_train,
                        sample_weight = None)

In [11]:
del X_t

In [12]:
features = tfidf.transform(list(X_test.text)).toarray()
x_test_ = np.concatenate((features, X_test.agreement.values.reshape(-1,1)), axis=1)
del features

y_pred = clf.predict(x_test_)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.54      0.83      0.66       412
          10       0.00      0.00      0.00       155
          11       0.90      0.41      0.57       344
          12       1.00      0.04      0.08        69
          13       0.88      0.56      0.68       314
          14       0.93      0.44      0.59       163
          15       0.84      0.15      0.25       144
           2       0.84      0.58      0.68       369
           3       0.95      0.65      0.77       403
           4       0.87      0.72      0.79       561
           5       0.68      0.85      0.76       649
           6       0.60      0.82      0.69       423
           7       0.44      0.92      0.59       422
           8       0.32      0.62      0.42       227
           9       0.85      0.07      0.12       163

    accuracy                           0.64      4818
   macro avg       0.71      0.51      0.51      4818
weighted avg       0.71   

In [None]:
del tfidf, clf

In [14]:
del x_test_

# Logistic Regression

In [15]:
tfidf = TfidfVectorizer(sublinear_tf=False, min_df=1, max_features = 72400,
                                stop_words='english', ngram_range=(1, 2), dtype = np.float32)

features = tfidf.fit_transform(list(X_train.text)).toarray()
X_t = np.concatenate((features, X_train.agreement.values.reshape(-1,1)), axis=1)

In [16]:
del features

In [17]:
clf = LogisticRegression(multi_class='ovr', solver='liblinear',
                        C=1, class_weight = priors).fit(X_t, y_train, sample_weight = None)
evaluate(clf, X_test, y_test, 3)

Top 3 accuracy 0.9337899543378996 


              precision    recall  f1-score   support

           1       0.73      0.76      0.74       412
          10       0.73      0.41      0.53       155
          11       0.75      0.78      0.76       344
          12       0.81      0.43      0.57        69
          13       0.82      0.81      0.81       314
          14       0.89      0.74      0.81       163
          15       0.83      0.67      0.74       144
           2       0.73      0.78      0.75       369
           3       0.88      0.90      0.89       403
           4       0.81      0.89      0.85       561
           5       0.81      0.88      0.84       649
           6       0.79      0.80      0.79       423
           7       0.77      0.83      0.80       422
           8       0.58      0.50      0.54       227
           9       0.73      0.60      0.66       163

    accuracy                           0.78      4818
   macro avg       0.78      0.72      0.74

# Random Forest

In [18]:
test_values = list(y_train.value_counts().values)
priors = {}
for key in y_train.value_counts().index:
    for value in test_values:
        priors[key] = value
        test_values.remove(value)
        break

In [19]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3, max_features = 72400,
                                stop_words='english', ngram_range=(1, 2), dtype = np.float32)

features = tfidf.fit_transform(list(X_train.text)).toarray()
X_t = np.concatenate((features, X_train.agreement.values.reshape(-1,1)), axis=1)
del features
clf = RandomForestClassifier(random_state=0, max_depth = 600, class_weight = priors).fit(X_t, y_train,
                                                                             sample_weight = None)
evaluate(clf, X_test, y_test, 3)

Top 3 accuracy 0.8858447488584474 


              precision    recall  f1-score   support

           1       0.67      0.72      0.69       412
          10       0.64      0.18      0.28       155
          11       0.69      0.73      0.71       344
          12       0.77      0.14      0.24        69
          13       0.72      0.71      0.72       314
          14       0.86      0.69      0.77       163
          15       0.87      0.56      0.68       144
           2       0.63      0.73      0.68       369
           3       0.82      0.85      0.84       403
           4       0.72      0.91      0.81       561
           5       0.77      0.86      0.81       649
           6       0.73      0.75      0.74       423
           7       0.69      0.82      0.75       422
           8       0.48      0.26      0.34       227
           9       0.68      0.33      0.45       163

    accuracy                           0.72      4818
   macro avg       0.72      0.62      0.63