In [1]:
import numpy as np
import pandas as pd 
from nltk.corpus import stopwords

from sklearn.model_selection import ShuffleSplit, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, \
                            accuracy_score, top_k_accuracy_score
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
names = ['doi', 'text_id', 'text', 'sdg', 'labels_negative', 
         'labels_positive', 'agreement']

# laod data
df = pd.read_csv('osdg.csv', error_bad_lines=False,
                sep="\t", header= None, names=names)
df = df.iloc[1:, :] 
df = df.drop(['doi', 'text_id', 'labels_negative', 
              'labels_positive'], axis = 1)

priors = list(df.sdg.value_counts().values)

# define discretization
bins = [-1, 0.2, 0.4, 0.6, 0.8, 2]
labels = np.linspace(1,10,5)

# develop agreement variable
df.agreement = df.agreement.astype(np.float16)
df['cat_agreement'] = pd.cut(df.agreement, bins=bins, 
       labels = labels)

X_train, X_subset, \
y_train, y_subset  = train_test_split(df[['text', 'agreement', 'cat_agreement']], df.sdg,
                               test_size=0.3, random_state=0, 
                               stratify=df[['sdg', 'cat_agreement']])
X_val, X_test, \
y_val, y_test  = train_test_split(X_subset, y_subset,
                               test_size=0.5, random_state=0, 
                               stratify=pd.concat((X_subset, y_subset), axis = 1)[['sdg', 'cat_agreement']])

In [3]:
del df, X_subset, y_subset

# MNB

In [5]:
sublinear_df = [True, False]
min_df = [1, 2, 3, 4, 5]
alphas = [1e-1, 1e-2, 1e-3, 0]
cases = ['equal sample weights, no agreement', 
         'equal sample weights, agreement as feature',
        'agreement as sample weights']
labels = y_train.unique()

In [5]:
total_iters = len(sublinear_df)*len(min_df)*len(alphas)*len(cases)

In [6]:
acc = 0
counter = 1
for sublin_bool in sublinear_df:
    for min_docs in min_df:
        
        tfidf = TfidfVectorizer(sublinear_tf=sublin_bool, min_df=min_docs, max_features = 72400,
                                stop_words='english', ngram_range=(1, 2), dtype = np.float32)
        features = tfidf.fit_transform(list(X_train.text)).toarray()
        
        ##########
        # 1. equal sample weights, no agreement as feature
        # 2. equal sample weights, agreement as feature
        # 3. agreement as sample weights, no agreement as feature
        
        for alpha in alphas:
            for case in cases:
                
                if case == cases[0]:
                    ### training
                    X_t = features
                            
                    clf = MultinomialNB(alpha = alpha, 
                                    fit_prior=True, 
                                    class_prior=priors).fit(X_t, y_train,
                                                    sample_weight = None)
                    del X_t
                    ### validation
                    X_v = tfidf.transform(list(X_val.text)).toarray()
                    y_pred = clf.predict(X_v)
                    probs = clf.predict_proba(X_v)
                    del X_v
                    
                elif case == cases[1]:
                    ### training
                    X_t = np.concatenate((features, X_train.agreement.values.reshape(-1,1)), axis=1)

                    clf = MultinomialNB(alpha = alpha, 
                                    fit_prior=True, 
                                    class_prior=priors).fit(X_t, y_train,
                                                    sample_weight = None)
                    del X_t
                    ### validation
                    features_val = tfidf.transform(list(X_val.text)).toarray()
                    X_v = np.concatenate((features_val, X_val.agreement.values.reshape(-1,1)), axis=1)
                    y_pred = clf.predict(X_v)
                    probs = clf.predict_proba(X_v)
                    del X_v
                    
                elif case == case[2]:
                    ### training
                    X_t = features
                    sample_weights = X_train.agreement.values.reshape(-1,1)
                            
                    clf = MultinomialNB(alpha = alpha, 
                                    fit_prior=True, 
                                    class_prior=priors).it(X, Y,
                                                    sample_weight = X_train.agreement.values.reshape(-1,1))
                    del X_t
                    ### validation
                    X_v = tfidf.transform(list(X_val.text)).toarray()
                    y_pred = clf.predict(X_v)
                    probs = clf.predict_proba(X_v)
                    del X_v
            
                accuracy = accuracy_score(y_val, y_pred)
                top_k = top_k_accuracy_score(y_val, probs, k=3)
                
                if counter%5 == 0:
                    print(f'Iteration: {counter}/{total_iters}')
                counter=counter+1
                      
                if accuracy > acc:
                    acc = accuracy
                    top_k_acc = top_k
                    best_params = {'alpha': alpha, 
                                   'sublinear_df': sublin_bool, 
                                   'min_df': min_docs, 'case':case}
                    
print(f'Best validation accuracy: {acc}\
        \nTop 3 accuracy %s {top_k_acc }\
        \nBest parameters: {best_params}')

Iteration: 5/120
Iteration: 10/120
Iteration: 15/120
Iteration: 20/120
Iteration: 25/120
Iteration: 30/120
Iteration: 35/120
Iteration: 40/120
Iteration: 45/120
Iteration: 50/120
Iteration: 55/120
Iteration: 60/120
Iteration: 65/120
Iteration: 70/120
Iteration: 75/120
Iteration: 80/120
Iteration: 85/120
Iteration: 90/120
Iteration: 95/120
Iteration: 100/120
Iteration: 105/120
Iteration: 110/120
Iteration: 115/120
Iteration: 120/120
Best validation accuracy: 0.7501037775010377        
Top 3 accuracy %s 0.9113740141137402        
Best parameters: {'alpha': 0.01, 'sublinear_df': False, 'min_df': 2, 'case': 'equal sample weights, agreement as feature'}


In [7]:
del tfidf, features, clf, y_pred, accuracy, best_params

# SVM

In [8]:
C_vals = [0.01, 0.1, 1]
total_iters = len(sublinear_df)*len(min_df)*len(C_vals)*len(cases)

In [9]:
test_values = list(y_train.value_counts().values)
priors = {}
for key in y_train.value_counts().index:
    for value in test_values:
        priors[key] = value
        test_values.remove(value)
        break

In [10]:
acc = 0
counter = 1
for sublin_bool in sublinear_df:
    for min_docs in min_df:
        
        tfidf = TfidfVectorizer(sublinear_tf=sublin_bool, min_df=min_docs, max_features = 72400,
                                stop_words='english', ngram_range=(1, 2), dtype = np.float32)
        features = tfidf.fit_transform(list(X_train.text)).toarray()
        
        ##########
        # 1. equal sample weights, no agreement as feature
        # 2. equal sample weights, agreement as feature
        # 3. agreement as sample weights, no agreement as feature
        
        for C in C_vals:
            for case in cases:
                
                if case == cases[0]:
                    ### training
                    X_t = features
                    clf = LinearSVC(C=C, class_weight=priors).fit(X_t, y_train,
                                                                         sample_weight = None)
                    del X_t
                    ### validation
                    X_v = tfidf.transform(list(X_val.text)).toarray()
                    y_pred = clf.predict(X_v)
                    del X_v
                    
                elif case == cases[1]:
                    ### training
                    X_t = np.concatenate((features, X_train.agreement.values.reshape(-1,1)), axis=1)

                    clf = LinearSVC(C=C, class_weight=priors).fit(X_t, y_train,
                                                                         sample_weight = None)
                    del X_t
                    ### validation
                    features_val = tfidf.transform(list(X_val.text)).toarray()
                    X_v = np.concatenate((features_val, X_val.agreement.values.reshape(-1,1)), axis=1)
                    y_pred = clf.predict(X_v)
                    del X_v
                    
                elif case == case[2]:
                    ### training
                    X_t = features
                    sample_weights = X_train.agreement.values.reshape(-1,1)
                            
                    clf = LinearSVC(C=C, class_weight=priors).fit(X_t, y_train,
                                                                         sample_weight = X_train.agreement.values.reshape(-1,1))
                    del X_t
                    ### validation
                    X_v = tfidf.transform(list(X_val.text)).toarray()
                    y_pred = clf.predict(X_v)
                    del X_v
                
                accuracy = accuracy_score(y_val, y_pred)
                
                if counter%5 == 0:
                    print(f'Iteration: {counter}/{total_iters}')
                counter=counter+1
                      
                if accuracy > acc:
                    acc = accuracy
                    best_params = {'C': C, 'sublinear_df': sublin_bool, 'min_df': min_docs, 'case':case}
                    
print(f'Best validation accuracy: {acc}\n Best parameters: {best_params}')

Iteration: 5/90
Iteration: 10/90
Iteration: 15/90
Iteration: 20/90
Iteration: 25/90
Iteration: 30/90
Iteration: 35/90
Iteration: 40/90
Iteration: 45/90
Iteration: 50/90
Iteration: 55/90
Iteration: 60/90
Iteration: 65/90
Iteration: 70/90
Iteration: 75/90
Iteration: 80/90
Iteration: 85/90
Iteration: 90/90
Best validation accuracy: 0.79244499792445
 Best parameters: {'C': 1, 'sublinear_df': True, 'min_df': 2, 'case': 'equal sample weights, agreement as feature'}


In [11]:
del tfidf, features, clf, y_pred, accuracy, best_params

# Log regression

In [30]:
acc = 0
counter = 1
for sublin_bool in sublinear_df:
    for min_docs in min_df:
        
        tfidf = TfidfVectorizer(sublinear_tf=sublin_bool, min_df=min_docs, max_features = 72400,
                                stop_words='english', ngram_range=(1, 2), dtype = np.float32)
        features = tfidf.fit_transform(list(X_train.text)).toarray()
        
        ##########
        # 1. equal sample weights, no agreement as feature
        # 2. equal sample weights, agreement as feature
        # 3. agreement as sample weights, no agreement as feature
        
        for C in C_vals:
            for case in cases:
                
                if case == cases[0]:
                    ### training
                    X_t = features
                    clf = LogisticRegression(multi_class='ovr', solver='liblinear',
                             C=C, class_weight = priors).fit(X_t, y_train,
                                                                         sample_weight = None)
                    del X_t
                    ### validation
                    X_v = tfidf.transform(list(X_val.text)).toarray()
                    y_pred = clf.predict(X_v)
                    probs = clf.predict_proba(X_v)
                    del X_v
                    
                elif case == cases[1]:
                    ### training
                    X_t = np.concatenate((features, X_train.agreement.values.reshape(-1,1)), axis=1)

                    clf = LogisticRegression(multi_class='ovr', solver='liblinear',
                             C=C, class_weight = priors).fit(X_t, y_train,
                                                                         sample_weight = None)
                    del X_t
                    ### validation
                    features_val = tfidf.transform(list(X_val.text)).toarray()
                    X_v = np.concatenate((features_val, X_val.agreement.values.reshape(-1,1)), axis=1)
                    y_pred = clf.predict(X_v)
                    probs = clf.predict_proba(X_v)
                    del X_v
                    
                elif case == case[2]:
                    ### training
                    X_t = features
                    sample_weights = X_train.agreement.values.reshape(-1,1)
                            
                    clf = LogisticRegression(multi_class='ovr', solver='liblinear',
                             C=C, class_weight = priors).fit(X_t, y_train, sample_weight = X_train.agreement.values.reshape(-1,1))
                    del X_t
                    ### validation
                    X_v = tfidf.transform(list(X_val.text)).toarray()
                    y_pred = clf.predict(X_v)
                    probs = clf.predict_proba(X_v)
                    del X_v
                
                accuracy = accuracy_score(y_val, y_pred)
                top_k = top_k_accuracy_score(y_val, probs, k=3)
                
                if counter%5 == 0:
                    print(f'Iteration: {counter}/{total_iters}')
                counter=counter+1
                      
                if accuracy > acc:
                    acc = accuracy
                    top_k_acc = top_k
                    best_params = {'C': C, 'sublinear_df': sublin_bool, 'min_df': min_docs, 'case':case}
                    
print(f'Best validation accuracy: {acc}\
        \nTop 3 accuracy %s {top_k_acc }\
        \nBest parameters: {best_params}')

Iteration: 5/36
Iteration: 10/36
Iteration: 15/36
Iteration: 20/36
Iteration: 25/36
Iteration: 30/36
Iteration: 35/36
Iteration: 40/36
Iteration: 45/36
Iteration: 50/36
Best validation accuracy: 0.7831050228310502        
Top 3 accuracy %s 0.9315068493150684        
Best parameters: {'C': 1, 'sublinear_df': False, 'min_df': 1, 'case': 'equal sample weights, agreement as feature'}


In [31]:
del tfidf, features, clf, y_pred, accuracy, best_params

# Random Forest

In [6]:
trees = [100]
max_depths = [300, 600]
sublinear_df = [True, False]
min_df = [1, 3, 5]

total_iters = len(sublinear_df)*len(min_df)*len(trees)*len(cases)*len(max_depths)

In [7]:
test_values = list(y_train.value_counts().values)
priors = {}
for key in y_train.value_counts().index:
    for value in test_values:
        priors[key] = value
        test_values.remove(value)
        break

In [None]:
acc = 0
counter = 1
for sublin_bool in sublinear_df:
    for min_docs in min_df:
        
        tfidf = TfidfVectorizer(sublinear_tf=sublin_bool, min_df=min_docs, max_features = 72400,
                                stop_words='english', ngram_range=(1, 2), dtype = np.float32)
        features = tfidf.fit_transform(list(X_train.text)).toarray()
        
        ##########
        # 1. equal sample weights, no agreement as feature
        # 2. equal sample weights, agreement as feature
        # 3. agreement as sample weights, no agreement as feature
        
        for n_estimators in trees:
            for max_depth in max_depths:
                for case in cases:

                    if case == 'equal sample weights, no agreement':
                        ### training
                        X_t = features
                        clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0, max_depth = max_depth, class_weight = priors).fit(X_t, y_train,
                                                                             sample_weight = None)
                        del X_t
                        ### validation
                        X_v = tfidf.transform(list(X_val.text)).toarray()
                        y_pred = clf.predict(X_v)
                        probs = clf.predict_proba(X_v)
                        del X_v

                    if case == 'equal sample weights, agreement as feature':
                        ### training
                        X_t = np.concatenate((features, X_train.agreement.values.reshape(-1,1)), axis=1)

                        clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0,  max_depth = max_depth, class_weight = priors).fit(X_t, y_train,
                                                                             sample_weight = None)
                        del X_t
                        ### validation
                        features_val = tfidf.transform(list(X_val.text)).toarray()
                        X_v = np.concatenate((features_val, X_val.agreement.values.reshape(-1,1)), axis=1)
                        y_pred = clf.predict(X_v)
                        probs = clf.predict_proba(X_v)
                        del X_v

                    if case == 'agreement as sample weights':
                        ### training
                        X_t = features
                        sample_weights = X_train.agreement.values.reshape(-1,1)

                        clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0,  max_depth = max_depth, class_weight = priors).fit(X_t, y_train,
                                                                             sample_weight = X_train.agreement.values)
                        del X_t
                        ### validation
                        X_v = tfidf.transform(list(X_val.text)).toarray()
                        y_pred = clf.predict(X_v)
                        probs = clf.predict_proba(X_v)
                        del X_v

                    accuracy = accuracy_score(y_val, y_pred)
                    top_k = top_k_accuracy_score(y_val, probs, k=3)

                    if accuracy > acc:
                        acc = accuracy
                        top_k_acc = top_k
                        best_params = {'trees': n_estimators, 'sublinear_df': sublin_bool, 
                                       'min_df': min_docs, 'case':case, 'max depth': max_depth}

                    if counter%5 == 0:
                        print(f'Iteration: {counter}/{total_iters}\nBest accuracy:{acc}')

                    counter=counter+1
                    
print(f'Best validation accuracy: {acc}\
        \nTop 3 accuracy %s {top_k_acc }\
        \nBest parameters: {best_params}')

Iteration: 5/36
Best accuracy:0.7141967621419676
Iteration: 10/36
Best accuracy:0.7141967621419676
Iteration: 15/36
Best accuracy:0.7185554171855542
Iteration: 20/36
Best accuracy:0.7185554171855542
Iteration: 25/36
Best accuracy:0.7185554171855542
