In [5]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

import NER_sentiment

In [47]:
loaded_model = BertForSequenceClassification.from_pretrained('bias_bert_model').to(device)
loaded_tokenizer = BertTokenizer.from_pretrained('bias_bert_model')

In [44]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [67]:
def vectorize(claim):
    tokenized = loaded_tokenizer(claim, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    

    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)
    token_type_ids = tokenized['token_type_ids'].to(device) if 'token_type_ids' in tokenized else None
    

    with torch.no_grad():
        outputs = loaded_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = outputs.logits
        prediction= torch.argmax(logits, dim=-1).cpu()

    
    
    sentiment = NER_sentiment.get_sentiment(claim)
    
    return [prediction.item(), sentiment]

# Classification

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import os
import pandas as pd

In [80]:
current_directory = os.getcwd()

parent = os.path.join(current_directory, '..')

# Specify the file name in the parent directory
fp = os.path.join(parent, 'liar_plus/train2.tsv')

In [99]:
df = pd.read_csv(fp, delimiter='\t', header = None)

df = df.drop(columns = [0])


df.rename({1: 'id', 2: 'label', 3: 'statement', 4: 'subject', 5: 'speaker', 6: 'job-title',
           7: 'state_info', 8: 'party_affiliation', 9: 'barely_true_counts', 10: 'false_counts',
           11: 'half_true_counts', 12: 'mostly_true_counts', 13: 'pants_on_fire_counts', 14: 'context',
           15: 'justification'
          }, axis = 1, inplace = True)

df = df[~df['statement'].isna()]

uninformative = {'organization', 'newsmaker', 'activist', 'state-official', 'government-body',
'journalist', 'columnist', 'talk-show-host', 'education-official', 'business-leader', 
 'Moderate', 'democratic-farmer-labor', 'ocean-state-tea-party-action', 'none', 'labor-leader' }

df_bias = df[~df['party_affiliation'].isin(uninformative)]
df_bias = df_bias[~df_bias['party_affiliation'].isna()]

In [103]:
df_bias['party_affiliation'].replace({'republican': 0, 'democrat': 2, 'independent': 1, 'libertarian': 0,
       'tea-party-member': 0, 'green': 2, 'liberal-party-canada': 1,
       'constitution-party': 0}, inplace = True)

In [85]:
df['vectorized'] = df['statement'].apply(vectorize)

In [89]:
X, y = df['vectorized'].to_list(), df['label']


X_train, X_test, y_train, y_test = (
    train_test_split(X, y, test_size=.2)
)

In [96]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", #"Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]
         #"Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
#     GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# TODO (Apply): All cross-validation

max_score = 0.0
max_class = ''
# iterate over classifiers
for name, clf in zip(names, classifiers):
    
    print(clf)
    
    clf.fit(X_train, y_train)
    score = 100.0 * clf.score(X_test, y_test)
    print('Classifier = %s, Score (test, accuracy) = %.2f,' %(name, score))
    
    if score > max_score:
        clf_best = clf
        max_score = score
        max_class = name

print(80*'-' )
print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %(max_class, max_score))

KNeighborsClassifier(n_neighbors=2)
Classifier = Nearest Neighbors, Score (test, accuracy) = 18.07,
SVC(C=0.025, kernel='linear')
Classifier = Linear SVM, Score (test, accuracy) = 20.17,
SVC(C=1, gamma=2)
Classifier = RBF SVM, Score (test, accuracy) = 21.39,
DecisionTreeClassifier(max_depth=5)
Classifier = Decision Tree, Score (test, accuracy) = 21.14,
RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10)
Classifier = Random Forest, Score (test, accuracy) = 22.71,
MLPClassifier(alpha=1, max_iter=1000)
Classifier = Neural Net, Score (test, accuracy) = 20.02,
AdaBoostClassifier()
Classifier = AdaBoost, Score (test, accuracy) = 21.44,
--------------------------------------------------------------------------------
Best --> Classifier = Random Forest, Score (test, accuracy) = 22.71


# Different affiliation model

In [105]:
X, y = df_bias['statement'], df_bias['party_affiliation']


X_train, X_test, y_train, y_test = (
    train_test_split(X, y, test_size=.2)
)

tfidf_bias = TfidfVectorizer()
X_train = tfidf_bias.fit_transform(X_train)

X_test = tfidf_bias.transform(X_test)


In [106]:
bias_model = SVC(gamma=2, C=1)
bias_model.fit(X_train, y_train)

SVC(C=1, gamma=2)

In [108]:
(bias_model.predict(X_test) == y_test).mean()

0.6509023024268824

In [114]:
def vectorize_2(claim):
    tokenized = tfidf_bias.transform([claim])
    
    prediction = bias_model.predict(tokenized)

    sentiment = NER_sentiment.get_sentiment(claim)
    
    return [prediction, sentiment]

In [119]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", #"Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]
         #"Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
#     GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

df['vectorized_2'] = df['statement'].apply(vectorize_2)

X, y = df['vectorized_2'].to_list(), df['label']


X_train, X_test, y_train, y_test = (
    train_test_split(X, y, test_size=.2)
)

max_score = 0.0
max_class = ''
# iterate over classifiers
for name, clf in zip(names, classifiers):
    
    print(clf)
    
    clf.fit(X_train, y_train)
    score = 100.0 * clf.score(X_test, y_test)
    print('Classifier = %s, Score (test, accuracy) = %.2f,' %(name, score))
    
    if score > max_score:
        clf_best = clf
        max_score = score
        max_class = name

print(80*'-' )
print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %(max_class, max_score))

KNeighborsClassifier(n_neighbors=2)
Classifier = Nearest Neighbors, Score (test, accuracy) = 19.82,
SVC(C=0.025, kernel='linear')


  array = np.asarray(array, order=order, dtype=dtype)
  return f(*args, **kwargs)
  array = np.asarray(array, order=order, dtype=dtype)
  return f(*args, **kwargs)


Classifier = Linear SVM, Score (test, accuracy) = 21.39,
SVC(C=1, gamma=2)
Classifier = RBF SVM, Score (test, accuracy) = 21.58,
DecisionTreeClassifier(max_depth=5)
Classifier = Decision Tree, Score (test, accuracy) = 21.24,
RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10)
Classifier = Random Forest, Score (test, accuracy) = 21.19,
MLPClassifier(alpha=1, max_iter=1000)


  array = np.asarray(array, order=order, dtype=dtype)
  return f(*args, **kwargs)
  array = np.asarray(array, order=order, dtype=dtype)


Classifier = Neural Net, Score (test, accuracy) = 21.14,
AdaBoostClassifier()
Classifier = AdaBoost, Score (test, accuracy) = 21.00,
--------------------------------------------------------------------------------
Best --> Classifier = RBF SVM, Score (test, accuracy) = 21.58


  array = np.asarray(array, order=order, dtype=dtype)
