In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import TextClassificationPipeline

In [25]:
dfo = pd.read_csv('data/obama_cleaned.csv')
dfo = dfo.rename(columns={'tweets' : 'text', 'class' : 'label'})
dfr = pd.read_csv('data/romney_cleaned.csv')
dfr = dfr.rename(columns={'tweets' : 'text', 'class' : 'label'})
df = pd.concat([dfo, dfr], ignore_index = True)
df.info()
pd.set_option('display.max_rows', None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11271 non-null  object
 1   label   11271 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 176.2+ KB


In [26]:
df = df.astype({'text' : 'string'})

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11271 non-null  string
 1   label   11271 non-null  int64 
dtypes: int64(1), string(1)
memory usage: 176.2 KB


# Pre-trained model: BERTweet

Fine-tuning using our data:

In [107]:
Xo = dfo['text']
yo = dfo['label']
Xo, Xo_test, yo, yo_test = train_test_split(Xo, yo, test_size = 0.2)
Xo_train, Xo_eval, yo_train, yo_eval = train_test_split(Xo, yo, test_size = 0.25)
Xr = dfr['text']
yr = dfr['label']
Xr, Xr_test, yr, yr_test = train_test_split(Xr, yr, test_size = 0.2)
Xr_train, Xr_eval, yr_train, yr_eval = train_test_split(Xo, yo, test_size = 0.25)


traindf_o = pd.concat([Xo_train,yo_train], axis = 1)
traindf_r = pd.concat([Xr_train,yr_train], axis = 1)
evaldf_o = pd.concat([Xo_eval,yo_eval], axis = 1)
evaldf_r = pd.concat([Xr_eval,yr_eval], axis = 1)
testdf_o = pd.concat([Xo_test,yo_test], axis = 1)
testdf_r = pd.concat([Xr_test,yr_test], axis = 1)


 0    405
-1    404
 1    316
Name: label, dtype: int64

In [53]:
train_o = Dataset.from_pandas(traindf_o, split = 'train')
eval_o = Dataset.from_pandas(evaldf_o, split = 'eval')
test_o = Dataset.from_pandas(testdf_o, split = 'test')
train_r = Dataset.from_pandas(traindf_r, split = 'train')
eval_r = Dataset.from_pandas(evaldf_r, split = 'eval')
test_r = Dataset.from_pandas(testdf_r, split = 'test')


In [54]:
tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_train_o = train_o.map(tokenize_function, batched=True)
tokenized_eval_o = eval_o.map(tokenize_function, batched=True)
tokenized_test_o = test_o.map(tokenize_function, batched=True)
tokenized_train_r = train_r.map(tokenize_function, batched=True)
tokenized_eval_r = eval_o.map(tokenize_function, batched=True)
tokenized_test_r = test_r.map(tokenize_function, batched=True)

Map:   0%|          | 0/3374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/3374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1130 [00:00<?, ? examples/s]

1182

In [56]:
model_o = AutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis", num_labels=3)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args_o = TrainingArguments(output_dir="checkpoints/test_trainer_o", evaluation_strategy="epoch", num_train_epochs=8)
trainer_o = Trainer(
    model=model_o,
    args=training_args_o,
    train_dataset=tokenized_train_o,
    eval_dataset=tokenized_eval_o,
    compute_metrics=compute_metrics,
)

trainer_o.train()

{'eval_loss': 0.6172908544540405, 'eval_accuracy': 0.4444444444444444, 'eval_runtime': 11.7037, 'eval_samples_per_second': 96.123, 'eval_steps_per_second': 12.047, 'epoch': 1.0}
{'loss': 0.3784, 'learning_rate': 4.259478672985782e-05, 'epoch': 1.18}
{'eval_loss': 0.39306536316871643, 'eval_accuracy': 0.512, 'eval_runtime': 11.4746, 'eval_samples_per_second': 98.043, 'eval_steps_per_second': 12.288, 'epoch': 2.0}
{'loss': 0.2631, 'learning_rate': 3.518957345971564e-05, 'epoch': 2.37}
{'eval_loss': 0.4201142489910126, 'eval_accuracy': 0.5164444444444445, 'eval_runtime': 11.4244, 'eval_samples_per_second': 98.474, 'eval_steps_per_second': 12.342, 'epoch': 3.0}
{'loss': 0.1607, 'learning_rate': 2.778436018957346e-05, 'epoch': 3.55}
{'eval_loss': 0.538033664226532, 'eval_accuracy': 0.5137777777777778, 'eval_runtime': 11.3675, 'eval_samples_per_second': 98.967, 'eval_steps_per_second': 12.404, 'epoch': 4.0}
{'loss': 0.0858, 'learning_rate': 2.037914691943128e-05, 'epoch': 4.74}
{'eval_loss':

TrainOutput(global_step=3376, training_loss=0.14492685506694125, metrics={'train_runtime': 1323.6464, 'train_samples_per_second': 20.392, 'train_steps_per_second': 2.551, 'train_loss': 0.14492685506694125, 'epoch': 8.0})

In [58]:
trainer_o.save_model('models/model_obama')

In [57]:
model_r = AutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis", num_labels=3)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args_r = TrainingArguments(output_dir="checkpoints/test_trainer_r", evaluation_strategy="epoch", num_train_epochs=8)
trainer_r = Trainer(
    model=model_r,
    args=training_args_r,
    train_dataset=tokenized_train_r,
    eval_dataset=tokenized_eval_r,
    compute_metrics=compute_metrics,
)

trainer_r.train()


{'eval_loss': 0.25792068243026733, 'eval_accuracy': 0.5466666666666666, 'eval_runtime': 11.2133, 'eval_samples_per_second': 100.327, 'eval_steps_per_second': 12.574, 'epoch': 1.0}
{'loss': 0.3632, 'learning_rate': 4.259478672985782e-05, 'epoch': 1.18}
{'eval_loss': 0.20618405938148499, 'eval_accuracy': 0.5724444444444444, 'eval_runtime': 11.3791, 'eval_samples_per_second': 98.866, 'eval_steps_per_second': 12.391, 'epoch': 2.0}
{'loss': 0.25, 'learning_rate': 3.518957345971564e-05, 'epoch': 2.37}
{'eval_loss': 0.2009427696466446, 'eval_accuracy': 0.5884444444444444, 'eval_runtime': 11.6067, 'eval_samples_per_second': 96.927, 'eval_steps_per_second': 12.148, 'epoch': 3.0}
{'loss': 0.1604, 'learning_rate': 2.778436018957346e-05, 'epoch': 3.55}
{'eval_loss': 0.18442708253860474, 'eval_accuracy': 0.6088888888888889, 'eval_runtime': 11.3957, 'eval_samples_per_second': 98.722, 'eval_steps_per_second': 12.373, 'epoch': 4.0}
{'loss': 0.0774, 'learning_rate': 2.037914691943128e-05, 'epoch': 4.74

TrainOutput(global_step=3376, training_loss=0.13952911267348375, metrics={'train_runtime': 1330.9679, 'train_samples_per_second': 20.28, 'train_steps_per_second': 2.537, 'train_loss': 0.13952911267348375, 'epoch': 8.0})

In [59]:
trainer_r.save_model('models/model_romney')

# Load from checkpoints

In [None]:
model_o = AutoModelForSequenceClassification.from_pretrained("checkpoints/test_trainer_o", num_labels=3)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args_o = TrainingArguments(output_dir="test_trainer_o", evaluation_strategy="epoch", num_train_epochs=5)
trainer_o = Trainer(
    model=model_o,
    args=training_args_o,
    train_dataset=tokenized_train_o,
    eval_dataset=tokenized_test_o,
    compute_metrics=compute_metrics,
)

trainer_o.train()

In [None]:
model_r = AutoModelForSequenceClassification.from_pretrained("checkpoints/test_trainer_r", num_labels=3)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args_r = TrainingArguments(output_dir="checkpoints/test_trainer_r", evaluation_strategy="epoch", num_train_epochs=8)
trainer_r = Trainer(
    model=model_r,
    args=training_args_r,
    train_dataset=tokenized_train_r,
    eval_dataset=tokenized_eval_r,
    compute_metrics=compute_metrics,
)

trainer_r.train()

# Load finetuned models

In [92]:
model_o = AutoModelForSequenceClassification.from_pretrained('models/model_obama')
model_r = AutoModelForSequenceClassification.from_pretrained('models/model_romney')

In [124]:

pipe_o = TextClassificationPipeline(model=model_o, tokenizer=tokenizer)
pipe_r = TextClassificationPipeline(model=model_o, tokenizer=tokenizer)


In [111]:
pos = list()
neg = list()
neu = list()
pred_o = pd.DataFrame()
for t in testdf_o['text']:
    pred = pipe_o(t, top_k=None)
    for l in pred:
        if l['label'] == 'POS':
            pos.append(l['score'])
        elif l['label'] == 'NEG':
            neg.append(l['score'])
        else: 
            neu.append(l['score'])

pred_o['pos'] = pos
pred_o['neg'] = neg
pred_o['neu'] = neu


In [125]:
pos = list()
neg = list()
neu = list()
pred_r = pd.DataFrame()
for t in testdf_r['text']:
    pred = pipe_r(t, top_k=None)
    for l in pred:
        if l['label'] == 'POS':
            pos.append(l['score'])
        elif l['label'] == 'NEG':
            neg.append(l['score'])
        else: 
            neu.append(l['score'])

pred_r['pos'] = pos
pred_r['neg'] = neg
pred_r['neu'] = neu


In [119]:
pred_o['class'] = list(yo_test)


Unnamed: 0,pos,neg,neu,pred,class
0,5.5e-05,0.99948,0.000465,-1,-1
1,0.000136,0.000239,0.999625,0,1
2,2.3e-05,0.99925,0.000727,-1,-1
3,9.5e-05,0.999494,0.000412,-1,0
4,1e-05,0.00114,0.998849,0,1
5,5e-06,0.004108,0.995888,0,-1
6,2.9e-05,0.999301,0.00067,-1,-1
7,3.3e-05,0.999264,0.000703,-1,0
8,8.8e-05,0.999487,0.000424,-1,0
9,7e-06,0.99742,0.002573,-1,-1


In [126]:
pred_r['class'] = list(yr_test)

# Predict label using maximum probability


In [122]:
def pred_label(df):
    preds = list()
    for idx, row in df.iterrows():
        if row['pos'] >= row['neu'] and row['pos'] >= row['neg']:
            preds.append(1)
        elif row['neg'] >= row['neu'] and row['neg'] > row['pos']:
            preds.append(-1)
        elif row['neu'] > row['pos'] and row['neu'] > row['neg']:
            preds.append(0)
    df['pred'] = preds
    acc = accuracy_score(df['class'], df['pred'])
    prec = precision_score(df['class'], df['pred'], average = None, zero_division = np.nan)
    rec = recall_score(df['class'], df['pred'], average = None)
    f1 = f1_score(df['class'], df['pred'], average = None)
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1:", f1)
    return df

In [123]:
predicted_labels_o = pred_label(pred_o)


Accuracy: 0.29333333333333333
Precision: [0.44358578 0.08995816        nan]
Recall: [0.7175     0.11345646 0.        ]
F1: [0.54823305 0.10035006 0.        ]


In [127]:
predicted_labels_r = pred_label(pred_r)

Accuracy: 0.4451327433628319
Precision: [0.56387097 0.18591549        nan]
Recall: [0.7319933  0.19760479 0.        ]
F1: [0.63702624 0.191582   0.        ]


In [130]:
predicted_labels_r['pred'].value_counts()

-1    775
 0    355
Name: pred, dtype: int64

# Predict label using ML models

In [None]:
def test_model(model, parameters, X, y, n_splits):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state = 27)
    avg_accuracies = list()
    avg_precisions = list()
    avg_recalls = list()
    avg_f1s = list()
    confs = list()
    for conf in ParameterGrid(parameters):
        print('Testing', conf)
        accuracies = list()
        precisions = list()
        recalls = list()
        f1s = list()
        i = 1
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            try:
                model.set_params(**conf)
                model.fit(X_train, y_train)
            except:
                print('Skipped', conf)
                break
            print('\tFold', i, 'of', n_splits)
            y_pred = model.predict(X_test)
            accuracies.append(accuracy_score(y_test, y_pred))
            precisions.append(precision_score(y_test, y_pred, average=None, zero_division = np.nan))
            recalls.append(recall_score(y_test, y_pred, average=None, zero_division = np.nan))
            f1s.append(f1_score(y_test, y_pred, average=None, zero_division = np.nan))
            if i == 1:
                confs.append(conf)
            i = i + 1
            
    
        if len(accuracies) != 0:  
            avg_accuracies.append(sum(accuracies)/len(accuracies))
            avg_precisions.append((sum(precisions)/len(precisions)) if len(precisions) > 0 else np.nan)
            avg_recalls.append(sum(recalls)/len(recalls) if len(recalls) > 0 else np.nan)
            avg_f1s.append(sum(f1s)/len(f1s) if len(f1s) > 0 else np.nan)
        
    results = {'Parameters' : confs,
              'Accuracy' : avg_accuracies,
              'Precision' : avg_precisions,
              'Recall' : avg_recalls,
              'F1' : avg_f1s}
    
    return pd.DataFrame.from_dict(results), model.classes_
    
        
        

In [None]:
X = df[['pos', 'neg', 'neu']]
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
y.value_counts()

In [None]:
params_svm = {'C' : (0.1, 1, 10, 100),
             'kernel' : ('rbf', 'poly', 'linear'),
             'degree' : (3, 5, 7),
             'gamma' : ('scale', 'auto')}
svm = SVC()
svm_results, cl_svm = test_model(svm, params_svm, X_train, y_train, 4)

In [None]:
svm_results

In [None]:
cl_svm

In [None]:
params_rf = {'n_estimators' : (50, 100, 150),
            'criterion' : ('entropy', 'gini'),
            'max_features' : (None, 'sqrt')}
rf = RandomForestClassifier()
rf_results, cl_rf = test_model(rf, params_rf, X_train, y_train, 4)

In [None]:
rf_results

In [None]:
cl_rf

In [None]:
params_knn = {'n_neighbors' : (1, 3, 5, 7, 9),
             'metric' : ('minkowski', 'euclidean', 'manhattan', 'cosine')}
knn = KNeighborsClassifier()
knn_results, cl_knn = test_model(knn, params_knn, X_train, y_train, 4)

In [None]:
knn_results

In [None]:
cl_knn

In [None]:
params_lr = {'penalty' : ('l1', 'l2', 'elasticnet', None),
            'C' : (0.1, 1, 10, 100),
            'solver' : ('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'),
            'max_iter' : [500]}

lr = LogisticRegression()
lr_results, cl_lr = test_model(lr, params_lr, X_train, y_train, 4)

In [None]:
lr_results

In [None]:
cl_lr