In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import os
from sklearn import *
import time
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, classification_report, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax


In [9]:
custom_stopwords = [
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 
    'can', 'could','did', 'do', 'does', 'doing', 'down', 'during', 
    'each', 'few', 'for', 'from', 'further', 
    'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 
    'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself', 
    'me', 'more', 'most', 'my', 'myself', 
    'nor', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 
    'same', 'she', 'should', 'so', 'some', 'such', 
    'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', 
    'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 
    'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 
    'you', 'your', 'yours', 'yourself', 'yourselves'
]

def pre_processing(tweets_raw):
	cleanTweets = []
	for tweet in tweets_raw:
		tweet = str(tweet).lower() #convert to lowercase
		tweet = re.sub(r"let\'s'", "lets", tweet)
		tweet = re.sub(r"won\'t", "will not", tweet)
		tweet = re.sub(r"can\'t", "can not", tweet)
		tweet = re.sub(r"n\'t", " not", tweet)
		tweet = re.sub(r"\'re", " are", tweet)
		tweet = re.sub(r"\'s", " is", tweet)
		tweet = re.sub(r"\'d", " would", tweet)
		tweet = re.sub(r"\'ll", " will", tweet)
		tweet = re.sub(r"\'t", " not", tweet)
		tweet = re.sub(r"\'ve", " have", tweet)
		tweet = re.sub(r"\'d", " had", tweet)
		tweet = re.sub(r"\'m", " am", tweet)
		tweet = re.sub(r"-", " ", tweet)
		tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)
		tweet = re.sub(r'@\w+', '', tweet)
		tweet = re.sub('<[^<]+?>', '', tweet) #remove HTML tags
		tweet = re.sub(r'[<>!#@$:.,%?\-]+', r'', tweet) #remove punctuation and special characters 
		tweet = tweet.replace(" rt ", "")
		words = tweet.split()
# 		words = [word for word in words if (word in spell or not spell.unknown([word]) or word == 'obama') and word.isalpha()]
		lemmatizer = WordNetLemmatizer()
		words = [lemmatizer.lemmatize(word) for word in words if word not in custom_stopwords and word.isalpha()]
		cleanTweet = ' '.join(words).replace("'", "").replace('“', "").replace('"', "")
		cleanTweets.append(cleanTweet)
	return cleanTweets

In [19]:
training_file_obama = "Obama_Training_Data_Preprocessed.csv"
training_file_romney = "Romney_Training_Data_Preprocessed.csv"
df_obama = pd.read_csv(training_file_obama)
df_romney = pd.read_csv(training_file_romney)

d={'Neutral':0,'Positive':1,'Negative':1}

df_obama.columns = ['id','RawTweet','Class']
df_romney.columns = ['id','RawTweet','Class']
df_obama['Class'] = df_obama['Class'].map(d)
df_romney['Class'] = df_romney['Class'].map(d)

obama_tweets_raw = df_obama['RawTweet']
obama_class = df_obama['Class']
romney_tweets_raw = df_romney['RawTweet']
romney_class = df_romney['Class']

obama_tweets = obama_tweets_raw.tolist()
romney_tweets = romney_tweets_raw.tolist()
obama_class_train = obama_class.tolist()
romney_class_train = romney_class.tolist()

# romney_tweets = pre_processing(romney_tweets_raw) 
# obama_tweets = pre_processing(obama_tweets_raw) 

In [20]:
tfidf_vectorizer_obama = TfidfVectorizer()

X_obama = tfidf_vectorizer_obama.fit_transform(obama_tweets)  # Fit and transform Obama's training data

clf_obama = MultinomialNB()
clf_obama.fit(X_obama, obama_class_train)

In [21]:
pipeline = make_pipeline(tfidf_vectorizer_obama, clf_obama)

# Perform 5-fold cross-validation
# Here 'obama_class_train' should be the labels for your training data
scores = cross_validate(pipeline, obama_tweets, obama_class_train, cv=5,
                        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
                        return_train_score=False)

# Print the scores
print("OBAMA MODEL")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2))
print("Precision: %0.2f (+/- %0.2f)" % (scores['test_precision_macro'].mean(), scores['test_precision_macro'].std() * 2))
print("Recall: %0.2f (+/- %0.2f)" % (scores['test_recall_macro'].mean(), scores['test_recall_macro'].std() * 2))
print("F1 Score: %0.2f (+/- %0.2f)" % (scores['test_f1_macro'].mean(), scores['test_f1_macro'].std() * 2))

OBAMA MODEL
Accuracy: 0.67 (+/- 0.01)
Precision: 0.68 (+/- 0.05)
Recall: 0.54 (+/- 0.02)
F1 Score: 0.49 (+/- 0.03)


In [22]:
tfidf_vectorizer_romney = TfidfVectorizer()

X_romney = tfidf_vectorizer_romney.fit_transform(romney_tweets)

clf_romney = MultinomialNB()
clf_romney.fit(X_romney, romney_class_train)

In [23]:
pipeline = make_pipeline(tfidf_vectorizer_romney, clf_romney)

# Perform 5-fold cross-validation
# Here 'obama_class_train' should be the labels for your training data
scores = cross_validate(pipeline, romney_tweets, romney_class_train, cv=5,
                        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
                        return_train_score=False)

# Print the scores
print("ROMNEY MODEL")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2))
print("Precision: %0.2f (+/- %0.2f)" % (scores['test_precision_macro'].mean(), scores['test_precision_macro'].std() * 2))
print("Recall: %0.2f (+/- %0.2f)" % (scores['test_recall_macro'].mean(), scores['test_recall_macro'].std() * 2))
print("F1 Score: %0.2f (+/- %0.2f)" % (scores['test_f1_macro'].mean(), scores['test_f1_macro'].std() * 2))

ROMNEY MODEL
Accuracy: 0.71 (+/- 0.01)
Precision: 0.65 (+/- 0.08)
Recall: 0.51 (+/- 0.01)
F1 Score: 0.45 (+/- 0.02)


In [24]:

# Load the testing data from Excel
testing_file_o = "/Users/danielabraham/Downloads/final-testData-no-label-tweets/final-testData-no-label-Obama-tweets.xlsx"
testing_file_r = "/Users/danielabraham/Downloads/final-testData-no-label-tweets/final-testData-no-label-Romney-tweets.xlsx"
df_obama_test = pd.read_excel(testing_file_o, sheet_name='Obama', usecols="A,B")
df_romney_test = pd.read_excel(testing_file_r, sheet_name='Romney', usecols="A,B")

# Rename columns
df_obama_test.columns = ['TID','RawTweet']
df_romney_test.columns = ['TID','RawTweet']

obama_tweets_raw_test = df_obama_test['RawTweet'].tolist()
romney_tweets_raw_test = df_romney_test['RawTweet'].tolist()

obama_test_tweets = pre_processing(obama_tweets_raw_test)
romney_test_tweets = pre_processing(romney_tweets_raw_test)




In [25]:
X_obama_test = tfidf_vectorizer_obama.transform(obama_test_tweets)
X_romney_test = tfidf_vectorizer_romney.transform(romney_test_tweets)

In [26]:

# Make predictions
romney_test_predictions = clf_romney.predict(X_romney_test)

# Combine predictions with tweet IDs
output_df = pd.DataFrame({
    'tweet_id': df_romney_test['TID'],
    'predicted_class': romney_test_predictions
})

# Write the DataFrame to a text file
output_df.to_csv('/Users/danielabraham/Desktop/romney_predictions.txt', sep=' ', index=False, header=False)

In [27]:

# Make predictions
obama_test_predictions = clf_obama.predict(X_obama_test)

# Combine predictions with tweet IDs
output_df = pd.DataFrame({
    'tweet_id': df_obama_test['TID'],
    'predicted_class': obama_test_predictions
})

# Write the DataFrame to a text file
output_df.to_csv('/Users/danielabraham/Desktop/obama_predictions.txt', sep=' ', index=False, header=False)

In [296]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

sc = {'positive': [], 'neutral': [], 'negative': []}

for tweet in obama_tweets:
    encoded_input = tokenizer(tweet, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]

    for i in range(scores.shape[0]):
        l = config.id2label[ranking[i]]
        s = scores[ranking[i]]
        sc[l].append(s)

Downloading config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [297]:
df_obama['positive'] = sc['positive']
df_obama['neutral'] = sc['neutral']
df_obama['negative'] = sc['negative']
df_obama.rename(columns={'positive': 'pos', 'neutral': 'neu', 'negative': 'neg'}, inplace=True)

In [298]:
preds = list()
for idx, row in df_obama.iterrows():
    if row['pos'] >= row['neu'] and row['pos'] >= row['neg']:
        preds.append(1)
    elif row['neg'] >= row['neu'] and row['neg'] > row['pos']:
        preds.append(-1)
    elif row['neu'] > row['pos'] and row['neu'] > row['neg']:
        preds.append(0)
df_obama['pred'] = preds

In [301]:
acc = accuracy_score(obama_class_train, df_obama['pred'])
prec = precision_score(obama_class_train, df_obama['pred'], average = None, zero_division = np.nan)
rec = recall_score(obama_class_train, df_obama['pred'], average = None)
f1 = f1_score(obama_class_train, df_obama['pred'], average = None)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)

Accuracy: 0.5436722983742429
Precision: [0.61886136 0.45776976 0.71826625]
Recall: [0.63713647 0.6979835  0.24986537]
F1: [0.62786596 0.55291341 0.37075509]
