In [8]:
import pandas as pd
import re
import string
from sentence_transformers import SentenceTransformer
import pickle
import os

In [9]:
# Location of the workbook; its first two sheets are loaded below.
data_path = 'test_data/tweets.xlsx'

# Load sheet 0 and sheet 1 without treating any row as a header, so every
# row of each sheet is kept as data.
df_sheet1, df_sheet2 = (
    pd.read_excel(data_path, sheet_name=sheet_idx, header=None)
    for sheet_idx in (0, 1)
)

# Row count of the first sheet. Judging by the name, the first sheet holds
# the Obama tweets -- this is the boundary between the two sheets once they
# are stacked into a single frame.
count_obama = df_sheet1.shape[0]

# Stack both sheets on a fresh 0..n-1 index and label the single column.
# (Like the plain `df.columns = [...]` assignment, this raises if the sheets
# do not have exactly one column.)
df = pd.concat((df_sheet1, df_sheet2), ignore_index=True).set_axis(['tweet'], axis=1)
print(df.shape)
df.head(10)

(9, 1)


Unnamed: 0,tweet
0,"Kirkpatrick, who wore a baseball cap embroider..."
1,Question: If <e>Romney</e> and <e>Obama</e> ha...
2,#<e>obama</e> debates that Cracker Ass Cracker...
3,RT @davewiner Slate: Blame <e>Obama</e> for fo...
4,@Hollivan @hereistheanswer Youre missing the ...
5,Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...
6,Senior <e>Romney</e> Advisor Claims <e>Obama</...
7,.@WardBrenda @shortwave8669 @allanbourdius you...
8,<e>Mitt Romney</e> still doesn't <a>believe</a...


In [10]:
# Normalise the raw tweet text in df['tweet'] before it is embedded.
#
# Missing cells must be blanked using a mask taken BEFORE the string cast:
# on pandas versions where astype(str) maps NaN to the literal text 'nan',
# a later fillna('') finds nothing to fill and empty rows would be embedded
# as the word "nan".
raw_tweets = df['tweet']
df['tweet'] = raw_tweets.astype(str).mask(raw_tweets.isna(), '')

# Strip markup and Twitter-specific noise. Order matters: tags, mentions,
# URLs and &-entities have to go before the generic punctuation pass, which
# would otherwise destroy the characters these patterns anchor on.
df['tweet'] = df['tweet'].str.replace(r'<[^>]*>', '', regex=True)  # tags such as <e>...</e>, <a>...</a>
df['tweet'] = df['tweet'].str.replace(r'@\w+', '', regex=True)  # @mentions
df['tweet'] = df['tweet'].str.replace(r'http\S+', '', regex=True)  # URLs starting with http
df['tweet'] = df['tweet'].str.replace(r'&\S+', '', regex=True)  # tokens starting with & (e.g. HTML entities)
df['tweet'] = df['tweet'].str.replace('"', '', regex=False)  # literal double quotes
df['tweet'] = df['tweet'].str.replace('#', '', regex=False)  # literal hash symbols

# Drop every non-ASCII character (all values are str at this point).
df['tweet'] = df['tweet'].apply(lambda text: text.encode('ascii', 'ignore').decode('ascii'))

# Remove the remaining ASCII punctuation. Characters are deleted rather than
# replaced by a space, so e.g. "doesn't" becomes "doesnt".
# NOTE(review): this also glues together words separated only by punctuation;
# presumably this matches the preprocessing the downstream model was trained
# with, so it is left unchanged -- confirm before altering.
df['tweet'] = df['tweet'].str.replace(f"[{re.escape(string.punctuation)}]", '', regex=True)

# Trim the ends and collapse whitespace runs left behind by the removals.
df['tweet'] = df['tweet'].str.strip().replace(r'\s+', ' ', regex=True)

print(df.shape)
df.head(10)

(9, 1)


Unnamed: 0,tweet
0,Kirkpatrick who wore a baseball cap embroidere...
1,Question If Romney and Obama had a childpunchi...
2,obama debates that Cracker Ass Cracker tonight...
3,RT Slate Blame Obama for four deaths in Libya ...
4,Youre missing the point Im afraid you do not u...
5,InsidiousMitt Romneys Bain Helped Philip Morri...
6,Senior Romney Advisor Claims Obama Administrat...
7,you mean like romney cheated in primary
8,Mitt Romney still doesnt believe that we have ...


In [11]:
# Pull the cleaned tweet column out as a 1-D NumPy array for the encoder.
tweet_vector = df.loc[:, 'tweet'].to_numpy()
tweet_vector.shape

(9,)

In [12]:
# Sentence-embedding model used to turn each cleaned tweet into a vector.
model = SentenceTransformer(
    "all-mpnet-base-v2"
)

# One embedding row per tweet.
# NOTE(review): despite the `train_` prefix, these embed the tweets held in
# `tweet_vector`; the name is kept because later cells refer to it.
train_embeddings = model.encode(
    tweet_vector
)
train_embeddings.shape

(9, 768)

In [13]:
# Restore the pickled classifier from disk.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
with open('models/SVM.pkl', 'rb') as model_file:
    svm_model = pickle.loads(model_file.read())

# Classify every embedded tweet, then shift each predicted label down by one.
# NOTE(review): the shift presumably maps the model's class ids onto the label
# set expected in the result files -- confirm against the training code.
predictions = svm_model.predict(train_embeddings)
final_results = predictions - 1

final_results.shape

(9,)

In [14]:
# Make sure the output directory exists (no error if it already does).
os.makedirs('results', exist_ok=True)

# The first `count_obama` labels go to obama.txt and the remainder to
# romney.txt, one label per line. Files are written in that order and any
# existing content is overwritten.
label_files = (
    ('results/obama.txt', final_results[:count_obama]),
    ('results/romney.txt', final_results[count_obama:]),
)
for out_path, labels in label_files:
    with open(out_path, 'w') as out_file:
        out_file.writelines(f"{label}\n" for label in labels)