In [11]:
import pandas as pd
import re
import string
from sentence_transformers import SentenceTransformer
import pickle
import os

In [12]:
# NOTE(review): data_path is assigned but never read in this cell; the two
# workbooks below are what actually get loaded. Confirm whether it is still needed.
data_path='test_data/tweets.xlsx'

# Each candidate's tweets live in a separate workbook: take the first sheet of
# each and treat the first row as data (header=None).
excel_sources = ("test_data/final_Obama.xlsx", "test_data/final_Romney.xlsx")
df_sheet1, df_sheet2 = (
    pd.read_excel(source, sheet_name=0, header=None) for source in excel_sources
)

# Remember how many rows came from the first (Obama) workbook so the combined
# predictions can be split back per candidate later on.
count_obama = len(df_sheet1)

# Stack the two frames with a fresh 0..n-1 index and name the two columns.
df = pd.concat([df_sheet1, df_sheet2], ignore_index=True)
df.columns = ['id', 'tweet']
print(df.shape)
df.head(10)

(3851, 2)


Unnamed: 0,id,tweet
0,1,<e>Obama</e> has to maintain his professionali...
1,2,<e>Obama</e> went into the debate swinging and...
2,3,Ditto. I started @247LS 4 years ago. RT @bmorr...
3,4,I absolutely love <e>Obama</e>'s view in <a>im...
4,5,I'm agreeing completely with <e>Obama</e>'s st...
5,6,<e>Obama</e>'s <a>smile</a> makes me happy.
6,7,Hahahahahaahahha<e> Obama</e>'s <a>rebuttal</a...
7,8,If you think the economy has gotten worse duri...
8,9,<e>Obama</e>'s <a>debate performance</a> tonig...
9,10,I like the fact that it's not within <e>Obama<...


In [13]:
def clean_tweets(tweets):
    """Return a cleaned copy of a Series of raw tweet strings.

    Steps, applied in this order:
      1. Missing values become empty strings, then everything is cast to str.
         (Filling BEFORE the cast matters: ``astype(str)`` would otherwise turn
         NaN into the literal text ``'nan'`` and ``fillna`` would have nothing
         left to fill.)
      2. Remove ``<...>`` tags, ``@mentions``, tokens starting with ``http``,
         tokens starting with ``&``, double quotes and ``#`` characters.
      3. Drop every non-ASCII character.
      4. Remove all characters in ``string.punctuation``.
      5. Strip leading/trailing whitespace and collapse whitespace runs to a
         single space.

    Parameters
    ----------
    tweets : pandas.Series
        Raw tweet text; may contain missing values.

    Returns
    -------
    pandas.Series
        Cleaned text, same index as the input.
    """
    cleaned = tweets.fillna('').astype(str)

    # Order is significant: tags are removed first so that their contents
    # (e.g. the entity name inside <e>...</e>) survive as plain text.
    removal_patterns = (
        r'<[^>]*>',   # HTML-like tags
        r'@\w+',      # mentions starting with @
        r'http\S+',   # URLs starting with http
        r'&\S+',      # strings starting with &
        r'"',         # double quotes
        r'#',         # hash symbols
    )
    for pattern in removal_patterns:
        cleaned = cleaned.str.replace(pattern, '', regex=True)

    # Remove non-ASCII characters
    cleaned = cleaned.apply(lambda text: text.encode('ascii', 'ignore').decode('ascii'))

    # Remove punctuation
    cleaned = cleaned.str.replace(f"[{re.escape(string.punctuation)}]", '', regex=True)

    # Remove extra spaces that may result from the removals above
    return cleaned.str.strip().str.replace(r'\s+', ' ', regex=True)


df['tweet'] = clean_tweets(df['tweet'])

print(df.shape)
df.head(10)

(3851, 2)


Unnamed: 0,id,tweet
0,1,Obama has to maintain his professionalism thro...
1,2,Obama went into the debate swinging and came o...
2,3,Ditto I started 4 years ago RT I work for a sm...
3,4,I absolutely love Obamas view in immigration h...
4,5,Im agreeing completely with Obamas stance on i...
5,6,Obamas smile makes me happy
6,7,Hahahahahaahahha Obamas rebuttal got actual cr...
7,8,If you think the economy has gotten worse duri...
8,9,Obamas debate performance tonight about 100000...
9,10,I like the fact that its not within Obamas com...


In [14]:
# Pull the cleaned tweet text out of the frame as a plain NumPy array,
# which is what gets handed to the sentence encoder.
tweet_column = df['tweet']
tweet_vector = tweet_column.to_numpy()
tweet_vector.shape

(3851,)

In [15]:
# Encode every cleaned tweet with the sentence-transformers checkpoint below.
# NOTE(review): despite its name, `train_embeddings` holds embeddings of the
# tweets loaded in this notebook (from test_data/); the name is kept because
# later cells read it.
encoder_name = "all-mpnet-base-v2"
model = SentenceTransformer(encoder_name)
train_embeddings = model.encode(tweet_vector)
train_embeddings.shape

(3851, 768)

In [16]:
# Load the pickled classifier from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load models/SVM.pkl if it comes from a trusted source.
with open('models/SVM.pkl', 'rb') as model_file:
    svm_model = pickle.load(model_file)

# Predict one label per embedding row, then shift every label down by one.
# NOTE(review): presumably the model emits 0/1/2 and the submission format
# wants -1/0/1 - confirm against the training notebook.
predictions = svm_model.predict(train_embeddings)
final_results = predictions - 1

final_results.shape

(3851,)

In [17]:
# Write every prediction to a single results file as a Lisp-style list:
#   header line, one "(<1-based position> <label>)" line per tweet, then "))".
os.makedirs('results', exist_ok=True)

# The file is opened once (instead of one 'w' plus two 'a' opens) and with an
# explicit UTF-8 encoding: the header contains a non-ASCII character, so relying
# on the platform default encoding could fail or produce different bytes.
# NOTE(review): the header uses a typographic quote (U+2018) rather than an
# ASCII apostrophe; it is kept byte-for-byte - confirm the expected format.
with open('results/DavideEttori_AngeloZangari.txt', 'w', encoding='utf-8') as complete_file:
    complete_file.write("(setf x ‘(\n")
    for count, label in enumerate(final_results, start=1):
        complete_file.write(f"({count} {label})\n")
    complete_file.write("))")

In [18]:
def write_label_file(path, labels):
    """Write `labels` to `path` as a Lisp-style list, numbering entries from 1.

    The file consists of a header line, one "(<position> <label>)" line per
    label, and a closing "))". Any existing file at `path` is overwritten.

    Parameters
    ----------
    path : str
        Destination file path.
    labels : iterable
        Labels to write, in order; each is formatted with ``str()``.
    """
    # Explicit UTF-8: the header contains a non-ASCII character, so the
    # platform default encoding must not be relied on.
    # NOTE(review): the header uses a typographic quote (U+2018) rather than an
    # ASCII apostrophe; it is kept byte-for-byte - confirm the expected format.
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write("(setf x ‘(\n")
        for position, label in enumerate(labels, start=1):
            out_file.write(f"({position} {label})\n")
        out_file.write("))")


os.makedirs('results', exist_ok=True)

# The first `count_obama` predictions belong to the Obama workbook, the rest to
# the Romney workbook; numbering restarts at 1 in each file.
write_label_file('results/Obama.txt', final_results[:count_obama])
write_label_file('results/Romney.txt', final_results[count_obama:])