In [1]:
import pandas as pd
import re

# NOTE(review): presumably a seed for randomized steps; none of the cells visible
# here read it -- confirm it is used further down or remove it.
RANDOM_SEED = 69

0	negative
1	neutral
2	positive

In [16]:
# Input locations: for every split, one file with the tweet texts and one with
# the labels, assigned pairwise as (text file, labels file).
train_text_path, train_labels_path = (
    'Datasets/sem_eval_2018/train_text.txt',
    'Datasets/sem_eval_2018/train_labels.txt',
)

val_text_path, val_labels_path = (
    'Datasets/sem_eval_2018/val_text.txt',
    'Datasets/sem_eval_2018/val_labels.txt',
)

test_text_path, test_labels_path = (
    'Datasets/sem_eval_2018/test_text.txt',
    'Datasets/sem_eval_2018/test_labels.txt',
)

In [17]:
# Labels file: each line is stripped and parsed as an integer.
with open(train_labels_path, "r", encoding="utf-8") as labels_file:
    train_labels = list(map(int, map(str.strip, labels_file)))

# Text file: each line is one tweet, with surrounding whitespace trimmed.
with open(train_text_path, "r", encoding="utf-8") as text_file:
    train_texts = list(map(str.strip, text_file))

# Both files must describe the same number of examples.
assert len(train_labels) == len(train_texts), "Mismatch between labels and texts!"

train_df = pd.DataFrame(dict(text=train_texts, label=train_labels))

print(train_df.shape)
train_df

(45615, 2)


Unnamed: 0,text,label
0,"""QT @user In the original draft of the 7th boo...",2
1,"""Ben Smith / Smith (concussion) remains out of...",1
2,Sorry bout the stream last night I crashed out...,1
3,Chase Headley's RBI double in the 8th inning o...,1
4,@user Alciato: Bee will invest 150 million in ...,2
...,...,...
45610,"@user \""""So amazing to have the beautiful Lady...",2
45611,"9 September has arrived, which means Apple's n...",2
45612,Leeds 1-1 Sheff Wed. Giuseppe Bellusci securin...,2
45613,@user no I'm in hilton head till the 8th lol g...,1


In [18]:
# Labels file: each line is stripped and parsed as an integer.
with open(val_labels_path, "r", encoding="utf-8") as labels_file:
    val_labels = list(map(int, map(str.strip, labels_file)))

# Text file: each line is one tweet, with surrounding whitespace trimmed.
with open(val_text_path, "r", encoding="utf-8") as text_file:
    val_texts = list(map(str.strip, text_file))

# Both files must describe the same number of examples.
assert len(val_labels) == len(val_texts), "Mismatch between labels and texts!"

val_df = pd.DataFrame(dict(text=val_texts, label=val_labels))

print(val_df.shape)
val_df

(2000, 2)


Unnamed: 0,text,label
0,Dark Souls 3 April Launch Date Confirmed With ...,1
1,"""National hot dog day, national tequila day, t...",2
2,When girls become bandwagon fans of the Packer...,0
3,@user I may or may not have searched it up on ...,1
4,Here's your starting TUESDAY MORNING Line up a...,1
...,...,...
1995,"""LONDON (AP) """" Prince George celebrates his s...",1
1996,Harper's Worst Offense against Refugees may be...,1
1997,Hold on... Sam Smith may do the theme to Spect...,2
1998,Gonna watch Final Destination 5 tonight. I alw...,1


In [19]:
# Labels file: each line is stripped and parsed as an integer.
with open(test_labels_path, "r", encoding="utf-8") as labels_file:
    test_labels = list(map(int, map(str.strip, labels_file)))

# Text file: each line is one tweet, with surrounding whitespace trimmed.
with open(test_text_path, "r", encoding="utf-8") as text_file:
    test_texts = list(map(str.strip, text_file))

# Both files must describe the same number of examples.
assert len(test_labels) == len(test_texts), "Mismatch between labels and texts!"

test_df = pd.DataFrame(dict(text=test_texts, label=test_labels))

print(test_df.shape)
test_df

(12284, 2)


Unnamed: 0,text,label
0,@user @user what do these '1/2 naked pics' hav...,1
1,OH: “I had a blue penis while I was this” [pla...,1
2,"@user @user That's coming, but I think the vic...",1
3,I think I may be finally in with the in crowd ...,2
4,"@user Wow,first Hugo Chavez and now Fidel Cast...",0
...,...,...
12279,Sentinel Editorial: FBI’s Comey ‘had no one of...,1
12280,perfect pussy clips #vanessa hudgens zac efron...,1
12281,#latestnews 4 #newmexico #politics + #nativeam...,1
12282,Trying to have a conversation with my dad abou...,0


In [20]:
def clean_tweet(tweet):
    """Normalize a raw tweet into lowercase text without URLs, mentions or hashtags.

    Steps, in order:
      1. remove URLs (any token starting with ``http``);
      2. remove ``@mention`` tokens;
      3. remove ``#hashtag`` tokens (the whole tag, not only the ``#``);
      4. remove every character that is not an ASCII letter, whitespace or one
         of ``. , ! ? ; : ' " -`` -- digits, emoji and typographic quotes are
         dropped by this step as well;
      5. collapse the whitespace runs left behind by the removals above into a
         single space;
      6. lowercase and strip leading/trailing whitespace.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: The cleaned tweet; an empty string when nothing survives cleaning.
    """
    tweet = re.sub(r'http\S+', '', tweet)  # Remove URLs
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)  # Remove @ mentions
    tweet = re.sub(r'#[A-Za-z0-9_]+', '', tweet)  # Remove hashtags
    tweet = re.sub(r'[^A-Za-z\s.,!?;:\'\"-]', '', tweet)  # Keep letters, whitespace and basic punctuation only
    # Deleting tokens leaves gaps such as "hello   world"; squeeze them so the
    # cleaned text does not carry artificial runs of spaces.
    tweet = re.sub(r'\s+', ' ', tweet)
    return tweet.lower().strip()  # Lowercase and remove leading/trailing spaces

In [21]:
# Run every split's tweets through clean_tweet, overwriting the "text" column.
for split_df in (train_df, val_df, test_df):
    split_df["text"] = split_df["text"].apply(clean_tweet)

In [22]:
def _drop_missing_text(frame):
    """Return `frame` without rows whose text is NA or "", with a fresh 0..n-1 index."""
    kept = frame.dropna(subset=['text'])
    kept = kept[kept['text'] != ""]
    return kept.reset_index(drop=True)


# Report the row/column counts going in.
print("Shape of each dataset before dropping NA:")
for split_df in (train_df, val_df, test_df):
    print(split_df.shape)

# Filter each split and renumber its index.
train_df = _drop_missing_text(train_df)
val_df = _drop_missing_text(val_df)
test_df = _drop_missing_text(test_df)

# Report the row/column counts coming out.
print("\nShape of each dataset after dropping NA:")
for split_df in (train_df, val_df, test_df):
    print(split_df.shape)


Shape of each dataset before dropping NA:
(45615, 2)
(2000, 2)
(12284, 2)

Shape of each dataset after dropping NA:
(45614, 2)
(2000, 2)
(12282, 2)


In [23]:
# Destination CSV for each cleaned split.
train_output_csv = 'Datasets/sem_eval_2018/sem_eval_2018_train.csv'
val_output_csv = 'Datasets/sem_eval_2018/sem_eval_2018_val.csv'
test_output_csv = 'Datasets/sem_eval_2018/sem_eval_2018_test.csv'

# Write the splits in train, val, test order, without the index column.
for split_df, csv_path in (
    (train_df, train_output_csv),
    (val_df, val_output_csv),
    (test_df, test_output_csv),
):
    split_df.to_csv(csv_path, index=False, encoding="utf-8")

In [15]:
# Reload the cleaned test split from disk.
# keep_default_na=False: with pandas' default NA parsing, a tweet whose cleaned
# text is exactly "null" or "nan" would be read back as a float NaN instead of
# the string that was written.
test_df = pd.read_csv(
    'Datasets/sem_eval_2018/sem_eval_2018_test.csv',
    keep_default_na=False,
)
test_df

Unnamed: 0,text,label
0,what do these ' naked pics' have to do with an...,1
1,oh: i had a blue penis while i was this playin...,1
2,"that's coming, but i think the victims are goi...",1
3,i think i may be finally in with the in crowd,2
4,"wow,first hugo chavez and now fidel castro. da...",0
...,...,...
12277,sentinel editorial: fbis comey had no one of m...,1
12278,perfect pussy clips hudgens zac efron naked,1
12279,- protesting rise of alt-right at...,1
12280,trying to have a conversation with my dad abou...,0


In [16]:
# Drop the neutral tweets (label 1) and renumber the remaining rows from 0.
is_neutral = test_df['label'] == 1
test_df = test_df[~is_neutral].reset_index(drop=True)

print('Shape of test df:', test_df.shape)
print('Value Counts :', test_df['label'].value_counts())
test_df

Shape of test df: (6347, 2)
Value Counts : label
0    3972
2    2375
Name: count, dtype: int64


Unnamed: 0,text,label
0,i think i may be finally in with the in crowd,2
1,"wow,first hugo chavez and now fidel castro. da...",0
2,twitter's shows heartfelt gratitude to potus,2
3,take away illegals and dead people and trump w...,0
4,cute little dance,2
...,...,...
6342,for al the crying you do about how middle amer...,0
6343,"i'm not even catholic, but pope francis is my ...",2
6344,"looks like flynn isn't too pleased with me, he...",0
6345,trying to have a conversation with my dad abou...,0


In [17]:
# Map negative to 0 and positive to 1
binary_label_map = {0: 0, 2: 1}
binary_labels = test_df['label'].map(binary_label_map)

# Series.map turns every label that is not a key of the mapping into NaN. That
# happens when neutral rows (label 1) are still present, or when this cell is
# re-run on labels that are already binary (1 is not a key). Fail loudly rather
# than silently writing NaN labels into the frame.
unmapped = binary_labels.isna()
if unmapped.any():
    unexpected = test_df.loc[unmapped, 'label'].unique().tolist()
    raise ValueError(
        f"Cannot map labels {unexpected} with {binary_label_map}; "
        "expected only 0 (negative) and 2 (positive)."
    )

test_df['label'] = binary_labels
test_df

Unnamed: 0,text,label
0,i think i may be finally in with the in crowd,1
1,"wow,first hugo chavez and now fidel castro. da...",0
2,twitter's shows heartfelt gratitude to potus,1
3,take away illegals and dead people and trump w...,0
4,cute little dance,1
...,...,...
6342,for al the crying you do about how middle amer...,0
6343,"i'm not even catholic, but pope francis is my ...",1
6344,"looks like flynn isn't too pleased with me, he...",0
6345,trying to have a conversation with my dad abou...,0


In [18]:
# Write the binary-labelled test split to CSV without the index column.
test_binary_output_csv = 'Datasets/sem_eval_2018/sem_eval_2018_test_binary.csv'
test_df.to_csv(test_binary_output_csv, index=False, encoding="utf-8")