In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Read the two tab-separated SemEval-2017 files; the "test" file is used as the validation frame.
train_data, valid_data = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ("./data/semeval-2017-train.csv", "./data/semeval-2017-test.csv")
)

In [3]:
# Preview the first rows of the freshly loaded training split.
train_data.head()

Unnamed: 0,label,text
0,1,One Night like In Vegas I make dat Nigga Famous
1,1,Walking through Chelsea at this time of day is...
2,0,"And on the very first play of the night, Aaron..."
3,0,"Drove the bike today, about 40 miles. Felt lik..."
4,-1,looking at the temp outside....hpw did it get ...


In [4]:
# Preview the first rows of the freshly loaded validation split.
valid_data.head()

Unnamed: 0,label,text
0,0,Trump is building a wall on the Mexican border...
1,-1,@lasinferencias & the WALL Trump wants to buil...
2,-1,President Elect? More like President Erect! A ...
3,0,"Ok, I know a lot of you think a wall on the Me..."
4,0,The Great Mexican Wall Deception: Trump's Amer...


In [5]:
# Count missing values per column in the training split.
train_data.isnull().sum()

label    0
text     0
dtype: int64

In [6]:
# Count missing values per column in the validation split.
valid_data.isnull().sum()

label    0
text     0
dtype: int64

In [7]:
def remove_noise(text):
    """Strip tweet-specific noise from a raw tweet string.

    Removed, in this order: a leading "RT" marker, URLs, @mentions, the '#'
    character (the hashtag word itself is kept) and every non-ASCII character
    such as emojis. Whitespace left behind by removed pieces is not collapsed
    here.

    Args:
        text: Raw tweet text.

    Returns:
        The text with the noise removed.
    """
    # Leading retweet marker, matched in any letter case.
    text = re.sub(r'^rt\s+', '', text, flags=re.IGNORECASE)
    # "http..." links anywhere, plus bare "www." hosts. The "www" branch requires
    # a word boundary and a literal dot so that elongated words such as "awwww"
    # are no longer mistaken for URLs and truncated.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    # Mentions (@username).
    text = re.sub(r'@\w+', '', text)
    # Drop only the '#' sign, keeping the tag word (e.g. #happy -> happy).
    text = re.sub(r'#', '', text)
    # Emojis and any other non-ASCII characters.
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

# Build `cleaned_text` from the raw `text` column, identically for both splits.
for frame in (train_data, valid_data):
    frame['cleaned_text'] = frame['text'].apply(remove_noise)

In [8]:
def clean_text(text):
    """Replace non-alphanumeric characters with spaces and collapse whitespace.

    Every character that is not an ASCII letter, a digit or whitespace becomes
    a single space; afterwards each run of whitespace is squeezed into one
    space. Letter case is left untouched and leading/trailing spaces are not
    stripped.
    """
    without_specials = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return re.sub(r'\s+', ' ', without_specials)

# Normalise punctuation and whitespace in `cleaned_text` for both splits.
for frame in (train_data, valid_data):
    frame['cleaned_text'] = frame['cleaned_text'].apply(clean_text)

In [9]:
# Class distribution of the training split (number of rows per label value).
label_counts = train_data['label'].value_counts()
label_counts

label
 0    22175
 1    19620
-1     7701
Name: count, dtype: int64

In [10]:
# Class distribution of the validation split.
# Note: this rebinds `label_counts`, replacing the training counts computed earlier.
label_counts = valid_data['label'].value_counts()
label_counts

label
 0    5787
-1    3857
 1    2333
Name: count, dtype: int64

In [11]:
# Tokenise with NLTK's word_tokenize: from here on `cleaned_text` holds a list
# of tokens per row rather than a string.
for frame in (train_data, valid_data):
    frame['cleaned_text'] = frame['cleaned_text'].apply(word_tokenize)

In [12]:
# English stop-word list from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the tokens that are not stop words.

    The comparison is done on the lowercased token, but surviving tokens are
    returned with their original casing and in their original order.
    """
    kept = []
    for word in tokens:
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return kept

# Drop stop words from the token lists of both splits.
for frame in (train_data, valid_data):
    frame['cleaned_text'] = frame['cleaned_text'].apply(remove_stopwords)

In [13]:
# Check the training split after tokenisation and stop-word removal.
train_data.head()

Unnamed: 0,label,text,cleaned_text
0,1,One Night like In Vegas I make dat Nigga Famous,"[One, Night, like, Vegas, make, dat, Nigga, Fa..."
1,1,Walking through Chelsea at this time of day is...,"[Walking, Chelsea, time, day, rather, lovely, ..."
2,0,"And on the very first play of the night, Aaron...","[first, play, night, Aaron, Rodgers, INT, UDFA..."
3,0,"Drove the bike today, about 40 miles. Felt lik...","[Drove, bike, today, 40, miles, Felt, like, Ji..."
4,-1,looking at the temp outside....hpw did it get ...,"[looking, temp, outside, hpw, get, hotter, sun..."


In [14]:
# Check the validation split after tokenisation and stop-word removal.
valid_data.head()

Unnamed: 0,label,text,cleaned_text
0,0,Trump is building a wall on the Mexican border...,"[Trump, building, wall, Mexican, border, stop,..."
1,-1,@lasinferencias & the WALL Trump wants to buil...,"[WALL, Trump, wants, build, researched, would,..."
2,-1,President Elect? More like President Erect! A ...,"[President, Elect, like, President, Erect, wal..."
3,0,"Ok, I know a lot of you think a wall on the Me...","[Ok, know, lot, think, wall, Mexican, border, ..."
4,0,The Great Mexican Wall Deception: Trump's Amer...,"[Great, Mexican, Wall, Deception, Trump, Ameri..."


In [15]:
pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'they']


def remove_all_names(tokens):
    """Lowercase every token, dropping a capitalised pronoun in first position.

    Despite its name, this function does not remove proper names or
    all-uppercase words. The only token it can drop is the very first one, and
    only when that token starts with an uppercase letter and its lowercase form
    is listed in `pronouns`. Every other token is kept and lowercased.

    Args:
        tokens: List of string tokens.

    Returns:
        A new list of lowercased tokens.
    """
    lowered = []
    for position, token in enumerate(tokens):
        is_leading_pronoun = (
            position == 0
            # token[:1] instead of token[0]: an empty token must not raise IndexError.
            and token[:1].isupper()
            and token.lower() in pronouns
        )
        if not is_leading_pronoun:
            lowered.append(token.lower())
    return lowered
    
# Run remove_all_names over the token lists of both splits.
for frame in (train_data, valid_data):
    frame['cleaned_text'] = frame['cleaned_text'].apply(remove_all_names)

In [16]:
# Spot-check a single training row (position 1) after the token filtering step.
train_data.iloc[1]

label                                                           1
text            Walking through Chelsea at this time of day is...
cleaned_text    [walking, chelsea, time, day, rather, lovely, ...
Name: 1, dtype: object

In [17]:
def lower(tokens):
    """Return a new list containing the lowercase form of each token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered
    
# Lowercase both the token lists and the raw `text` column in each split.
for frame in (train_data, valid_data):
    frame['cleaned_text'] = frame['cleaned_text'].apply(lower)
    frame['text'] = frame['text'].apply(str.lower)

In [18]:
# Check the training split after lowercasing `text` and `cleaned_text`.
train_data.head()

Unnamed: 0,label,text,cleaned_text
0,1,one night like in vegas i make dat nigga famous,"[one, night, like, vegas, make, dat, nigga, fa..."
1,1,walking through chelsea at this time of day is...,"[walking, chelsea, time, day, rather, lovely, ..."
2,0,"and on the very first play of the night, aaron...","[first, play, night, aaron, rodgers, int, udfa..."
3,0,"drove the bike today, about 40 miles. felt lik...","[drove, bike, today, 40, miles, felt, like, ji..."
4,-1,looking at the temp outside....hpw did it get ...,"[looking, temp, outside, hpw, get, hotter, sun..."


In [19]:
# Check the validation split after lowercasing `text` and `cleaned_text`.
valid_data.head()

Unnamed: 0,label,text,cleaned_text
0,0,trump is building a wall on the mexican border...,"[trump, building, wall, mexican, border, stop,..."
1,-1,@lasinferencias & the wall trump wants to buil...,"[wall, trump, wants, build, researched, would,..."
2,-1,president elect? more like president erect! a ...,"[president, elect, like, president, erect, wal..."
3,0,"ok, i know a lot of you think a wall on the me...","[ok, know, lot, think, wall, mexican, border, ..."
4,0,the great mexican wall deception: trump's amer...,"[great, mexican, wall, deception, trump, ameri..."


In [20]:
# `duplicates` holds every row whose `text` occurs more than once (keep=False flags
# all copies). It is rebound for the validation split below, so only that version
# is still available after this cell.
# NOTE(review): matching is on `text` only, so a tweet and its retweet (different
# `text`, same cleaned tokens) both survive - confirm this is intended.
duplicates = train_data.loc[train_data.duplicated(subset=['text'], keep=False)]
train_data = train_data.drop_duplicates(subset=['text'])  # keep='first' is the default

duplicates = valid_data.loc[valid_data.duplicated(subset=['text'], keep=False)]
valid_data = valid_data.drop_duplicates(subset=['text'])  # keep='first' is the default

In [21]:
# Encode the labels as integer codes using ONE category list shared by both
# splits. Fitting a separate Categorical per split would shift the codes whenever
# a class is absent from one split, making train and validation labels disagree.
# The categories are the sorted union of the label values seen in either split,
# so the codes match the per-split encoding whenever both splits contain every class.
label_categories = sorted(set(train_data['label']) | set(valid_data['label']))

for frame in (train_data, valid_data):
    # `cat_label` keeps the original label values; `label` becomes the integer code.
    frame['cat_label'] = pd.Categorical(frame['label'], categories=label_categories)
    frame['label'] = frame['cat_label'].cat.codes

In [22]:
# Keep the current `cleaned_text` values available under a separate `tokens` column.
for frame in (train_data, valid_data):
    frame['tokens'] = frame['cleaned_text']

In [23]:
# Turn each row's `cleaned_text` back into one space-separated string.
for frame in (train_data, valid_data):
    frame['cleaned_text'] = frame['cleaned_text'].apply(' '.join)

In [24]:
# Persist both processed splits without the DataFrame index.
# NOTE(review): the list-valued `tokens` column is presumably written as plain
# text in the CSV - confirm how downstream code parses it back.
for frame, out_path in (
    (train_data, './data/train_data.csv'),
    (valid_data, './data/valid_data.csv'),
):
    frame.to_csv(out_path, index=False)

In [25]:
# Display the final processed training frame (first and last rows).
train_data

Unnamed: 0,label,text,cleaned_text,cat_label,tokens
0,2,one night like in vegas i make dat nigga famous,one night like vegas make dat nigga famous,1,"[one, night, like, vegas, make, dat, nigga, fa..."
1,2,walking through chelsea at this time of day is...,walking chelsea time day rather lovely love lo...,1,"[walking, chelsea, time, day, rather, lovely, ..."
2,1,"and on the very first play of the night, aaron...",first play night aaron rodgers int udfa cb bra...,0,"[first, play, night, aaron, rodgers, int, udfa..."
3,1,"drove the bike today, about 40 miles. felt lik...",drove bike today 40 miles felt like jim carrey...,0,"[drove, bike, today, 40, miles, felt, like, ji..."
4,0,looking at the temp outside....hpw did it get ...,looking temp outside hpw get hotter sun goes f...,-1,"[looking, temp, outside, hpw, get, hotter, sun..."
...,...,...,...,...,...
49491,2,today *very* rare day when democrats will get ...,today rare day democrats get healthier club gr...,1,"[today, rare, day, democrats, get, healthier, ..."
49492,2,rt @steventdennis: today *very* rare day when ...,today rare day democrats get healthier club gr...,1,"[today, rare, day, democrats, get, healthier, ..."
49493,0,@charliemax democrats will quickly implode and...,democrats quickly implode concede obama either...,-1,"[democrats, quickly, implode, concede, obama, ..."
49494,0,once again democrats spent all night and this ...,democrats spent night morning trying talk stoc...,-1,"[democrats, spent, night, morning, trying, tal..."
