In [1]:
import pandas as pd
import re

In [2]:
# read 500K tweets 50/50 positive-negative (most to speed up training)
tweets = pd.read_csv("../data/tweets.csv", header=None).groupby(0).sample(250000, random_state=1)
tweets.rename(columns={0:"sentiment", 1:"id", 2:"date", 4:"user", 5:"tweet"}, inplace=True)
tweets.drop(3, axis=1, inplace=True)

In [3]:
pd.set_option("display.max_colwidth", 1000)
tweets.head()

Unnamed: 0,sentiment,id,date,user,tweet
670935,0,2246713398,Fri Jun 19 18:00:49 PDT 2009,Mickie_1,"@TheRealScarab PA system bugs are a bummer, sorry."
649589,0,2237331866,Fri Jun 19 05:36:56 PDT 2009,supaflya,"oh daaamnnn! the firemen ball's on the 14th and i'll still be in Cannes So many handsome men united 2gether, and i miss it! *out tonite*"
2028,0,1468288564,Tue Apr 07 00:46:03 PDT 2009,in_wonderland,"I don't want to be cold in April, but I am"
669856,0,2246385590,Fri Jun 19 17:31:05 PDT 2009,Jessimikaaaa,@JaredOngie haha its too cold down here bt other than that im quite fine.. jst extremely bored wht are your plans for the day?
458897,0,2072038618,Sun Jun 07 20:17:45 PDT 2009,MacAddict0915,Upset I can't find my CHI!


In [4]:
tweets["sentiment"].value_counts()

0    250000
4    250000
Name: sentiment, dtype: int64

---
# Pre-Processing

In [5]:
def remove_usernames(tweet):
    return re.sub("@[a-zA-Z0-9]+\s+", "", tweet)

def remove_whitespaces(tweet):
    return re.sub("\s+", " ", tweet)

def remove_links(tweet):
    return re.sub("http\S+", "", tweet)

def remove_specific_patterns(tweet):
    return re.sub("[^a-zA-Z0-9\'\.!%#?\s]", "", tweet)

def cleaning_tweet(tweet):
    return remove_whitespaces(remove_specific_patterns(remove_links(remove_usernames(tweet))))

In [6]:
tweets["tweet"] = tweets["tweet"].apply(cleaning_tweet).str.lower()
tweets["sentiment"] = tweets["sentiment"].map({0:0, 4:1})

In [7]:
tweets.head()

Unnamed: 0,sentiment,id,date,user,tweet
670935,0,2246713398,Fri Jun 19 18:00:49 PDT 2009,Mickie_1,pa system bugs are a bummer sorry.
649589,0,2237331866,Fri Jun 19 05:36:56 PDT 2009,supaflya,oh daaamnnn! the firemen ball's on the 14th and i'll still be in cannes so many handsome men united 2gether and i miss it! out tonite
2028,0,1468288564,Tue Apr 07 00:46:03 PDT 2009,in_wonderland,i don't want to be cold in april but i am
669856,0,2246385590,Fri Jun 19 17:31:05 PDT 2009,Jessimikaaaa,haha its too cold down here bt other than that im quite fine.. jst extremely bored wht are your plans for the day?
458897,0,2072038618,Sun Jun 07 20:17:45 PDT 2009,MacAddict0915,upset i can't find my chi!


In [8]:
print(f"Unique tokens: {len(set(' '.join(tweets['tweet'].tolist()).split()))}")

Unique tokens: 317598


In [9]:
print(f"Longest tweet: {tweets['tweet'].apply(lambda x: len(x.split())).max()} words")
print(f"Mean words per tweet: {tweets['tweet'].apply(lambda x: len(x.split())).mean()}")
print(f"Median words per tweet: {tweets['tweet'].apply(lambda x: len(x.split())).median()}")

Longest tweet: 58 words
Mean words per tweet: 12.607194
Median words per tweet: 12.0


In [10]:
tweets.to_parquet("../data/pre-processed-tweets.parquet")