In [1]:
import numpy as np
import pandas as pd

In [2]:
# loading the dataset

bigdf = pd.read_csv("bigasstweetsdataset.csv", encoding="latin-1") # this is in .gitignore, its the sentiment140 dataset on kaggle
# linke -> https://www.kaggle.com/datasets/kazanova/sentiment140
bigdf.columns = ["target", "ids", "date", "flag", "user", "TweetText"]
bigdf.drop(['ids', 'flag', 'date', 'user'], axis=1, inplace=True) # since we are only detecting depressing sentiment

In [3]:
bigdf.head()

Unnamed: 0,target,TweetText
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [4]:
# label conversion and counting
print("The types of targets -> ", bigdf['target'].unique())
print("Given that 0 -> Negative Sentiment; 4 -> Positive Sentiment")
print("The counts are\n", bigdf['target'].value_counts(), sep="")
print("Pretty balanced dataset, lets convert labels to categories, readable and clear")

label_mapper = {4 : "Positive", 0 : "Negative"}
bigdf['target'] = bigdf['target'].map(label_mapper)
del label_mapper

The types of targets ->  [0 4]
Given that 0 -> Negative Sentiment; 4 -> Positive Sentiment
The counts are
4    800000
0    799999
Name: target, dtype: int64
Pretty balanced dataset, lets convert labels to categories, readable and clear


In [5]:
bigdf.head()

Unnamed: 0,target,TweetText
0,Negative,is upset that he can't update his Facebook by ...
1,Negative,@Kenichan I dived many times for the ball. Man...
2,Negative,my whole body feels itchy and like its on fire
3,Negative,"@nationwideclass no, it's not behaving at all...."
4,Negative,@Kwesidei not the whole crew


In [6]:
# First, an we need to clean the dataset. 
# Lets find how many tweets of the 1.6mil contain a user mention or a link
mentioncounter = 0
linkcounter = 0

for tweet in bigdf['TweetText']:

    words = tweet.split()
    for word in words:
        if word.startswith('@'):
            mentioncounter += 1
        if word.startswith('http'):
            linkcounter += 1

print("Number of tweets containing user mention (@<something>):", mentioncounter)
print("Number of tweets containing links (http<something>):", linkcounter)

del mentioncounter
del linkcounter

Number of tweets containing user mention (@<something>): 793950
Number of tweets containing links (http<something>): 70941


Definitely need to clear them first. If there was image or emoji data, we would clear that as well, but this dataset is already cleaned to a degree, and does not contain those two

In [7]:
# NLTK is an excellent library for simple jobs here
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import re

# SnowballStemmer is a class of stemmers based on Porter's Snowball language for stemming, and supports multiple languages
# it is derived from the original Porter stemmer, and features some imporvements
# eg stemming 'Sportingly' -> 'Sport' instead of 'Sportingly' -> 'Sportingli' like Porter would
# A stemmer reduces words to a more base form, significantly contributing to slimming our corpus down
# It makes vectorizing the words easier, and gives more consistent results

bigdf['Clean'] = bigdf['TweetText'].apply(lambda x: re.sub(r'[^\w\s]', '', x)) # simple removing of special chars (non word, whitespace)
bigdf['Clean'] = bigdf['Clean'].apply(lambda x: re.sub(r'http\S+://', '', x)) # https -> one or more whitespace -> ://
bigdf['Clean'] = bigdf['Clean'].apply(lambda x: str(x).lower().strip()) # Ensure we dont try to lowercase a full number tweet, followed by stripping whitespaces

In [8]:
# we temove stopwords by crossrefering if a given word is in the stopwords list or not
# then we stem the words down
# remeber to tokenize first ! otherwise 'for word in x' runs for every. single. letter.
# bear with the below, it takes about a few minutes to run

stpwrds = stopwords.words('english')
stemmer = SnowballStemmer('english')

# Tokenize
bigdf['Clean'] = bigdf['Clean'].apply(lambda x: word_tokenize(x)) # upwards of 2 mins on my pc

# Stopword removal
bigdf['Clean'] = bigdf['Clean'].apply(lambda x: [word for word in x if word not in stpwrds]) # upwards of 30s on my pc

# stemming
bigdf['Clean'] = bigdf['Clean'].apply(lambda x: [stemmer.stem(word) for word in x]) # around 2 min on my pc

# remember to rejoin! we dont want one-hot enconding or count vectorizing, but the whole string for word2vec
bigdf['Clean'] = bigdf['Clean'].apply(lambda x: ' '.join(x)) # around a second or so? idk

# next, if by any chance we have any random tweets with exact same strings remaining
bigdf = bigdf.drop_duplicates(subset='Clean') # instant



In [9]:
bigdf.head()

Unnamed: 0,target,TweetText,Clean
0,Negative,is upset that he can't update his Facebook by ...,upset cant updat facebook text might cri resul...
1,Negative,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save 50 res...
2,Negative,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
3,Negative,"@nationwideclass no, it's not behaving at all....",nationwideclass behav im mad cant see
4,Negative,@Kwesidei not the whole crew,kwesidei whole crew


In [10]:
# lets save this so we can quickly access the preprocessed data anytime
import zipfile

# Select the relevant columns
relevant_columns = ['target', 'Clean']
filtered_df = bigdf[relevant_columns]

# Save the dataset as CSV
filtered_df.to_csv('preprocessed_dataset.csv', index=False)

# Zip the CSV file
with zipfile.ZipFile('preprocessed_dataset.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('preprocessed_dataset.csv')

# i will be pushing both, zipped and unzipped to the repo
# just in case.

Next steps is model training, so general outline is 

    train-test split

    encoding

    word embeddings (word2vec, text2vec-openai, etc)

    model layering (make your pick from keras, pytorch, etc)

    optimization 

    evaluation

And for the project we also have a flask deployment concept