In [1]:
import numpy as np
import pandas as pd

In [4]:
# loading the dataset

bigdf = pd.read_csv("bigasstweetsdataset.csv", encoding="latin-1")
bigdf.columns = ["target", "ids", "date", "flag", "user", "TweetText"]
bigdf.drop(['ids', 'flag', 'date', 'user'], axis=1, inplace=True) # since we are only detecting depressing sentiment

In [5]:
bigdf.head()

Unnamed: 0,target,TweetText
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [10]:
# label conversion and counting
print("The types of targets -> ", bigdf['target'].unique())
print("Given that 0 -> Negative Sentiment; 4 -> Positive Sentiment")
print("The counts are\n", bigdf['target'].value_counts(), sep="")
print("Pretty balanced dataset, lets convert labels to categories, readable and clear")

label_mapper = {4 : "Positive", 0 : "Negative"}
bigdf.target = bigdf.target.map(label_mapper)
del label_mapper

The types of targets ->  [0 4]
Given that 0 -> Negative Sentiment; 4 -> Positive Sentiment
The counts are
4    800000
0    799999
Name: target, dtype: int64
Pretty balanced dataset, lets convert labels to categories, readable and clear


In [11]:
bigdf.head()

Unnamed: 0,target,TweetText
0,Negative,is upset that he can't update his Facebook by ...
1,Negative,@Kenichan I dived many times for the ball. Man...
2,Negative,my whole body feels itchy and like its on fire
3,Negative,"@nationwideclass no, it's not behaving at all...."
4,Negative,@Kwesidei not the whole crew


In [15]:
# First, an we need to clean the dataset. 
# Lets find how many tweets of the 1.6mil contain a user mention or a link
mentioncounter = 0
linkcounter = 0

for tweet in bigdf['TweetText']:

    words = tweet.split()
    for word in words:
        if word.startswith('@'):
            mentioncounter += 1
        if word.startswith('http'):
            linkcounter += 1

print("Number of tweets containing user mention (@<something>):", mentioncounter)
print("Number of tweets containing links (http<something>):", linkcounter)

del mentioncounter
del linkcounter

Number of tweets containing user mention (@<something>): 793950
Number of tweets containing links (http<something>): 70941


Definitely need to clear them first. If there was image or emoji data, we would clear that as well, but this dataset is already cleaned to a degree, and does not contain those two

In [20]:
# NLTK is an excellent library for simple jobs here
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# SnowballStemmer is a class of stemmers based on Porter's Snowball language for stemming, and supports multiple languages
# A stemmer reduces words to a more base form, significantly contributing to slimming our corpus down
# It makes vectorizing the words easier, and gives more consistent results
