## Import libraries

In [1]:
# data manipulation
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

# linear algebra
import numpy as np

# NLP
import re
import nltk
import spacy
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])

# progress bar
from tqdm import tqdm, tqdm_notebook
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

## Import dataset

In [2]:
df = pd.read_csv('train.csv',header=None,
                 names=['polarity','tweet_id','date','query','username','tweet'],
                 encoding='ISO-8859-1')

df.head()

Unnamed: 0,polarity,tweet_id,date,query,username,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
df.info(memory_usage=False)
print('\n')
print(f'Number of rows: {df.shape[0]}')
print(f'Number of columns: {df.shape[1]}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1600000 non-null  int64 
 1   tweet_id  1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   query     1600000 non-null  object
 4   username  1600000 non-null  object
 5   tweet     1600000 non-null  object
dtypes: int64(2), object(4)

Number of rows: 1600000
Number of columns: 6


* This dataset consists of 1600000 tweets.
* There is no missing values.

## Understanding the data

In [4]:
df['polarity'].value_counts()

0    800000
4    800000
Name: polarity, dtype: int64

*polarity* has two values, 0 and 4, which represent negative and positive respectively. But let's change 4 to 1 because we always use 1 to represent positive.

In [5]:
df['polarity'] = df['polarity'].replace(4,1)

In [6]:
df['tweet_id'].nunique()

1598315

*tweet_id* should be unique. So, we should expect to get 1,600,000 unique *tweet_id*. Let's see if there are any tweets with the same *tweet_id*.

In [7]:
df.groupby('tweet_id').count().sort_values(by='tweet', ascending=False)

Unnamed: 0_level_0,polarity,date,query,username,tweet
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013708350,2,2,2,2,2
2014092464,2,2,2,2,2
1978925738,2,2,2,2,2
2169457968,2,2,2,2,2
1971498337,2,2,2,2,2
...,...,...,...,...,...
1974380475,1,1,1,1,1
1974380466,1,1,1,1,1
1974380370,1,1,1,1,1
1974380271,1,1,1,1,1


In [8]:
df[(df['tweet_id']==2013708350) | (df['tweet_id']==2014092464)].sort_values(by='tweet_id', ascending=False)

Unnamed: 0,polarity,tweet_id,date,query,username,tweet
336876,0,2014092464,Wed Jun 03 00:26:13 PDT 2009,NO_QUERY,reyj33,@meangirllissa thanks mel imma miss u guys ne...
1315945,1,2014092464,Wed Jun 03 00:26:13 PDT 2009,NO_QUERY,reyj33,@meangirllissa thanks mel imma miss u guys ne...
335084,0,2013708350,Tue Jun 02 23:21:44 PDT 2009,NO_QUERY,yukarikihihi,I just received my OFITG CD &amp; tote! no...
1312866,1,2013708350,Tue Jun 02 23:21:44 PDT 2009,NO_QUERY,yukarikihihi,I just received my OFITG CD &amp; tote! no...


Several tweets have the same *tweet_id* but different *polarity*. This is maybe due to error during data collection. We are just going to ignore these duplicated tweets.

In [9]:
df.drop_duplicates('tweet_id', keep=False, inplace=True, ignore_index=True)

In [10]:
df['tweet_id'].nunique() == len(df)

True

In [11]:
df['query'].value_counts()

NO_QUERY    1596630
Name: query, dtype: int64

* *date* is self-explanatory.
* *query* has only one value. It is meaningless to include it in our analysis.
* *username* is the sender of the tweet.

In [12]:
df = df.loc[:,['tweet','polarity']]

## Removal of Twitter handles (@username)
Username should be removed because it adds nothing to our analysis. Username starts with '@' and followed with alphanumeric characters (lower and upper case). *Underscore* character can also exist in username.

In [13]:
def username_removal(tweet):
    compiled = re.compile(r'@[A-Za-z0-9_]+')
    cleaned = re.sub(compiled,'',tweet)
    return cleaned

df['tweet'] = df['tweet'].apply(username_removal)

## Removal of websites
Website usually starts with http or https and ends with (dot)com.

In [14]:
def website_removal(tweet):
    compiled_1 = re.compile(r'http\S+')
    cleaned = re.sub(compiled_1,'',tweet)
    
    compiled_2 = re.compile(r'\S+.com')
    cleaned = re.sub(compiled_2,'',cleaned)
    return cleaned

df['tweet'] = df['tweet'].apply(website_removal)

## Removal of repeated characters
Some tweets might have word that has repeated characters. For example, *haaappppyyyy* or *hmmmm*. Let's limit those repeated characters to only two because the maximum number a character can be repeated in most of English words is only twice.

In [15]:
def repeated_chars_removal(tweet):
    compiled = re.compile(r'(.)\1+')
    cleaned = re.sub(compiled, r'\1\1', tweet)
    return cleaned

df['tweet'] = df['tweet'].apply(repeated_chars_removal)

## Removal of hashtags

In [16]:
def hashtag_removal(tweet):
    compiled = re.compile(r'#[A-Za-z0-9]+')
    cleaned = re.sub(compiled,'',tweet)
    return cleaned

df['tweet'] = df['tweet'].apply(hashtag_removal)

## Removal of characters except alphanumeric, apostrophes and exclamation marks

In [17]:
df['tweet'] = df['tweet'].str.replace(r"[^a-zA-Z0-9'!]",' ',regex=True)

## Removal of extra whitespaces and trailing whitespaces

In [18]:
def extra_whitespaces_removal(tweet):
    compiled = re.compile(r'\s{2,}')
    cleaned = re.sub(compiled, ' ',tweet)
    return cleaned.strip()

df['tweet'] = df['tweet'].apply(extra_whitespaces_removal)

In [19]:
blanks = []

for idx, tweet in enumerate(df['tweet']):
    if type(tweet) == str:
        if (tweet.isspace()) or (len(tweet.split()) == 0):
            blanks.append(idx)
    elif type(tweet) == float:
        blanks.append(idx)
            
len(blanks)

3781

In [20]:
df.drop(df.index[blanks],inplace=True)

In [21]:
df.to_csv('train_cleaned.csv',index=False)

## Normalization
Here we will be using lemmatization instead of stemming to normalize our data.

In [22]:
df["tweet"] = df['tweet'].progress_apply(lambda x: ' '.join([token.lemma_.lower() if token.lemma_ != '-PRON-' else token.lower_ for token in nlp(x)]))

  0%|          | 0/1592849 [00:00<?, ?it/s]

## Removal of 1-character words

In [23]:
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([tweet for tweet in x.split() if len(tweet) > 1]))

In [24]:
df['tweet'] = df['tweet'].apply(extra_whitespaces_removal)

blanks = []

for idx, tweet in enumerate(df['tweet']):
    if type(tweet) == str:
        if (tweet.isspace()) or (len(tweet.split()) == 0):
            blanks.append(idx)
    elif type(tweet) == float:
        blanks.append(idx)
            
len(blanks)

390

In [25]:
df.drop(df.index[blanks],inplace=True)

In [26]:
df.to_csv('train_lemma.csv',index=False)