## Liberaries

In [53]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

## Loading the dataset

In [15]:
df = pd.read_csv('./training.1600000.processed.noemoticon.csv', header=None, encoding='latin')

df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Preprocessing & Data Cleaning

1. Rename the required columns

In [16]:
df.rename(columns={
    0:'Sentiment',
    5:'Tweets'
}, inplace=True)

df.head()

Unnamed: 0,Sentiment,1,2,3,4,Tweets
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


2. Remove unnessery columns

In [17]:
df.drop([1,2,3,4], axis=1, inplace=True)

df.head()

Unnamed: 0,Sentiment,Tweets
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


3. Replace 4 with 1 that reflects Positive sentiment

In [19]:
df['Sentiment'].unique()

array([0, 4], dtype=int64)

In [20]:
df['Sentiment'][df['Sentiment'] == 4] = 1

df['Sentiment'].unique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment'][df['Sentiment'] == 4] = 1


array([0, 1], dtype=int64)

4. Getting a portion of the dataset as the whole dataset is huge

In [22]:
df = df.sample(frac=1).reset_index(drop=True)

df = df[:10000]

df.head()

Unnamed: 0,Sentiment,Tweets
0,1,@surferdrew missed you this morning.
1,1,Going to the lake with Corey today. Sunscreen ...
2,0,my meggg she looks like she's seen a ghost. i...
3,0,They took my beer away from me at work lol ht...
4,1,is so enjoying Eminem right now. Hahaha.


In [23]:
df.shape

(10000, 2)

In [25]:
df['Sentiment'].value_counts()

Sentiment
1    5029
0    4971
Name: count, dtype: int64

5. Data Cleaning ***

In [29]:
tweets = df['Tweets'].values

tweets

array(['@surferdrew missed you this morning.  ',
       'Going to the lake with Corey today. Sunscreen is a must. ',
       "my meggg  she looks like she's seen a ghost. i feel bad, i feel like she's really lonely",
       ..., 'meu twitterfox morreu ',
       "@MeganLawler Sounds good! Let Meg know  I know she's never on Twitter.",
       "@masonpants You're probably right there. It looks delicious though. Haven't eaten today as @mrskennett isn't here to feed me "],
      dtype=object)

In [54]:
cleaned_tweets = []
lemmatization = WordNetLemmatizer()

for tweet in tqdm(tweets):
    # convert into lower case
    tweet = tweet.lower()
    # remove links
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # remove @
    tweet = re.sub(r'@\S+', '', tweet)
    # remove special characters
    tweet = re.sub(r'[^\w\s#]', '', tweet)
    # lemmatize each word
    tweet = [lemmatization.lemmatize(word) for word in word_tokenize(tweet)]
    # remove stop words
    tweet = [word for word in tweet if word not in stopwords.words('english')]

    # processed tweet
    processed_tweet = ' '.join(tweet)

    cleaned_tweets.append(processed_tweet)

cleaned_tweets[:50]

100%|██████████| 10000/10000 [00:26<00:00, 383.18it/s]


['missed morning',
 'going lake corey today sunscreen must',
 'meggg look like shes seen ghost feel bad feel like shes really lonely',
 'took beer away work lol',
 'enjoying eminem right hahaha',
 'anyone wanted attend tedmed cant make date ill happily take place liveblog',
 'okay im awake house coherent morning',
 'ive got idea reach birthday brunch location road around west jkt clueless',
 'u complaining young bucc get blaccberry sheesh iphones rrr eehh',
 'got booed thats crazy wa thatd awful like',
 'sup fellow twit got home work didnt get tat guess get sometime later summer good one',
 'ðððºð¾ð²ð ðñðñð¾ðð ñððµññðºðñ',
 'queue supermarket huuuuge',
 'assignment college boring',
 'stupid may sound cried writing goodbye gift ill probably cry next friday final leaving school',
 'bad mobileconfigs cant add mm well',
 'ten inch hero popped mind better say thanks',
 'answer isnt always full flash site hope nrks work api open exciting possibility',
 'time sleeporama day gradualy looked bi

6. Add two new columns

In [55]:
def word_count(text):
    return len(text.split())

def character_count(text):
    return len(text)

df['Cleaned_Tweets'] = cleaned_tweets
df['Word_Count'] = df['Cleaned_Tweets'].apply(word_count)
df['Character_Count'] = df['Cleaned_Tweets'].apply(character_count)

df.head()

Unnamed: 0,Sentiment,Tweets,Cleaned_Tweets,Word_Count,Character_Count
0,1,@surferdrew missed you this morning.,missed morning,2,14
1,1,Going to the lake with Corey today. Sunscreen ...,going lake corey today sunscreen must,6,37
2,0,my meggg she looks like she's seen a ghost. i...,meggg look like shes seen ghost feel bad feel ...,13,69
3,0,They took my beer away from me at work lol ht...,took beer away work lol,5,23
4,1,is so enjoying Eminem right now. Hahaha.,enjoying eminem right hahaha,4,28
