In [1]:
CUSTOMISED_STOP_WORDS = set(
    """
    a about above across after afterwards again all almost alone along
    already also although always am among amongst amount an and another any anyhow
    anyone anything anyway anywhere are around as at
    back be became because become becomes becoming been before beforehand behind
    being below beside besides between beyond both bottom but by
    call can cannot ca could
    did do does doing done down due during
    each eight either eleven else elsewhere empty enough even ever every
    everyone everything everywhere except
    few fifteen fifty first five for former formerly forty four from front full
    further
    get give go
    had has have he hence her here hereafter hereby herein hereupon hers herself
    him himself his how however hundred
    i if in indeed into is it its itself
    keep
    last latter latterly least less
    just
    made make many may me meanwhile might mine more moreover most mostly move much
    must my myself
    name namely neither never nevertheless next nine now
    of off often on once one only onto or other others otherwise our ours ourselves
    out over own
    part per perhaps please put
    quite
    rather re really regarding
    same say see seem seemed seeming seems serious several she should show side
    since six sixty so some somehow someone something sometime sometimes somewhere
    still such
    take ten than that the their them themselves then thence there thereafter
    thereby therefore therein thereupon these they third this those though three
    through throughout thru thus to together too top toward towards twelve twenty
    two
    under until up unless upon us used using
    various very very via was we well were what whatever when whence whenever where
    whereafter whereas whereby wherein whereupon wherever whether which while
    whither who whoever whole whom whose why will with within without would
    yet you your yours yourself yourselves
    """.split()
)

customised_contractions = ["'d", "'ll", "'m", "'re", "'s", "'ve"]
CUSTOMISED_STOP_WORDS.update(customised_contractions)

In [2]:
# run the commands below in terminal to install dependencies
# pip install spacy
# python -m spacy download en
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string

nlp_sw = English()
nlp_n = spacy.load("en_core_web_sm")

removable_char = list(string.punctuation) + ["\n", "\r"]

# text arguement in string
# remove stop words of the provided text
# return a list of words in small letters in the text without stop words and punctuations
def remove_stop_words(text):
    text = text.lower()
    token_doc = nlp_sw(text)
    clean_text = []
    for token in token_doc:
        token_text = token.text
        token_text = token_text.replace(" ", "")
        token_text = "not" if token_text == "n't" else token_text
        if not(token_text in CUSTOMISED_STOP_WORDS or token_text in removable_char):
            if token_text and not(token_text.isspace()):
                tmp_text = token_text[1:] if token_text[0] in removable_char else token_text
                if tmp_text and not(tmp_text.isspace()):
                    clean_text.append(tmp_text)
    return clean_text

# text arguement in string
# normalise the text
# return a list of words in small letters in the normalised text
def normalise_text(text):
    text = text.lower()
    token_doc = nlp_n(text)
    normalised_text = []
    for token in token_doc:
        lemma = token.lemma_
        if not(lemma == "-PRON-"):
            normalised_text.append(lemma)
    return normalised_text

In [4]:
import pandas as pd
df = pd.read_csv('training_data.csv')
df

Unnamed: 0,Label,Sentences
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,4,Just woke up. Having no school is the best fee...
1599995,4,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...


In [8]:
# 0 = negative, 2 = neutral, 4 = positive
number_of_data = 1500000
data_negative = df[df.Label == 0][:int(number_of_data/2)]
data_positive = df[df.Label == 4][:int(number_of_data/2)]
data_positive["Label"] = 1
df_row_merged = pd.concat([data_negative, data_positive], ignore_index=True)
df_row_merged

Unnamed: 0,Label,Sentences
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1499995,1,"... im so happy for 1,2 ,3 ...oh 20 days my mr..."
1499996,1,Enjoying In Wonderful Day In Vegas!!! It's SOO...
1499997,1,"everybody else sleeping, but me.. no im not go..."
1499998,1,@gley10 get on aim?


In [9]:
clean_data = df_row_merged.sample(frac=1).reset_index(drop=True)
clean_data

Unnamed: 0,Label,Sentences
0,0,I think i hate summer.......
1,1,time for subway and a run!
2,0,i feel terrible.. i am definitely sick.
3,0,I can't wait to see UP! How dare @katymoe have...
4,0,battery is low_20%_#iphone 3g is a realy batte...
...,...,...
1499995,1,enjoying all the new but old dub and funk i've...
1499996,1,The weather is awesome http://twitpic.com/6cbuc
1499997,1,with Margaret
1499998,0,@TheDannyNoriega you and chris are so sweet yo...


In [10]:
# remove url in tweets
# remove tag (@) in tweets
# remove hashtags (#) in tweets
# remove stopwords
# normalise text
# this cell might take some time to run, be patient
import re
import string

url_pattern = re.compile(".*https?:\/\/")
hashtag_pattern = re.compile("#[a-z\d-]+")
tag_pattern = re.compile("^@")
punc = string.punctuation
punc = punc.replace("-", "")
punc += "“”"
punc = punc.replace ("'","")

pd.options.mode.chained_assignment = None

def process_text(text):
    tmp_no_url_hashtag = []
    for word in re.split(' |\r|\n', text):
        if url_pattern.match(word) or tag_pattern.match(word) or hashtag_pattern.match(word):
            continue
        else:
            clean_word = word.translate(str.maketrans("","", punc))
            clean_word = clean_word.replace("’","'")
            tmp_no_url_hashtag.append(clean_word)
    tmp = " ".join(tmp_no_url_hashtag)
    without_stopwords = " ".join(remove_stop_words(tmp))
    processed = " ".join(normalise_text(without_stopwords))
    return processed

for i in range (len(clean_data["Sentences"])):
    clean_data["Sentences"][i] = process_text(clean_data["Sentences"][i])
    if i % 1000 == 0:
        print ("Processing", i, "unit")

Processing 0 unit
Processing 1000 unit
Processing 2000 unit
Processing 3000 unit
Processing 4000 unit
Processing 5000 unit
Processing 6000 unit
Processing 7000 unit
Processing 8000 unit
Processing 9000 unit
Processing 10000 unit
Processing 11000 unit
Processing 12000 unit
Processing 13000 unit
Processing 14000 unit
Processing 15000 unit
Processing 16000 unit
Processing 17000 unit
Processing 18000 unit
Processing 19000 unit
Processing 20000 unit
Processing 21000 unit
Processing 22000 unit
Processing 23000 unit
Processing 24000 unit
Processing 25000 unit
Processing 26000 unit
Processing 27000 unit
Processing 28000 unit
Processing 29000 unit
Processing 30000 unit
Processing 31000 unit
Processing 32000 unit
Processing 33000 unit
Processing 34000 unit
Processing 35000 unit
Processing 36000 unit
Processing 37000 unit
Processing 38000 unit
Processing 39000 unit
Processing 40000 unit
Processing 41000 unit
Processing 42000 unit
Processing 43000 unit
Processing 44000 unit
Processing 45000 unit
P

Processing 361000 unit
Processing 362000 unit
Processing 363000 unit
Processing 364000 unit
Processing 365000 unit
Processing 366000 unit
Processing 367000 unit
Processing 368000 unit
Processing 369000 unit
Processing 370000 unit
Processing 371000 unit
Processing 372000 unit
Processing 373000 unit
Processing 374000 unit
Processing 375000 unit
Processing 376000 unit
Processing 377000 unit
Processing 378000 unit
Processing 379000 unit
Processing 380000 unit
Processing 381000 unit
Processing 382000 unit
Processing 383000 unit
Processing 384000 unit
Processing 385000 unit
Processing 386000 unit
Processing 387000 unit
Processing 388000 unit
Processing 389000 unit
Processing 390000 unit
Processing 391000 unit
Processing 392000 unit
Processing 393000 unit
Processing 394000 unit
Processing 395000 unit
Processing 396000 unit
Processing 397000 unit
Processing 398000 unit
Processing 399000 unit
Processing 400000 unit
Processing 401000 unit
Processing 402000 unit
Processing 403000 unit
Processing 

Processing 717000 unit
Processing 718000 unit
Processing 719000 unit
Processing 720000 unit
Processing 721000 unit
Processing 722000 unit
Processing 723000 unit
Processing 724000 unit
Processing 725000 unit
Processing 726000 unit
Processing 727000 unit
Processing 728000 unit
Processing 729000 unit
Processing 730000 unit
Processing 731000 unit
Processing 732000 unit
Processing 733000 unit
Processing 734000 unit
Processing 735000 unit
Processing 736000 unit
Processing 737000 unit
Processing 738000 unit
Processing 739000 unit
Processing 740000 unit
Processing 741000 unit
Processing 742000 unit
Processing 743000 unit
Processing 744000 unit
Processing 745000 unit
Processing 746000 unit
Processing 747000 unit
Processing 748000 unit
Processing 749000 unit
Processing 750000 unit
Processing 751000 unit
Processing 752000 unit
Processing 753000 unit
Processing 754000 unit
Processing 755000 unit
Processing 756000 unit
Processing 757000 unit
Processing 758000 unit
Processing 759000 unit
Processing 

Processing 1070000 unit
Processing 1071000 unit
Processing 1072000 unit
Processing 1073000 unit
Processing 1074000 unit
Processing 1075000 unit
Processing 1076000 unit
Processing 1077000 unit
Processing 1078000 unit
Processing 1079000 unit
Processing 1080000 unit
Processing 1081000 unit
Processing 1082000 unit
Processing 1083000 unit
Processing 1084000 unit
Processing 1085000 unit
Processing 1086000 unit
Processing 1087000 unit
Processing 1088000 unit
Processing 1089000 unit
Processing 1090000 unit
Processing 1091000 unit
Processing 1092000 unit
Processing 1093000 unit
Processing 1094000 unit
Processing 1095000 unit
Processing 1096000 unit
Processing 1097000 unit
Processing 1098000 unit
Processing 1099000 unit
Processing 1100000 unit
Processing 1101000 unit
Processing 1102000 unit
Processing 1103000 unit
Processing 1104000 unit
Processing 1105000 unit
Processing 1106000 unit
Processing 1107000 unit
Processing 1108000 unit
Processing 1109000 unit
Processing 1110000 unit
Processing 11110

Processing 1411000 unit
Processing 1412000 unit
Processing 1413000 unit
Processing 1414000 unit
Processing 1415000 unit
Processing 1416000 unit
Processing 1417000 unit
Processing 1418000 unit
Processing 1419000 unit
Processing 1420000 unit
Processing 1421000 unit
Processing 1422000 unit
Processing 1423000 unit
Processing 1424000 unit
Processing 1425000 unit
Processing 1426000 unit
Processing 1427000 unit
Processing 1428000 unit
Processing 1429000 unit
Processing 1430000 unit
Processing 1431000 unit
Processing 1432000 unit
Processing 1433000 unit
Processing 1434000 unit
Processing 1435000 unit
Processing 1436000 unit
Processing 1437000 unit
Processing 1438000 unit
Processing 1439000 unit
Processing 1440000 unit
Processing 1441000 unit
Processing 1442000 unit
Processing 1443000 unit
Processing 1444000 unit
Processing 1445000 unit
Processing 1446000 unit
Processing 1447000 unit
Processing 1448000 unit
Processing 1449000 unit
Processing 1450000 unit
Processing 1451000 unit
Processing 14520

In [11]:
clean_data

Unnamed: 0,Label,Sentences
0,0,think hate summer
1,1,time subway run
2,0,feel terrible definitely sick
3,0,not wait dare quotrealquot job probably go wai...
4,0,battery low20iphone 3 g realy battery killer
...,...,...
1499995,1,enjoy new old dub funk acquire
1499996,1,weather awesome
1499997,1,margaret
1499998,0,chris sweet weepy aww wish people not hard tim...


In [12]:
clean_data.to_csv('clean_data.csv',index=False)