# Cleaning some data
## Getting our hands dirty!! :)

In [1]:
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
import numpy as np
import pandas as pd
import re
import string
import warnings
from pandas.core.common import SettingWithCopyWarning

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


### Collect some data

In [2]:
data = pd.read_csv("/kaggle/input/edsa-sentiment-classification/train.csv", index_col = "tweetid")
data.head()
#change index to tweetid
#g = pd.read_csv("test_with_no_labels.csv")
#g.head()

Unnamed: 0_level_0,sentiment,message
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
625221,1,PolySciMajor EPA chief doesn't think carbon di...
126103,1,It's not like we lack evidence of anthropogeni...
698562,2,RT @RawStory: Researchers say we have three ye...
573736,1,#TodayinMaker# WIRED : 2016 was a pivotal year...
466954,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ..."


## let the washing begin!!

In [3]:
#rearrange the columns for clarity
data_sorted = data[["message", "sentiment"]]
data_sorted.head()

for i in data_sorted["message"]:
    print(i)

PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable
It's not like we lack evidence of anthropogenic global warming
RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…
#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD
RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight
Worth a read whether you do or don't believe in climate change https://t.co/ggLZVNYjun https://t.co/7AFE2mAH8j
RT @thenation: Mike Pence doesn’t believe in global warming or that smoking causes lung cancer. https://t.co/gvWYaauU8R
RT @makeandmendlife: Six big things we can ALL do today to fight climate change, or how to be a climate activistÃ¢â‚¬Â¦ https://t.co/TYMLu6DbNM hÃ¢â‚¬Â¦
@AceofSpadesHQ My 8yo nephew is inconsolable. He

In [4]:
#remove web urls 
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
data_sorted['message'] = data_sorted['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

for i in data_sorted["message"]:
    print(i)

PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? url-web via @mashable
It's not like we lack evidence of anthropogenic global warming
RT @RawStory: Researchers say we have three years to act on climate change before it’s too late url-web url-web…
#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change url-web
RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight
Worth a read whether you do or don't believe in climate change url-web url-web
RT @thenation: Mike Pence doesn’t believe in global warming or that smoking causes lung cancer. url-web
RT @makeandmendlife: Six big things we can ALL do today to fight climate change, or how to be a climate activistÃ¢â‚¬Â¦ url-web hÃ¢â‚¬Â¦
@AceofSpadesHQ My 8yo nephew is inconsolable. He wants to die of old age like me, but will perish in the fiery hellscape of climate change.
RT @paigetweedy: no offense… but

In [5]:
#make everything lower case
data_sorted['message'] = data_sorted['message'].str.lower()

data_sorted

Unnamed: 0_level_0,message,sentiment
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
625221,polyscimajor epa chief doesn't think carbon di...,1
126103,it's not like we lack evidence of anthropogeni...,1
698562,rt @rawstory: researchers say we have three ye...,2
573736,#todayinmaker# wired : 2016 was a pivotal year...,1
466954,"rt @soynoviodetodas: it's 2016, and a racist, ...",1
...,...,...
22001,rt @ezlusztig: they took down the material on ...,1
17856,rt @washingtonpost: how climate change could b...,2
384248,notiven: rt: nytimesworld :what does trump act...,0
819732,rt @sara8smiles: hey liberals the climate chan...,-1


In [6]:
#remove all punctuations

def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

data_sorted['message'] = data_sorted['message'].apply(remove_punctuation)
data_sorted

Unnamed: 0_level_0,message,sentiment
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
625221,polyscimajor epa chief doesnt think carbon dio...,1
126103,its not like we lack evidence of anthropogenic...,1
698562,rt rawstory researchers say we have three year...,2
573736,todayinmaker wired 2016 was a pivotal year in...,1
466954,rt soynoviodetodas its 2016 and a racist sexis...,1
...,...,...
22001,rt ezlusztig they took down the material on gl...,1
17856,rt washingtonpost how climate change could be ...,2
384248,notiven rt nytimesworld what does trump actual...,0
819732,rt sara8smiles hey liberals the climate change...,-1


In [7]:
#tokenisation to treat each word with respect
tokeniser = TreebankWordTokenizer()
data_sorted['message'] = data_sorted['message'].apply(tokeniser.tokenize)

data_sorted

Unnamed: 0_level_0,message,sentiment
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
625221,"[polyscimajor, epa, chief, doesnt, think, carb...",1
126103,"[its, not, like, we, lack, evidence, of, anthr...",1
698562,"[rt, rawstory, researchers, say, we, have, thr...",2
573736,"[todayinmaker, wired, 2016, was, a, pivotal, y...",1
466954,"[rt, soynoviodetodas, its, 2016, and, a, racis...",1
...,...,...
22001,"[rt, ezlusztig, they, took, down, the, materia...",1
17856,"[rt, washingtonpost, how, climate, change, cou...",2
384248,"[notiven, rt, nytimesworld, what, does, trump,...",0
819732,"[rt, sara8smiles, hey, liberals, the, climate,...",-1


In [8]:
#stem to transform to the root word, and have more common words
stemmer = SnowballStemmer('english')

def mbti_stemmer(words, stemmer):
    return [stemmer.stem(word) for word in words]

data_sorted['message'] = data_sorted['message'].apply(mbti_stemmer, args=(stemmer, ))

data_sorted

Unnamed: 0_level_0,message,sentiment
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
625221,"[polyscimajor, epa, chief, doesnt, think, carb...",1
126103,"[it, not, like, we, lack, evid, of, anthropoge...",1
698562,"[rt, rawstori, research, say, we, have, three,...",2
573736,"[todayinmak, wire, 2016, was, a, pivot, year, ...",1
466954,"[rt, soynoviodetoda, it, 2016, and, a, racist,...",1
...,...,...
22001,"[rt, ezlusztig, they, took, down, the, materi,...",1
17856,"[rt, washingtonpost, how, climat, chang, could...",2
384248,"[notiven, rt, nytimesworld, what, doe, trump, ...",0
819732,"[rt, sara8smil, hey, liber, the, climat, chang...",-1


In [9]:
#lemmatization to lessen unique words even more by changing tenses of words and plurals
lemmatizer = WordNetLemmatizer()

def mbti_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]

data_sorted['message'] = data_sorted['message'].apply(mbti_lemma, args=(lemmatizer, ))

data_sorted

Unnamed: 0_level_0,message,sentiment
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
625221,"[polyscimajor, epa, chief, doesnt, think, carb...",1
126103,"[it, not, like, we, lack, evid, of, anthropoge...",1
698562,"[rt, rawstori, research, say, we, have, three,...",2
573736,"[todayinmak, wire, 2016, wa, a, pivot, year, i...",1
466954,"[rt, soynoviodetoda, it, 2016, and, a, racist,...",1
...,...,...
22001,"[rt, ezlusztig, they, took, down, the, materi,...",1
17856,"[rt, washingtonpost, how, climat, chang, could...",2
384248,"[notiven, rt, nytimesworld, what, doe, trump, ...",0
819732,"[rt, sara8smil, hey, liber, the, climat, chang...",-1


In [10]:
#remove standard stop words, which are words of insignificance
def remove_stop_words(tokens):    
    return [t for t in tokens if t not in stopwords.words('english')]

data_sorted['message'] = data_sorted['message'].apply(remove_stop_words)

data_sorted

Unnamed: 0_level_0,message,sentiment
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
625221,"[polyscimajor, epa, chief, doesnt, think, carb...",1
126103,"[like, lack, evid, anthropogen, global, warm]",1
698562,"[rt, rawstori, research, say, three, year, act...",2
573736,"[todayinmak, wire, 2016, wa, pivot, year, war,...",1
466954,"[rt, soynoviodetoda, 2016, racist, sexist, cli...",1
...,...,...
22001,"[rt, ezlusztig, took, materi, global, warm, lg...",1
17856,"[rt, washingtonpost, climat, chang, could, bre...",2
384248,"[notiven, rt, nytimesworld, doe, trump, actual...",0
819732,"[rt, sara8smil, hey, liber, climat, chang, cra...",-1


# Now for some deep cleaning suggestions for alphanumeric words, emojis, and other word mixtures

In [11]:
for i in data_sorted["message"]:
    print(i)

['polyscimajor', 'epa', 'chief', 'doesnt', 'think', 'carbon', 'dioxid', 'main', 'caus', 'global', 'warm', 'wait', 'urlweb', 'via', 'mashabl']
['like', 'lack', 'evid', 'anthropogen', 'global', 'warm']
['rt', 'rawstori', 'research', 'say', 'three', 'year', 'act', 'climat', 'chang', 'befor', 'late', 'urlweb', 'urlweb…']
['todayinmak', 'wire', '2016', 'wa', 'pivot', 'year', 'war', 'climat', 'chang', 'urlweb']
['rt', 'soynoviodetoda', '2016', 'racist', 'sexist', 'climat', 'chang', 'deni', 'bigot', 'lead', 'poll', 'electionnight']
['worth', 'read', 'whether', 'dont', 'believ', 'climat', 'chang', 'urlweb', 'urlweb']
['rt', 'thenat', 'mike', 'penc', 'believ', 'global', 'warm', 'smoke', 'caus', 'lung', 'cancer', 'urlweb']
['rt', 'makeandmendlif', 'six', 'big', 'thing', 'today', 'fight', 'climat', 'chang', 'climat', 'activistã¢â‚¬â¦', 'urlweb', 'hã¢â‚¬â¦']
['aceofspadeshq', '8yo', 'nephew', 'inconsol', 'want', 'die', 'old', 'age', 'like', 'perish', 'fieri', 'hellscap', 'climat', 'chang']
['rt', 