In [62]:
import pandas as pd
import numpy as np 
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import nltk
nltk.download('punkt')#
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
nltk.download('wordnet')#
nltk.download('stopwords') #
import sys  
#!{sys.executable} -m pip install pyspellchecker 
#from spellchecker import SpellChecker 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Cyrill\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Cyrill\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Cyrill\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


First we import the training data. We encode it in utf-8.

In [63]:
train = pd.read_csv('train.csv',encoding ="utf-8" )


In [64]:
#train.drop(["id"], axis=1, inplace=True)

We put all the characters to lowercase

In [65]:
train["text"]=train["text"].str.lower()

In [66]:
train.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this #earthquake m...,1
1,4,,,forest fire near la ronge sask. canada,1
2,5,,,all residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,just got sent this photo from ruby #alaska as ...,1
5,8,,,#rockyfire update => california hwy. 20 closed...,1
6,10,,,#flood #disaster heavy rain causes flash flood...,1
7,13,,,i'm on top of the hill and i can see a fire in...,1
8,14,,,there's an emergency evacuation happening now ...,1
9,15,,,i'm afraid that the tornado is coming to our a...,1


We notice that the data contains: hashtags (# + text), usernames (@ + username), url's (http or https), contractions (for example: it's), ponctuation, citations that use apostrophes as citations marks ('shelter in place'), numbers and non alpha-numrical characters. In order to have a clean dataset we need to address these by:

<ul>
  <li>remove usernames, url's ponctuation, numbers, non aplha-numerical characters and apostrophes used as citation marks</li>
  <li>replace contractions by their uncontracted version</li>
  <li>remove the "#" sign from the hashtags, but keep the text (#wildfire => wildfire). </li>
</ul>

For the replacement of the contractions with their expansions we used a python dictionnary  provided by alko and arturomp @ stack overflow that found here: http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python. 


In [67]:
Contractions = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"gonna": "going to",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",


}

"""
       "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"

"""

'\n       "$" : " dollar ",\n    "€" : " euro ",\n    "4ao" : "for adults only",\n    "a.m" : "before midday",\n    "a3" : "anytime anywhere anyplace",\n    "aamof" : "as a matter of fact",\n    "acct" : "account",\n    "adih" : "another day in hell",\n    "afaic" : "as far as i am concerned",\n    "afaict" : "as far as i can tell",\n    "afaik" : "as far as i know",\n    "afair" : "as far as i remember",\n    "afk" : "away from keyboard",\n    "app" : "application",\n    "approx" : "approximately",\n    "apps" : "applications",\n    "asap" : "as soon as possible",\n    "asl" : "age, sex, location",\n    "atk" : "at the keyboard",\n    "ave." : "avenue",\n    "aymm" : "are you my mother",\n    "ayor" : "at your own risk", \n    "b&b" : "bed and breakfast",\n    "b+b" : "bed and breakfast",\n    "b.c" : "before christ",\n    "b2b" : "business to business",\n    "b2c" : "business to customer",\n    "b4" : "before",\n    "b4n" : "bye for now",\n    "b@u" : "back at you",\n    "bae" : "bef

In [68]:
def decontraction(total_text):
        
        for i in Contractions:
            total_text = total_text.replace( i, Contractions.get(i))
        
        return total_text

#apply decontraction function to column "text"
trainS['text'] = trainS['text'].map(decontraction)

In [69]:
def data_text_preprocess(total_text):
        
        #remove url's of type http or https
        total_text = re.sub('http\S+',' ', str(total_text))
        total_test = re.sub('https\S+',' ', str(total_text))
        #remove username
        total_text = re.sub('\@\S+',' ', str(total_text))
        #remove "#" sign but keep text of hashtag
        total_text = re.sub('\#',' ', str(total_text))
        #remove apostrophes used for citations, which are found at the beginnin or the end of words
        total_text = re.sub(r" \B'\b|\b'\B", " ", str(total_text))
        #keep only alphabetical values
        total_text = re.sub('[^a-z\n]', ' ', str(total_text))
       
        
       
        return total_text

In [70]:
train['text'] = train['text'].map(data_text_preprocess)
train.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake m...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are b...,1
3,6,,,people receive wildfires evacuation or...,1
4,7,,,just got sent this photo from ruby alaska as ...,1
5,8,,,rockyfire update california hwy closed...,1
6,10,,,flood disaster heavy rain causes flash flood...,1
7,13,,,i m on top of the hill and i can see a fire in...,1
8,14,,,there s an emergency evacuation happening now ...,1
9,15,,,i m afraid that the tornado is coming to our a...,1


#### Lemmatization

After basic cleaning we can proceed by lemmatizing the text. In order to do the most effective lemmatization we need to identify the part-of-speech of every word (noun, verb, adjective, adverb). If we don't do this, a lot of verbs in present continuous tense (-ing) are identified as nouns ending with "ing"(e.g. uprising) and won't be lemmatized.

In [71]:
def get_wordnet_pos(word):
   
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [72]:
lemmatizer = WordNetLemmatizer()

def lemmatization1(total_text):
    #in this function we do tokenization and lemmatization simultaneously
    total_text = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(total_text)]

    return total_text

In [73]:
train['text'] = train['text'].map(lemmatization1)
train.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deed, be, the, reason, of, this, earthqu...",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[all, resident, ask, to, shelter, in, place, b...",1
3,6,,,"[people, receive, wildfire, evacuation, order,...",1
4,7,,,"[just, get, sent, this, photo, from, ruby, ala...",1
5,8,,,"[rockyfire, update, california, hwy, close, in...",1
6,10,,,"[flood, disaster, heavy, rain, cause, flash, f...",1
7,13,,,"[i, m, on, top, of, the, hill, and, i, can, se...",1
8,14,,,"[there, s, an, emergency, evacuation, happen, ...",1
9,15,,,"[i, m, afraid, that, the, tornado, be, come, t...",1


Stopwords are non-relevant for our text analysis and will overload our word vectors, so we get rid of them by using a predefined stopword list provided by the nltk package.

In [74]:
stop_words = set(stopwords.words('english'))


In [75]:
def remove_stopwords(total_text):
    
    filtered_word_list = total_text[:]
    for word in total_text:
        if word in stop_words: 
            filtered_word_list.remove(word)
    
    #filtered_sentence = [w for w in total_text if not w in stop_words] 
    #filtered_sentence= []
    
    #for w in total_text: 
      #  if w not in stop_words: 
       #     filtered_sentence.append(w) 

    return filtered_word_list

In [76]:
train['text'] = train['text'].map(remove_stopwords)
train.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[deed, reason, earthquake, may, allah, forgive...",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[resident, ask, shelter, place, notify, office...",1
3,6,,,"[people, receive, wildfire, evacuation, order,...",1
4,7,,,"[get, sent, photo, ruby, alaska, smoke, wildfi...",1
5,8,,,"[rockyfire, update, california, hwy, close, di...",1
6,10,,,"[flood, disaster, heavy, rain, cause, flash, f...",1
7,13,,,"[top, hill, see, fire, wood]",1
8,14,,,"[emergency, evacuation, happen, building, acro...",1
9,15,,,"[afraid, tornado, come, area]",1


We noticed that the cleaning we've done until now results in some tokens containing only one character. Since they haven't got any meaning, we remove them. 

In [77]:
def removesinglelettertoken(total_text):
    emptylist=[]
    for w in total_text:
        if len(w)>1:
            emptylist.append(w)
    return emptylist

train['text'] = train['text'].map(removesinglelettertoken)

In [78]:
#spell = SpellChecker()

#def spellchecker(total_text):
   # for w in total_text: 
     #   spell.correction(w) 
   # return total_text

After inspecting the whole dataset, we remarked that there are duplicates in the column "text". While some of them are perfect duplicates (column "text" AND column "target" are the same), others contain contradictions in the target column. This can lead to a less optimal estimation of the words' coefficients and an overall poorer score. How did we check if a group of duplicated tweets contains contradictions? We calculated the mean of the targets for each duplicate group. Means of 1 or 0 indicate perfect duplicates without contradictions. Meanwhile 0 < mean < 1 indicates that a duplicate group contains at least one contradiction. Imperfect duplicates that affect negatively our analysis are those, that have a close to or perfectly balanced number of 0 and 1 targets (e.g. 11 duplicates, 6 have target = 0 , 5 have target = 1). The more balanced the contradicting target values are, the closer the mean is to 0.5. We dediced to only keep imperfect duplicates with mean < 0.3 or 0.7 < mean.

In [79]:
#drop_duplicate function only works with type string or int, so we need to reconvert the list of tokens to a single string

def reconcatenate(total_text):
    TOTAL_text=''
    for w in total_text:
        TOTAL_text=TOTAL_text + w + ' '
    return TOTAL_text
        

In [80]:
train['text']=train['text'].map(reconcatenate)
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquake may allah forgive,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,resident ask shelter place notify officer evac...,1
3,6,,,people receive wildfire evacuation order calif...,1
4,7,,,get sent photo ruby alaska smoke wildfire pour...,1


In [81]:
train['mean'] = train.groupby('text')['target'].transform(np.mean)



#keep duplicates that have target-mean smaller than 0.3 or greater than 0.7
TRAIN=train[(train['mean']>0.7) | (train['mean']<0.3)]
TRAIN['mean']=TRAIN['mean'].map(round)
#keep only one entry of perfect duplicates
TRAIN.drop_duplicates(subset ="text", keep = 'first', inplace = True)
TRAIN.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,id,keyword,location,text,target,mean
0,1,,,deed reason earthquake may allah forgive,1,1
1,4,,,forest fire near la ronge sask canada,1,1
2,5,,,resident ask shelter place notify officer evac...,1,1
3,6,,,people receive wildfire evacuation order calif...,1,1
4,7,,,get sent photo ruby alaska smoke wildfire pour...,1,1


In [82]:
def retokenize(total_text):
    nltk_tokens = nltk.word_tokenize(total_text)
    return nltk_tokens

In [83]:
TRAIN['text']=TRAIN['text'].map(retokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [84]:
# Save to csv
TRAIN.to_csv("trainclean.csv", index=False, encoding = 'utf-8')

Now we will do same cleaning procedure (except for the duplicate treatment part) with the test data. 

In [85]:
#load test data and put to lowercase
test = pd.read_csv('test.csv',encoding ="utf-8" )
test["text"]=test["text"].str.lower()


In [86]:
for i in [decontraction, data_text_preprocess, lemmatization1, remove_stopwords, removesinglelettertoken]:
    test['text']=test["text"].map(i)




In [87]:
# Save to csv
test.to_csv("testclean.csv", index=False, encoding = 'utf-8')