# 1. Preliminary

## 1.1 Context

* We will analyze a very well known NLP dataset: tweets from disaster


* It is a Kaggle competition, which offers a simple but good level textual dataset to be able to make its weapons in NLP


* The dataset is here [https://www.kaggle.com/competitions/nlp-getting-started/data]


* Please use the **train** dataset


* In this 1st part we are going to clean the text

## 1.2 Requirements

You have to install  : 

* pandas
* numpy
* matplotlib
* seaborn


* nltk
* wordcloud
* pillow

## 1.3 Imports

In [22]:
# builtin
import os, sys, time, random


# data
import pandas as pd
import numpy as np


# NLP
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# import spacy


# viz
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image

# import plotly as px


## 1.4 Downloads and options

In [23]:
# download

"""
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
"""

[nltk_data] Downloading package omw-1.4 to /home/alex/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/alex/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [24]:
# init sns

sns.set()

In [25]:
# init pandarallel

# pandarallel.initialize()

## 1.5 Loading data

In [26]:
# our file

data = "./data/cleaned/"
os.listdir(data)

['min_10_words.csv',
 'df.csv',
 'final_df.csv',
 'df_cleaned.csv',
 'unique_words.csv',
 'finad_df.csv',
 'min_5_words.csv']

In [27]:
# load dataframe

fn = data + 'df_cleaned.csv'
df = pd.read_csv(fn)
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


# 2. Work on a specific document

In [28]:
# select a random document

doc = df.text.sample(1)
doc = doc.values[0]
doc

'@holymileyray @moonIighthunty Focus on Me is going to obliterate careers tea'

## 2.1 Lower

In [29]:
# lower

doc = doc.lower()
doc

'@holymileyray @mooniighthunty focus on me is going to obliterate careers tea'

## 2.2 Tokenization

In [30]:
# tokenize

tokens = word_tokenize(doc)
tokens

['@',
 'holymileyray',
 '@',
 'mooniighthunty',
 'focus',
 'on',
 'me',
 'is',
 'going',
 'to',
 'obliterate',
 'careers',
 'tea']

In [None]:
len(tokens)

In [31]:
len(set(tokens))

12

In [None]:
def display_tokens_info(tokens) : 
    """display info about corpus """
    
    print(f"taille corpus {len(tokens)}, nb tokens uniques {len(set(tokens))}")
    print(tokens[:30])

In [33]:
# an other tokenize

tokens = wordpunct_tokenize(doc)
display_tokens_info(tokens)

['@',
 'holymileyray',
 '@',
 'mooniighthunty',
 'focus',
 'on',
 'me',
 'is',
 'going',
 'to',
 'obliterate',
 'careers',
 'tea']

## 2.3 Stopwords

In [36]:
# stop_words

stop_words = set(stopwords.words('english'))
print(stop_words)

{'hasn', "shan't", 'won', 'they', 'me', 'both', "haven't", 'them', 'at', 'of', 'did', "aren't", 'by', 'mightn', 'does', 'o', "wasn't", 'mustn', "shouldn't", 'some', 'wouldn', 'before', 'not', 'against', 'through', 'nor', 'i', 'other', 'didn', 'and', 'own', 'themselves', 'for', 'd', "she's", "doesn't", 'her', 'from', 'off', 'my', 'hers', 'have', 'between', "didn't", 'his', 'itself', 've', 'you', 'that', 'why', 'she', 'further', 'aren', "mustn't", 'been', 'there', 'hadn', 'whom', 'under', 'shouldn', 'again', 'same', 'few', 'all', 'if', 'should', 'ours', 'has', "weren't", 're', 'a', "won't", 'into', 'most', 'do', 'll', 'can', "don't", 'until', "needn't", "wouldn't", "hasn't", 'in', "you're", 'doing', 'are', 'or', 'don', 'am', 'being', 'those', 'no', 'only', 'isn', 'm', 'herself', 'as', 'with', 'its', 'their', 'your', "it's", 'here', 'any', 'after', 'yourselves', 'above', 'what', 'where', 'up', 'needn', 'our', 'out', 'yourself', 'how', 'will', 'during', 'it', 'because', 'ma', 'be', 'over',

In [37]:
tokens = [w for w in tokens if w not in stop_words]
display_tokens_info(tokens)

['@',
 'holymileyray',
 '@',
 'mooniighthunty',
 'focus',
 'going',
 'obliterate',
 'careers',
 'tea']

In [40]:
# an other tokensizer

tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(doc)
display_tokens_info(tokens)

['holymileyray',
 'mooniighthunty',
 'focus',
 'on',
 'me',
 'is',
 'going',
 'to',
 'obliterate',
 'careers',
 'tea']

In [43]:
# remove stopwords

tokens = [w for w in tokens if w not in stop_words]
display_tokens_info(tokens)

['holymileyray',
 'mooniighthunty',
 'focus',
 'going',
 'obliterate',
 'careers',
 'tea']

## 2.4 First cleaning function

In [46]:
def process_text_1(doc, rejoin=False) : 
    """basic function of text processing """
    
    # lower
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # stop words
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    if rejoin : 
        return " ".join(cleaned_tokens_list)
    
    return cleaned_tokens_list

In [47]:
tokens = process_text_1(doc)
display_tokens_info(tokens)

['holymileyray',
 'mooniighthunty',
 'focus',
 'going',
 'obliterate',
 'careers',
 'tea']

# 3. Working on the entire corpus

## 3.1 Build raw corpus

In [48]:
# join all corpus

raw_corpus = "".join(df.text.values)
raw_corpus[:1_000]

"Our Deeds are the Reason of this #earthquake May ALLAH Forgive us allForest fire near La Ronge Sask. CanadaAll residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected13,000 people receive #wildfires evacuation orders in California Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school #RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areasI'm on top of the hill and I can see a fire in the woods...There's an emergency evacuation happening now in the building across the streetI'm afraid that the tornado is coming to our area...Three people died from the heat wave so farHaha South Tampa is getting flooded hah- WAIT A SECOND I LIVE IN SOUTH TAMPA WHAT AM I GONNA DO WHAT AM I GONNA DO FVCK #flooding#raining #flooding #Florida #TampaBay #

In [None]:
# process the corpus

corpus = process_text_1(raw_corpus)
display_tokens_info(corpus)

In [51]:
# value counts

tmp = pd.Series(corpus).value_counts()
tmp

co                  4703
http                4231
https                405
amp                  342
like                 341
                    ... 
destructiontruck       1
salvages               1
7b2wf6ovfk             1
newsrepublican         1
ymy4rskq3d             1
Length: 22438, dtype: int64

In [52]:
# visualization

# sns.barplot(x=tmp.index, y=tmp.values)

In [53]:
# 30st most common tokens

tmp.head(30)

co            4703
http          4231
https          405
amp            342
like           341
û_             289
fire           230
get            226
via            216
2              204
people         189
new            183
one            181
news           166
emergency      145
disaster       143
video          136
would          133
body           127
police         122
still          120
3              119
u              117
crash          117
us             115
storm          114
back           113
day            112
know           112
california     110
dtype: int64

In [54]:
# 30st last common tokens

tmp.tail(30)

tydxauuenqhow          1
developer              1
hld5xlywbncrackdown    1
lmwkjsycgj             1
danhrothschild         1
greed                  1
takecare               1
cinla1964              1
windowgatribble        1
contrasts              1
foreboding             1
expansive              1
divisions              1
saturation             1
hue                    1
qbmcsjavt0fall         1
homebuyer              1
miscalculation         1
mwjcdkthere            1
workspace              1
forsee                 1
badkitty               1
lt3dave                1
specs                  1
lore                   1
destructiontruck       1
salvages               1
7b2wf6ovfk             1
newsrepublican         1
ymy4rskq3d             1
dtype: int64

In [55]:
tmp.describe()

count    22438.000000
mean         3.680096
std         43.379216
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max       4703.000000
dtype: float64

In [56]:
# sns.displot(tmp)

In [57]:
# sns.boxplot(tmp)

## 3.2 List rare tokens

In [58]:
# unique words --> not usefull

tmp = pd.Series(corpus).value_counts()
list_unique_words = tmp[tmp==1]
list_unique_words[:30]

dub                   1
mxhrextrkh            1
ctijdpxabkdogs        1
splatling             1
foothill              1
designsso             1
thatrussianman        1
waterboarding         1
writingtips           1
salmanmydarling       1
ps3                   1
xboxhttps             1
qr1l2jyuez            1
nester                1
switching             1
dipping               1
pantherattackthere    1
dieanpink95           1
limitsabe             1
yu_nita99             1
sivan                 1
pantherattacki        1
camilla_33            1
uooygbb6az            1
akq4rwjfvlcheck       1
skippy6gaming         1
slttorrlhswho         1
craykain              1
lavalet               1
basalt                1
dtype: int64

In [59]:
len(list_unique_words)

16230

In [60]:
list_unique_words = list(list_unique_words.index)
list_unique_words[:30]

['dub',
 'mxhrextrkh',
 'ctijdpxabkdogs',
 'splatling',
 'foothill',
 'designsso',
 'thatrussianman',
 'waterboarding',
 'writingtips',
 'salmanmydarling',
 'ps3',
 'xboxhttps',
 'qr1l2jyuez',
 'nester',
 'switching',
 'dipping',
 'pantherattackthere',
 'dieanpink95',
 'limitsabe',
 'yu_nita99',
 'sivan',
 'pantherattacki',
 'camilla_33',
 'uooygbb6az',
 'akq4rwjfvlcheck',
 'skippy6gaming',
 'slttorrlhswho',
 'craykain',
 'lavalet',
 'basalt']

In [61]:
# save it for later

tmp = pd.DataFrame({"words" : list_unique_words})
tmp.to_csv("data/cleaned/unique_words.csv", index=False)

In [62]:
# idem for min 5 times

tmp = pd.Series(corpus).value_counts()
list_min_5_words = tmp[tmp<=5]
list_min_5_words[:30]

motorcycle     5
blind          5
ices           5
remain         5
md             5
mental         5
loves          5
depth          5
extra          5
leaves         5
subs           5
judge          5
earners        5
operations     5
reduced        5
catch          5
stephen        5
quest          5
reviews        5
responsible    5
motor          5
flying         5
smithsonian    5
52             5
34             5
losses         5
desires        5
pulls          5
mood           5
tubestrike     5
dtype: int64

In [63]:
len(list_min_5_words)

20275

In [None]:
# save it 

list_min_5_words = list(list_min_5_words.index)
tmp = pd.DataFrame({"words" : list_min_10_words})
tmp.to_csv("data/cleaned/min_5_words.csv", index=False)

In [64]:
# idem for min 10 times

tmp = pd.Series(corpus).value_counts()
list_min_10_words = tmp[tmp<=10]
list_min_10_words[:30]

able           10
trees          10
complete       10
udhampur       10
seattle        10
word           10
michael        10
yyc            10
amazon         10
grows          10
jeb            10
afghanistan    10
picture        10
abandoned      10
ice            10
main           10
emotional      10
sit            10
colour         10
nice           10
tent           10
extreme        10
lmao           10
ii             10
loved          10
seeks          10
extremely      10
issue          10
either         10
incident       10
dtype: int64

In [65]:
len(list_min_10_words)

21158

In [66]:
# save it 

list_min_10_words = list(list_min_10_words.index)
tmp = pd.DataFrame({"words" : list_min_10_words})
tmp.to_csv("data/cleaned/min_10_words.csv", index=False)

## 3.3 2nd Cleaning function

In [67]:
def process_text_2(doc, 
                   rejoin=False, 
                   list_rare_words=None, 
                   min_len_word=3,
                   force_is_alpha=True) : 
    """cf process_text_1 but with list_unique_words, min_len_word, and force_is_alpha
    
    positional arguments : 
    -----------------------
    doc : str : the document (aka a text in str format) to process
    
    opt args : 
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens
    list_rare_words : list : a list of rare words to exclude
    min_len_word : int : the minimum length of words to not exclude
    force_is_alpha : int : if 1, exclude all tokens with a numeric character
    
    return : 
    ------------------------
    a string (if rejoin is True) or a list of tokens
    """
    
    # list_unique_words
    if not list_rare_words: 
        list_rare_words = []
        
    # lower
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # classics stopwords
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    
    ###########################################################
    ###########################################################
    
    # no rare tokens
    non_rare_tokens = [w for w in cleaned_tokens_list if w not in list_rare_words]
    
    # no more len words
    more_than_N =  [w for w in non_rare_tokens if len(w) >= min_len_word  ]
    
    # only alpha chars
    if force_is_alpha : 
        alpha_tokens = [w for w in more_than_N if w.isalpha()]
    else :
        alpha_tokens = more_than_N
        
    ###########################################################
    ###########################################################     
    
    # manage return type
    if rejoin : 
        return " ".join(alpha_tokens)
    
    return alpha_tokens

In [68]:
pd.Series(corpus).sample(30)

8228            watched
76087                co
43740                 w
17831              hope
46693             heavy
49410                20
34516    rebeccaforreal
60426           content
14544              come
66576                3g
31278              days
49233          wireless
60798            toward
10628              spot
46025        skdbot7tgf
56456                co
58884             going
54144               gun
17824             munch
31182               fan
6224               http
49474         hurricane
8590        wealilknowa
71937             saudi
22098              goku
57098        auntiedote
199              ablaze
55878              http
57798                co
67625              june
dtype: object

In [69]:
len(set(corpus))

22438

In [70]:
corpus = process_text_2(raw_corpus, list_rare_words=list_unique_words, rejoin=False)
pd.Series(corpus).sample(30)

49348     hiroshima
42725          http
22656         every
52520           may
38232         plant
43245         would
37466      disaster
5492           blew
45691       another
6235           done
46257          know
34193        dunbar
43743          ruin
51061        summer
33625    government
13314         hills
23569         thurs
44754          http
44020           get
45517          haha
52130      declares
11669         china
22503      freezing
39113         times
18996      occurred
14896          soon
46312          time
31147     dangerous
55025        killed
14514          http
dtype: object

In [71]:
len(set(corpus))

5705

## 3.4 Stem and Lem

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item. Lemmatization is similar to stemming but it brings context to the words. So it links words with similar meanings to one word. 

Stemming is the process of producing morphological variants of a root/base word. Stemming programs are commonly referred to as stemming algorithms or stemmers.

In [72]:
doc = "I have 3 dogs, they was all black. Now they are all white but one of my dog is my favorite"

In [73]:
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(doc.lower())
print(tokens)

['i', 'have', '3', 'dogs', 'they', 'was', 'all', 'black', 'now', 'they', 'are', 'all', 'white', 'but', 'one', 'of', 'my', 'dog', 'is', 'my', 'favorite']


In [74]:
trans = PorterStemmer()
trans_text = [trans.stem(i) for i in tokens ]
print(trans_text)

['i', 'have', '3', 'dog', 'they', 'wa', 'all', 'black', 'now', 'they', 'are', 'all', 'white', 'but', 'one', 'of', 'my', 'dog', 'is', 'my', 'favorit']


In [75]:
trans = WordNetLemmatizer()
trans_text = [trans.lemmatize(i) for i in tokens ]
print(trans_text)

['i', 'have', '3', 'dog', 'they', 'wa', 'all', 'black', 'now', 'they', 'are', 'all', 'white', 'but', 'one', 'of', 'my', 'dog', 'is', 'my', 'favorite']


## 3.5 3rd cleaning function

In [76]:
def process_text_3(doc, 
                   rejoin=False, 
                   lemm_or_stemm="stem",
                   list_rare_words=None, 
                   min_len_word=3,
                   force_is_alpha=True) : 
    """cf process_text_2 but with stemm or lem
    
    positional arguments : 
    -----------------------
    doc : str : the document (aka a text in str format) to process
    
    opt args : 
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens
    lemm_or_stemm : str : if lem do lemmentize else stemmentize  
    list_rare_words : list : a list of rare words to exclude
    min_len_word : int : the minimum length of words to not exclude
    force_is_alpha : int : if 1, exclude all tokens with a numeric character
    
    return : 
    ------------------------
    a string (if rejoin is True) or a list of tokens
    """
    
 
    # list_unique_words
    if not list_rare_words: 
        list_rare_words = []
        
    # lower
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # classics stopwords
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    # no rare tokens
    non_rare_tokens = [w for w in cleaned_tokens_list if w not in list_rare_words]
    
    # no more len words
    more_than_N =  [w for w in non_rare_tokens if len(w) >= min_len_word  ]
    
    # only alpha chars
    if force_is_alpha : 
        alpha_tokens = [w for w in more_than_N if w.isalpha()]
    else :
        alpha_tokens = more_than_N

    ###########################################################
    ###########################################################
    
    # stem or lem
    if lemm_or_stemm == "lem" : 
        trans = WordNetLemmatizer()
        trans_text = [trans.lemmatize(i) for i in alpha_tokens ]
    else : 
        trans = PorterStemmer()
        trans_text = [trans.stem(i) for i in alpha_tokens ]
        
     ###########################################################
     ###########################################################
    
    # manage return type
    if rejoin : 
        return " ".join(trans_text)
    
    return trans_text
    

In [77]:
corpus = process_text_3(raw_corpus, rejoin=False, list_rare_words=list_unique_words)
pd.Series(corpus).sample(30)

41284           updat
8440             http
10676            work
47264          turkey
40523          bounti
33228         florida
21271           creat
12937            work
36215           white
25981            ebay
31599             via
34891        restrict
35723           heard
51664            like
34420             add
14925            yall
52096    nasahurrican
12233         collaps
13166           learn
40377          gunman
48868            tree
8169             drop
55718           debat
41030            http
7457             bodi
26594           crash
22540      electrocut
18472        destruct
44370           shell
41974       entertain
dtype: object

In [78]:
len(set(corpus))

4420

In [79]:
pd.Series( words.words()).sample(30)

96512        interneciary
122872             neanic
36522        circularizer
133393            outcase
86589          Holodiscus
131767       orbicularity
24068           boarspear
230268             wallop
87285             hookman
47757         cypressroot
184913    somnambulically
66652              exodos
187461           sporange
216458         unfendered
214487          underpile
221689     unremunerating
22370             biltong
119952           mottling
96494              intern
75782       gastrostomize
65951             evestar
37720          clodhopper
188432        stalactitic
198029        tangleberry
228853         Vineyarder
234729          yohimbine
16889            axometry
181024           sickling
85818        Hildebrandic
187631           spraggly
dtype: object

## 3.5 Only english words

In [80]:
len(set(words.words()))

235892

In [81]:
eng_words = [i.lower() for i in words.words()]
eng_words[:30]

['a',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic',
 'aaronical',
 'aaronite',
 'aaronitic',
 'aaru',
 'ab',
 'aba',
 'ababdeh',
 'ababua',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'abadite',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'abama',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'abanic',
 'abantes',
 'abaptiston',
 'abarambo',
 'abaris',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'abasgi',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'abassin',
 'abastardize',
 'abatable',
 'abate

In [82]:
len(set(eng_words))

234377

In [83]:
ps = PorterStemmer()
eng_words_stem = [ps.stem(i) for i in eng_words]
pd.Series(eng_words_stem).sample(30)

43102        coriparian
112007            mason
183961          snipnos
56442         doctoress
155695          primula
167965         resplend
82035         hamirostr
29988          capitoul
170809          roomthi
86799             homer
43001            cordon
46130        crushingli
39945          commeddl
159326       psychosexu
81521           hackman
131471      opisthodomu
58063           drugman
101767            kevin
110079         magnetod
119298          moonris
35706      chromatophil
78925            gopher
42170        contradebt
76919          gesneria
211777    unbeseemingli
37980     clypeastridea
174736          schemat
94879               ink
179259          seventh
211589           unbapt
dtype: object

In [84]:
len(set(eng_words_stem))

178311

In [85]:
lm = WordNetLemmatizer()
eng_words_lem = [lm.lemmatize(i) for i in eng_words]
pd.Series(eng_words_lem).sample(30)

235216               ziamet
71941               footway
157428            proseucha
198911             taxiauto
229528        voicelessness
120803        munchausenize
59397           ecospecific
235366              zoarial
59102             easternly
17818                balaam
141882           pentaquine
17560                badaga
34200                chelys
140092          passivation
189697           sticktight
175996           scrobicule
101841                khaya
34958             chloremia
217874                 unie
57278             doughfoot
9208                 ansate
127523    nonsubstantiation
66982               explode
7356            amygdalitis
59737              eelspear
34465           chickenhood
19665             beaverish
146966                pined
223521          unstumbling
224499           untyrannic
dtype: object

In [86]:
len(eng_words_lem)

236736

## 3.6 4th cleaning function

In [87]:
def process_text_4(doc, 
                   rejoin=False, 
                   lemm_or_stemm="stem",
                   list_rare_words=None, 
                   min_len_word=3,
                   force_is_alpha=True, 
                   eng_words=None) : 
    """cf process_text_3 but with selection of only english words
    
    positional arguments : 
    -----------------------
    doc : str : the document (aka a text in str format) to process
    
    opt args : 
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens
    lemm_or_stemm : str : if lem do lemmentize else stemmentize  
    list_rare_words : list : a list of rare words to exclude
    min_len_word : int : the minimum length of words to not exclude
    force_is_alpha : int : if 1, exclude all tokens with a numeric character
    eng_words : list : list of english words
    
    return : 
    ------------------------
    a string (if rejoin is True) or a list of tokens
    """
    
    # list_unique_words
    if not list_rare_words: 
        list_rare_words = []
        
    # lower
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # classics stopwords
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    # no rare tokens
    non_rare_tokens = [w for w in cleaned_tokens_list if w not in list_rare_words]
    
    # no more len words
    more_than_N =  [w for w in non_rare_tokens if len(w) >= min_len_word  ]
    
    # only alpha chars
    if force_is_alpha : 
        alpha_tokens = [w for w in more_than_N if w.isalpha()]
    else :
        alpha_tokens = more_than_N

    # stem or lem
    if lemm_or_stemm == "lem" : 
        trans = WordNetLemmatizer()
        trans_text = [trans.lemmatize(i) for i in alpha_tokens ]
    else : 
        trans = PorterStemmer()
        trans_text = [trans.stem(i) for i in alpha_tokens ]

    ###########################################################
    ###########################################################
        
    # in english 
    if eng_words :
        engl_text = [i for i in trans_text if i in eng_words]
    else :
        engl_text = trans_text
    
    ###########################################################
    ###########################################################
        
    #  return a list or a string
    if rejoin : 
        return " ".join(engl_text)
    
    return engl_text
    

In [88]:
corpus = process_text_4(raw_corpus, rejoin=False, list_rare_words=list_unique_words, eng_words=eng_words_stem)
corpus[:30]

['deed',
 'reason',
 'earthquak',
 'may',
 'allah',
 'forgiv',
 'fire',
 'near',
 'resid',
 'ask',
 'shelter',
 'place',
 'offic',
 'evacu',
 'shelter',
 'place',
 'order',
 'peopl',
 'receiv',
 'wildfir',
 'evacu',
 'order',
 'california',
 'got',
 'sent',
 'photo',
 'alaska',
 'smoke',
 'wildfir',
 'school']

In [89]:
len(set(corpus))

3461

In [90]:
len(df)

7503

In [91]:
list_unique_words[:30]

['dub',
 'mxhrextrkh',
 'ctijdpxabkdogs',
 'splatling',
 'foothill',
 'designsso',
 'thatrussianman',
 'waterboarding',
 'writingtips',
 'salmanmydarling',
 'ps3',
 'xboxhttps',
 'qr1l2jyuez',
 'nester',
 'switching',
 'dipping',
 'pantherattackthere',
 'dieanpink95',
 'limitsabe',
 'yu_nita99',
 'sivan',
 'pantherattacki',
 'camilla_33',
 'uooygbb6az',
 'akq4rwjfvlcheck',
 'skippy6gaming',
 'slttorrlhswho',
 'craykain',
 'lavalet',
 'basalt']

In [92]:
len(list_unique_words)

16230

In [93]:
list_min_5_words[:30]

motorcycle     5
blind          5
ices           5
remain         5
md             5
mental         5
loves          5
depth          5
extra          5
leaves         5
subs           5
judge          5
earners        5
operations     5
reduced        5
catch          5
stephen        5
quest          5
reviews        5
responsible    5
motor          5
flying         5
smithsonian    5
52             5
34             5
losses         5
desires        5
pulls          5
mood           5
tubestrike     5
dtype: int64

In [94]:
len(list_min_5_words)

20275

In [95]:
corpus = process_text_4(raw_corpus, rejoin=False, list_rare_words=list_min_5_words, eng_words=eng_words_stem)
corpus[:30]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
len(set(corpus))

In [None]:
tmp = pd.Series(corpus).value_counts()
tmp

In [None]:
# sns.barplot(tmp.index, tmp.values)

## 3.7 Wordcloud

In [None]:
wordcloud = WordCloud(background_color='white', 
                      stopwords=[], 
                      max_words=50).generate(" ".join(corpus))
plt.imshow(wordcloud)
plt.axis("off")
plt.show();

# 4. Divide the corpus

## 4.1 Separate 0 / 1

In [None]:
df_1 = df[df.target == 1]
df_0 = df[df.target == 0]

In [None]:
corpus_1 = " ".join(df_1.text)
corpus_0 = " ".join(df_0.text)

## 4.2 Process boths of them

In [None]:
corpus_1 = process_text_4(corpus_1, 
                          rejoin=False, 
                          list_rare_words=list_min_5_words, 
                          eng_words=eng_words_stem)

corpus_0 = process_text_4(corpus_0, 
                          rejoin=False, 
                          list_rare_words=list_min_5_words, 
                          eng_words=eng_words_stem)

In [None]:
wordcloud = WordCloud(background_color='white', 
                      stopwords=[], 
                      max_words = 50).generate(" ".join(corpus_1))
plt.imshow(wordcloud)
plt.axis("off")
plt.show();

In [None]:
wordcloud = WordCloud(background_color='white', 
                      stopwords=[], max_words=50).generate(" ".join(corpus_0))
plt.imshow(wordcloud)
plt.axis("off")
plt.show();

In [None]:
pd.Series(corpus_1).value_counts().head(20)

In [None]:
pd.Series(corpus_0).value_counts().head(20)

In [None]:
[i for i in pd.Series(corpus_1).value_counts().head(10).index 
     if i in pd.Series(corpus_0).value_counts().head(10).index]

## 4.3 5th cleaning function

In [None]:
def process_text_5(doc, 
                   rejoin=True, 
                   lemm_or_stemm = "stem", 
                   list_rare_words=None, 
                   min_len_word=3, 
                   eng_words=None) : 
    """df v4 but exclude amp"""
 
    # list_unique_words
    if not list_rare_words: 
        list_rare_words = []
        
    # lower and strip
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # remove stop words
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    # drop rare tokens
    non_rare_tokens_list = [w for w in cleaned_tokens_list if w not in list_rare_words]
    
    # keep only len word > N
    more_than_N =  [w for w in non_rare_tokens_list if len(w) >= 3 ]
    
    # keep only alpha not num
    alpha_num = [w for w in more_than_N if w.isalpha()]
    
    # stem or lem
    if lemm_or_stemm == "lem" : 
        trans = WordNetLemmatizer()
        trans_text = [trans.lemmatize(i) for i in alpha_num ]
    else : 
        trans = PorterStemmer()
        trans_text = [trans.stem(i) for i in alpha_num ]
        
    # in english 
    if eng_words :
        engl_text = [i for i in trans_text if i in eng_words]
    else :
        engl_text = trans_text
        
    ##########################################
    ##########################################
    
    # amp
    engl_text = [i for i in engl_text if i!="amp"]
    
    ##########################################
    ##########################################
    
    #  return a list or a string
    if rejoin : 
        return " ".join(engl_text)
    
    return engl_text

# 5. Final clean

In [None]:
def final_clean(doc) : 
    
    new_doc = process_text_5(doc,rejoin=True, 
                             stem_or_lem="stem", 
                             list_rare_words=list_min_5_words, 
                             eng_words=eng_words_lem)
    return  new_doc

In [None]:
df["clean_text"] = df.text.apply(final_clean)

In [None]:
df.sample(20)

In [None]:
df.to_csv("data/cleaned/final_df.csv", index=False)