# 1. Preliminary

## 1.1 Context

* We will analyze a very well known NLP dataset: tweets from disaster


* It is a Kaggle competition, which offers a simple but good level textual dataset to be able to make its weapons in NLP


* The dataset is here [https://www.kaggle.com/competitions/nlp-getting-started/data]


* Please use the **train** dataset


* In this 1st part we are going to clean the text

## 1.2 Requirements

You have to install  : 

* pandas
* numpy
* matplotlib
* seaborn


* nltk
* wordcloud
* pillow

## 1.3 Imports

In [1]:
# builtin
import os, sys, time, random


# data
import pandas as pd
import numpy as np


# NLP
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# import spacy


# viz
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image

# import plotly as px


## 1.4 Downloads and options

In [2]:
# download

nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package omw-1.4 to /home/alex/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/alex/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
# init sns

sns.set()

In [4]:
# init pandarallel

# pandarallel.initialize()

## 1.5 Loading data

In [5]:
# our file

data = "./data/cleaned/"
os.listdir(data)

['min_10_words.csv',
 'df.csv',
 'final_df.csv',
 'df_cleaned.csv',
 'unique_words.csv',
 'finad_df.csv',
 'min_5_words.csv']

In [6]:
# load dataframe

fn = data + 'df_cleaned.csv'
df = pd.read_csv(fn)
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


# 2. Work on a specific document

In [7]:
# select a random document

doc = df.text.sample(1)
doc = doc.values[0]
doc

"@kirkmin after listening to you demolish @BartHubbuch on @weei I can't wait to bait my patriot hater co-workers into a Brady discussion"

## 2.1 Lower

In [29]:
# lower

doc = doc.lower()
doc

"@kirkmin after listening to you demolish @barthubbuch on @weei i can't wait to bait my patriot hater co-workers into a brady discussion"

## 2.2 Tokenization

In [9]:
# tokenize

tokens = word_tokenize(doc)
tokens

['@',
 'kirkmin',
 'after',
 'listening',
 'to',
 'you',
 'demolish',
 '@',
 'barthubbuch',
 'on',
 '@',
 'weei',
 'i',
 'ca',
 "n't",
 'wait',
 'to',
 'bait',
 'my',
 'patriot',
 'hater',
 'co-workers',
 'into',
 'a',
 'brady',
 'discussion']

In [10]:
len(tokens)

26

In [11]:
# an other tokenize

tokens = wordpunct_tokenize(doc)
tokens

['@',
 'kirkmin',
 'after',
 'listening',
 'to',
 'you',
 'demolish',
 '@',
 'barthubbuch',
 'on',
 '@',
 'weei',
 'i',
 'can',
 "'",
 't',
 'wait',
 'to',
 'bait',
 'my',
 'patriot',
 'hater',
 'co',
 '-',
 'workers',
 'into',
 'a',
 'brady',
 'discussion']

In [12]:
len(tokens)

29

## 2.3 Stopwords

In [13]:
# stop_words

stop_words = set(stopwords.words('english'))
print(stop_words)

{'off', "should've", 'shan', 'i', 'myself', "you'll", 'o', 'my', "couldn't", 'which', 'me', "didn't", 'wasn', "wasn't", 'these', 'doing', 'this', 'below', 'her', 'be', 'too', "weren't", 'your', 'they', 'that', 'most', 'with', 'who', 'had', 'she', 'the', 'd', 'doesn', 'our', 'being', 'further', 'we', 'other', 'shouldn', 'having', 'hasn', 'through', 'all', 'what', 'their', 'those', 'of', 'each', 'wouldn', 'but', 'did', "mustn't", "shan't", 'down', 'have', "it's", "doesn't", 'before', 'his', 'very', 'few', 'any', 'on', 'hers', "aren't", 'own', 'aren', 'can', 'whom', 'nor', "don't", 'itself', 'hadn', 'ma', 'ours', 'here', "shouldn't", 'after', 'am', 'was', 'if', 'between', "won't", 'an', 'both', 'couldn', 'while', 'for', 'it', 'will', 'only', 'again', 'him', 'how', 'isn', 'against', 'about', 'to', "isn't", 'is', 'and', "that'll", 'are', 'than', 'needn', "needn't", "she's", 'so', 'same', 'themselves', 'now', 'll', 'didn', 'there', 'from', 'don', 'you', 'where', 'out', "you're", 'a', 'does',

In [14]:
tokens = [w for w in tokens if w not in stop_words]
tokens

['@',
 'kirkmin',
 'listening',
 'demolish',
 '@',
 'barthubbuch',
 '@',
 'weei',
 "'",
 'wait',
 'bait',
 'patriot',
 'hater',
 'co',
 '-',
 'workers',
 'brady',
 'discussion']

In [15]:
len(tokens)

18

In [16]:
# an other tokensizer

tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(doc)
tokens

['kirkmin',
 'after',
 'listening',
 'to',
 'you',
 'demolish',
 'barthubbuch',
 'on',
 'weei',
 'i',
 'can',
 't',
 'wait',
 'to',
 'bait',
 'my',
 'patriot',
 'hater',
 'co',
 'workers',
 'into',
 'a',
 'brady',
 'discussion']

In [17]:
len(tokens)

24

In [18]:
# remove stopwords

tokens = [w for w in tokens if w not in stop_words]
tokens

['kirkmin',
 'listening',
 'demolish',
 'barthubbuch',
 'weei',
 'wait',
 'bait',
 'patriot',
 'hater',
 'co',
 'workers',
 'brady',
 'discussion']

In [19]:
len(tokens)

13

## 2.4 First cleaning function

In [20]:
def process_text_1(doc, rejoin=False) : 
    """basic function of text processing """
    
    # lower
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # stop words
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    if rejoin : 
        return " ".join(cleaned_tokens_list)
    
    return cleaned_tokens_list

In [21]:
process_text_1(doc)

['kirkmin',
 'listening',
 'demolish',
 'barthubbuch',
 'weei',
 'wait',
 'bait',
 'patriot',
 'hater',
 'co',
 'workers',
 'brady',
 'discussion']

# 3. Working on the entire corpus

## 3.1 Build raw corpus

In [22]:
# join all corpus

raw_corpus = "".join(df.text.values)
raw_corpus[:1_000]

"Our Deeds are the Reason of this #earthquake May ALLAH Forgive us allForest fire near La Ronge Sask. CanadaAll residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected13,000 people receive #wildfires evacuation orders in California Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school #RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areasI'm on top of the hill and I can see a fire in the woods...There's an emergency evacuation happening now in the building across the streetI'm afraid that the tornado is coming to our area...Three people died from the heat wave so farHaha South Tampa is getting flooded hah- WAIT A SECOND I LIVE IN SOUTH TAMPA WHAT AM I GONNA DO WHAT AM I GONNA DO FVCK #flooding#raining #flooding #Florida #TampaBay #

In [23]:
# process the corpus

corpus = process_text_1(raw_corpus)
corpus

['deeds',
 'reason',
 'earthquake',
 'may',
 'allah',
 'forgive',
 'us',
 'allforest',
 'fire',
 'near',
 'la',
 'ronge',
 'sask',
 'canadaall',
 'residents',
 'asked',
 'shelter',
 'place',
 'notified',
 'officers',
 'evacuation',
 'shelter',
 'place',
 'orders',
 'expected13',
 '000',
 'people',
 'receive',
 'wildfires',
 'evacuation',
 'orders',
 'california',
 'got',
 'sent',
 'photo',
 'ruby',
 'alaska',
 'smoke',
 'wildfires',
 'pours',
 'school',
 'rockyfire',
 'update',
 'california',
 'hwy',
 '20',
 'closed',
 'directions',
 'due',
 'lake',
 'county',
 'fire',
 'cafire',
 'wildfires',
 'flood',
 'disaster',
 'heavy',
 'rain',
 'causes',
 'flash',
 'flooding',
 'streets',
 'manitou',
 'colorado',
 'springs',
 'areasi',
 'top',
 'hill',
 'see',
 'fire',
 'woods',
 'emergency',
 'evacuation',
 'happening',
 'building',
 'across',
 'streeti',
 'afraid',
 'tornado',
 'coming',
 'area',
 'three',
 'people',
 'died',
 'heat',
 'wave',
 'farhaha',
 'south',
 'tampa',
 'getting',
 'flo

In [24]:
len(set(corpus))

22438

In [25]:
# value counts

tmp = pd.Series(corpus).value_counts()
tmp

co                  4703
http                4231
https                405
amp                  342
like                 341
                    ... 
destructiontruck       1
salvages               1
7b2wf6ovfk             1
newsrepublican         1
ymy4rskq3d             1
Length: 22438, dtype: int64

In [26]:
# visualization

# sns.barplot(x=tmp.index, y=tmp.values)

In [27]:
# 30st most common tokens

tmp.head(30)

co            4703
http          4231
https          405
amp            342
like           341
û_             289
fire           230
get            226
via            216
2              204
people         189
new            183
one            181
news           166
emergency      145
disaster       143
video          136
would          133
body           127
police         122
still          120
3              119
u              117
crash          117
us             115
storm          114
back           113
day            112
know           112
california     110
dtype: int64

In [28]:
# 30st last common tokens

tmp.tail(30)

tydxauuenqhow          1
developer              1
hld5xlywbncrackdown    1
lmwkjsycgj             1
danhrothschild         1
greed                  1
takecare               1
cinla1964              1
windowgatribble        1
contrasts              1
foreboding             1
expansive              1
divisions              1
saturation             1
hue                    1
qbmcsjavt0fall         1
homebuyer              1
miscalculation         1
mwjcdkthere            1
workspace              1
forsee                 1
badkitty               1
lt3dave                1
specs                  1
lore                   1
destructiontruck       1
salvages               1
7b2wf6ovfk             1
newsrepublican         1
ymy4rskq3d             1
dtype: int64

In [36]:
tmp.describe()

count    22438.000000
mean         3.680096
std         43.379216
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max       4703.000000
dtype: float64

In [None]:
# sns.displot(tmp)

In [None]:
# sns.boxplot(tmp)

## 3.2 List rare tokens

In [34]:
# unique words --> not usefull

tmp = pd.Series(corpus).value_counts()
list_unique_words = tmp[tmp==1]
list_unique_words[:30]

dub                   1
mxhrextrkh            1
ctijdpxabkdogs        1
splatling             1
foothill              1
designsso             1
thatrussianman        1
waterboarding         1
writingtips           1
salmanmydarling       1
ps3                   1
xboxhttps             1
qr1l2jyuez            1
nester                1
switching             1
dipping               1
pantherattackthere    1
dieanpink95           1
limitsabe             1
yu_nita99             1
sivan                 1
pantherattacki        1
camilla_33            1
uooygbb6az            1
akq4rwjfvlcheck       1
skippy6gaming         1
slttorrlhswho         1
craykain              1
lavalet               1
basalt                1
dtype: int64

In [35]:
len(list_unique_words)

16230

In [36]:
list_unique_words = list(list_unique_words.index)
list_unique_words[:30]

['dub',
 'mxhrextrkh',
 'ctijdpxabkdogs',
 'splatling',
 'foothill',
 'designsso',
 'thatrussianman',
 'waterboarding',
 'writingtips',
 'salmanmydarling',
 'ps3',
 'xboxhttps',
 'qr1l2jyuez',
 'nester',
 'switching',
 'dipping',
 'pantherattackthere',
 'dieanpink95',
 'limitsabe',
 'yu_nita99',
 'sivan',
 'pantherattacki',
 'camilla_33',
 'uooygbb6az',
 'akq4rwjfvlcheck',
 'skippy6gaming',
 'slttorrlhswho',
 'craykain',
 'lavalet',
 'basalt']

In [37]:
# save it for later

tmp = pd.DataFrame({"words" : list_unique_words})
tmp.to_csv("data/cleaned/unique_words.csv", index=False)

In [38]:
tmp = pd.Series(corpus).value_counts()
list_min_5_words = tmp[tmp<=5]
list_min_5_words[:30]

motorcycle     5
blind          5
ices           5
remain         5
md             5
mental         5
loves          5
depth          5
extra          5
leaves         5
subs           5
judge          5
earners        5
operations     5
reduced        5
catch          5
stephen        5
quest          5
reviews        5
responsible    5
motor          5
flying         5
smithsonian    5
52             5
34             5
losses         5
desires        5
pulls          5
mood           5
tubestrike     5
dtype: int64

In [39]:
len(list_min_5_words)

20275

In [40]:
tmp = pd.Series(corpus).value_counts()
list_min_10_words = tmp[tmp<=10]
list_min_10_words[:30]

able           10
trees          10
complete       10
udhampur       10
seattle        10
word           10
michael        10
yyc            10
amazon         10
grows          10
jeb            10
afghanistan    10
picture        10
abandoned      10
ice            10
main           10
emotional      10
sit            10
colour         10
nice           10
tent           10
extreme        10
lmao           10
ii             10
loved          10
seeks          10
extremely      10
issue          10
either         10
incident       10
dtype: int64

In [41]:
len(list_min_10_words)

21158

In [42]:
list_min_10_words = list(list_min_10_words.index)
tmp = pd.DataFrame({"words" : list_min_10_words})
tmp.to_csv("data/cleaned/min_10_words.csv", index=False)

## 3.3 2nd Cleaning function

In [43]:
def process_text_2(doc, 
                   rejoin=False, 
                   list_rare_words=None, 
                   min_len_word=3,
                   force_is_alpha=True) : 
    """cf process_text_1 but with list_unique_words, min_len_word, and force_is_alpha
    
    positional arguments : 
    -----------------------
    doc : str : the document (aka a text in str format) to process
    
    opt args : 
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens
    list_rare_words : list : a list of rare words to exclude
    min_len_word : int : the minimum length of words to not exclude
    force_is_alpha : int : if 1, exclude all tokens with a numeric character
    
    return : 
    ------------------------
    a string (if rejoin is True) or a list of tokens
    """
    
    # list_unique_words
    if not list_rare_words: 
        list_rare_words = []
        
    # lower
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # classics stopwords
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    # no rare tokens
    non_rare_tokens = [w for w in cleaned_tokens_list if w not in list_rare_words]
    
    # no more len words
    more_than_N =  [w for w in non_rare_tokens if len(w) >= min_len_word  ]
    
    # only alpha chars
    if force_is_alpha : 
        alpha_tokens = [w for w in more_than_N if w.isalpha()]
    else :
        alpha_tokens = more_than_N
    
    # manage return type
    if rejoin : 
        return " ".join(alpha_tokens)
    
    return alpha_tokens

In [44]:
pd.Series(corpus).sample(30)

73555            poses
15604             http
72983               ar
37751               im
60177             http
74331            movie
66272        literally
39889         fatality
35225            fully
67158         hakogaku
61846    thedailybeast
69915         offshoot
17760               co
75013             back
76977           player
8887             blood
60890            three
28283            loose
55098             done
49290        shipwreck
31766          migrant
17099            cliff
17267              tax
11638            years
27593             free
14047    hksbmijqz1the
63950           rubble
76384             http
55482         disaster
57141     obliteration
dtype: object

In [45]:
len(set(corpus))

22438

In [46]:
corpus = process_text_2(raw_corpus, list_rare_words=list_unique_words, rejoin=False)
pd.Series(corpus).sample(30)

15618          ass
5429         began
46730        wanna
11444     chemical
29903      service
36959       barack
34411         data
46785         onto
29520       blames
36166     murderer
54571     solitude
6275     landscape
34324      extreme
16929          sat
45595         sure
55663       debate
50772     released
52779      landing
18731      destroy
53294      provide
35126          bed
51968     southern
51042    survivors
24368     evacuate
6054         blood
36939         wait
53989    whirlwind
31874         drop
55244      gunfire
2998         calif
dtype: object

In [47]:
len(set(corpus))

5705

## 3.4 Stem and Lem

In [None]:
txt = ""

## 3.5 3rd cleaning function

In [50]:
def process_text_3(doc, 
                   rejoin=False, 
                   lemm_or_stemm="stem",
                   list_rare_words=None, 
                   min_len_word=3,
                   force_is_alpha=True) : 
    """cf process_text_1 but with list_unique_words, min_len_word, and force_is_alpha
    
    positional arguments : 
    -----------------------
    doc : str : the document (aka a text in str format) to process
    
    opt args : 
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens
    lemm_or_stemm : str : if lem do lemmentize else stemmentize  
    list_rare_words : list : a list of rare words to exclude
    min_len_word : int : the minimum length of words to not exclude
    force_is_alpha : int : if 1, exclude all tokens with a numeric character
    
    return : 
    ------------------------
    a string (if rejoin is True) or a list of tokens
    """
    
 
    # list_unique_words
    if not list_rare_words: 
        list_rare_words = []
        
    # lower
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # classics stopwords
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    # no rare tokens
    non_rare_tokens = [w for w in cleaned_tokens_list if w not in list_rare_words]
    
    # no more len words
    more_than_N =  [w for w in non_rare_tokens if len(w) >= min_len_word  ]
    
    # only alpha chars
    if force_is_alpha : 
        alpha_tokens = [w for w in more_than_N if w.isalpha()]
    else :
        alpha_tokens = more_than_N

    # stem or lem
    if lemm_or_stemm == "lem" : 
        trans = WordNetLemmatizer()
        trans_text = [trans.lemmatize(i) for i in alpha_tokens ]
    else : 
        trans = PorterStemmer()
        trans_text = [trans.stem(i) for i in alpha_tokens ]
        
    # manage return type
    if rejoin : 
        return " ".join(trans_text)
    
    return trans_text
    

In [51]:
corpus = process_text_3(raw_corpus, rejoin=False, list_rare_words=list_unique_words)
pd.Series(corpus).sample(30)

34563        much
23134        much
18076       accid
21768        year
1181     frontlin
29056     myanmar
11994        came
35844       worst
24902         ûïa
24820       evacu
44506        hell
13519        bear
44816        http
40752        http
5143         give
40073        long
40190      stress
52354       dixon
2803         http
54656        year
20387    starbuck
32042        dijk
3152         miss
41880        http
35482        news
49757     thunder
547         right
19620       senso
39781        bout
4845        write
dtype: object

In [52]:
len(set(corpus))

4420

In [53]:
pd.Series( words.words()).sample(30)

74211           functionalize
143269               perruche
6345                   amazed
16870                   axled
13385                arnberry
143098              permeable
93718     individualistically
196251               sybarist
162692         radiotelephone
137092                 pajock
89533            hyperkinetic
14221                  Asilus
186016     spectrobolographic
80644                gudesire
83266                 hebetic
101107             kanephoros
44589              courthouse
63182         epanisognathous
96666       interprotoplasmic
114701                  metad
27279                  bulbar
142138              perceiver
182970            sleepwalker
58807               dynamotor
45669             Cristivomer
17502           bacteriophage
104489            laryngotome
129940                   odso
7991             anchoretical
4343                ailantine
dtype: object

## 3.5 Only english words

In [54]:
len(set(words.words()))

235892

In [55]:
eng_words = [i.lower() for i in words.words()]
eng_words

['a',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic',
 'aaronical',
 'aaronite',
 'aaronitic',
 'aaru',
 'ab',
 'aba',
 'ababdeh',
 'ababua',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'abadite',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'abama',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'abanic',
 'abantes',
 'abaptiston',
 'abarambo',
 'abaris',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'abasgi',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'abassin',
 'abastardize',
 'abatable',
 'abate

In [56]:
len(set(eng_words))

234377

In [57]:
ps = PorterStemmer()
eng_words_stem = [ps.stem(i) for i in eng_words]
pd.Series(eng_words_stem).sample(30)

109607              macrandr
68000                   fake
233636             wreakless
155595            priestshir
154971           presignific
40093               commonli
214780              undertel
223078               unsolid
158327         prudentialist
166309               rejuven
122140                  naga
67248                 extern
32811                cercele
160068               punctur
89530          hyperkeratosi
6626             ametabolian
141291                peixer
69665                fidelio
77973     glossolabiopharyng
40603                concaus
2820            adrenalectom
60793               elotillo
208547                truthi
113152            megalopsia
70643              flangeway
21746                  bhaga
72415                foresin
182018            siphonogam
15229                atmolyz
83822             hemalbumen
dtype: object

In [58]:
len(set(eng_words_stem))

178311

In [59]:
lm = WordNetLemmatizer()
eng_words_lem = [lm.lemmatize(i) for i in eng_words]
pd.Series(eng_words_lem).sample(30)

116336          milliner
219309           unmated
8167        androphagous
65504            eunomia
117414      misprofessor
82610         hartmannia
215709         unemulous
200170    teratoblastoma
22476         binominous
74150             fumago
92746       incatenation
169849          ridicule
113113      megalocornea
134459         overcloud
118948        monopteral
57801           dregless
236615          surprise
120008     mountainwards
3375          aetosaurus
134480    overcompensate
45002         craniology
222616         unsevered
50679         demiourgoi
184787           solvend
54500      disappreciate
160447         pussyfoot
32994      certificative
75096           gambeson
226909           vanadyl
174326             scall
dtype: object

In [60]:
len(eng_words_lem)

236736

In [None]:
def process_text_4(doc, rejoin=True, lemm_or_stemm = "stem", list_rare_words=None, min_len_word=3, eng_words=None) : 
    """df v3 but with only valid english word"""
 
    # list_unique_words
    if not list_rare_words: 
        list_rare_words = []
        
    # lower and strip
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # remove stop words
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    # drop rare tokens
    non_rare_tokens_list = [w for w in cleaned_tokens_list if w not in list_rare_words]
    
    # keep only len word > N
    more_than_N =  [w for w in non_rare_tokens_list if len(w) >= 3 ]
    
    # keep only alpha not num
    alpha_num = [w for w in more_than_N if w.isalpha()]
    
    # stem or lem
    if lemm_or_stemm == "lem" : 
        trans = WordNetLemmatizer()
        trans_text = [trans.lemmatize(i) for i in alpha_num ]
    else : 
        trans = PorterStemmer()
        trans_text = [trans.stem(i) for i in alpha_num ]
        
    # in english 
    if eng_words :
        engl_text = [i for i in trans_text if i in eng_words]
    else :
        engl_text = trans_text
        
    #  return a list or a string
    if rejoin : 
        return " ".join(engl_text)
    
    return engl_text

In [None]:
corpus = process_text_4(raw_corpus, rejoin=False, list_rare_words=list_unique_words, eng_words=eng_words_stem)
corpus[:30]

In [None]:
len(set(corpus))

In [None]:
len(df)

In [None]:
list_unique_words[:30]

In [None]:
len(list_unique_words)

In [None]:
list_min_5_words[:30]

In [None]:
len(list_min_5_words)

In [None]:
corpus = process_text_4(raw_corpus, rejoin=False, list_rare_words=list_min_5_words, eng_words=eng_words_stem)
corpus[:30]

In [None]:
len(set(corpus))

In [None]:
tmp = pd.Series(corpus).value_counts()
tmp

In [None]:
# sns.barplot(tmp.index, tmp.values)

In [None]:
wordcloud = WordCloud(background_color = 'white', stopwords = [], max_words = 50).generate(" ".join(corpus))
plt.imshow(wordcloud)
plt.axis("off")
plt.show();

## by category

In [None]:
df_1 = df[df.target == 1]
df_0 = df[df.target == 0]

In [None]:
corpus_1 = " ".join(df_1.text)
corpus_0 = " ".join(df_0.text)

In [None]:
corpus_1 = process_text_4(corpus_1, rejoin=False, list_rare_words=list_min_5_words, eng_words=eng_words_stem)
corpus_0 = process_text_4(corpus_0, rejoin=False, list_rare_words=list_min_5_words, eng_words=eng_words_stem)

In [None]:
wordcloud = WordCloud(background_color = 'white', stopwords = [], max_words = 50).generate(" ".join(corpus_1))
plt.imshow(wordcloud)
plt.axis("off")
plt.show();

In [None]:
wordcloud = WordCloud(background_color = 'white', stopwords = [], max_words = 50).generate(" ".join(corpus_0))
plt.imshow(wordcloud)
plt.axis("off")
plt.show();

In [None]:
pd.Series(corpus_1).value_counts().head(20)

In [None]:
pd.Series(corpus_0).value_counts().head(20)

In [None]:
[i for i in pd.Series(corpus_1).value_counts().head(10).index if i in pd.Series(corpus_0).value_counts().head(10).index]

In [None]:
def process_text_5(doc, rejoin=True, lemm_or_stemm = "stem", list_rare_words=None, min_len_word=3, eng_words=None) : 
    """df v4 but exclude amp"""
 
    # list_unique_words
    if not list_rare_words: 
        list_rare_words = []
        
    # lower and strip
    doc = doc.lower().strip()
    
    # tokenize
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    # remove stop words
    cleaned_tokens_list = [w for w in raw_tokens_list if w not in stop_words]
    
    # drop rare tokens
    non_rare_tokens_list = [w for w in cleaned_tokens_list if w not in list_rare_words]
    
    # keep only len word > N
    more_than_N =  [w for w in non_rare_tokens_list if len(w) >= 3 ]
    
    # keep only alpha not num
    alpha_num = [w for w in more_than_N if w.isalpha()]
    
    # stem or lem
    if lemm_or_stemm == "lem" : 
        trans = WordNetLemmatizer()
        trans_text = [trans.lemmatize(i) for i in alpha_num ]
    else : 
        trans = PorterStemmer()
        trans_text = [trans.stem(i) for i in alpha_num ]
        
    # in english 
    if eng_words :
        engl_text = [i for i in trans_text if i in eng_words]
    else :
        engl_text = trans_text
       
    # amp
    engl_text = [i for i in engl_text if i!="amp"]
        
    #  return a list or a string
    if rejoin : 
        return " ".join(engl_text)
    
    return engl_text

In [None]:
def final_clean(doc) : 
    
    new_doc = process_text_5(doc,rejoin=True, stem_or_lem="lem", list_rare_words=list_min_5_words, eng_words=eng_words_lem)
    return  new_doc

In [None]:
df["clean_text"] = df.text.apply(final_clean)

In [None]:
df.isna().sum()

In [None]:
df

In [None]:
df.to_csv("data/cleaned/final_df.csv", index=False)