# 1. Preliminary

## 1.1 Context

* We will analyze a very well-known NLP dataset: disaster tweets


* It comes from a Kaggle competition, which offers a simple but good-quality text dataset, ideal for cutting your teeth on NLP


* The dataset is available here: <https://www.kaggle.com/competitions/nlp-getting-started/data>


* Please use the **train** dataset


* In this 1st part we are going to clean the text

## 1.2 Requirements

You have to install:

* pandas
* numpy
* matplotlib
* seaborn


* nltk
* wordcloud
* pillow


* pandarallel

## 1.3 Imports

In [None]:
# builtin
import os, sys, time, random


# data
import pandas as pd
import numpy as np


# NLP
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# import spacy


# viz
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image

# import plotly as px

from pandarallel import pandarallel

## 1.4 Downloads and options

In [None]:
# download

"""
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
"""

In [None]:
# pandarallel

pandarallel.initialize(progress_bar=True, 
                       nb_workers=6, 
                       # verbose=1
                      )

In [None]:
# init sns

sns.set()

In [None]:
# init pandarallel

# pandarallel.initialize()

## 1.5 Loading data

In [None]:
# our file

data = "./data/cleaned/"
os.listdir(data)

In [None]:
# load dataframe

fn = data + 'df_cleaned.csv'
df = pd.read_csv(fn)
df.head()

In [None]:
df.shape

In [None]:
# keep a 30% random sample so the rest of the notebook runs faster;
# the fixed seed makes the sample, and everything derived from it, reproducible
# NOTE(review): every later step (rare-word lists, final_df.csv) only sees this
# sample -- confirm this is intended for the final export
df = df.sample(frac=0.3, random_state=42)


In [None]:
df.shape

# 2. Work on a specific document

In [None]:
# select a random document

doc = df.text.sample(1)
doc = doc.values[0]
doc

## 2.1 Lower

In [None]:
# lower

doc = doc.lower()
doc

## 2.2 Tokenization

In [None]:
# tokenize

tokens = word_tokenize(doc)
tokens

In [None]:
len(tokens)

In [None]:
len(set(tokens))

In [None]:
def display_tokens_info(tokens):
    """Print a short summary of a list of tokens.

    positional arguments :
    -----------------------
    tokens : list : the tokens to summarize

    return :
    ------------------------
    None : the summary is printed (total count, distinct count, first 30 tokens)
    """

    n_total = len(tokens)
    n_distinct = len(set(tokens))

    # message text is in French: "corpus size ..., number of unique tokens ..."
    print(f"taille corpus {n_total}, nb tokens uniques {n_distinct}")

    # preview of the first 30 tokens
    print(tokens[:30])

In [None]:
# another tokenizer

tokens = wordpunct_tokenize(doc)
display_tokens_info(tokens)

## 2.3 Stopwords

In [None]:
# stop_words

stop_words = set(stopwords.words('english'))
print(stop_words)

In [None]:
tokens = [w for w in tokens if w not in stop_words]
display_tokens_info(tokens)

In [None]:
# another tokenizer

tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(doc)
display_tokens_info(tokens)

In [None]:
# remove stopwords

tokens = [w for w in tokens if w not in stop_words]
display_tokens_info(tokens)

## 2.4 First cleaning function

In [None]:
def process_text_1(doc, rejoin=False):
    """Lower-case a document, split it into word tokens and drop the stop words.

    positional arguments :
    -----------------------
    doc : str : the document (aka a text in str format) to process

    opt args :
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens

    return :
    ------------------------
    a string (if rejoin is True) or a list of tokens
    """

    # normalise case and trim surrounding whitespace
    text = doc.lower().strip()

    # a token is a run of word characters; tokens found in the
    # module-level stop_words are skipped
    kept_tokens = []
    for token in RegexpTokenizer(r"\w+").tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens) if rejoin else kept_tokens

In [None]:
tokens = process_text_1(doc)
display_tokens_info(tokens)

# 3. Working on the entire corpus

## 3.1 Build raw corpus

In [None]:
# join all documents into one big string
# (a space separator is required: without it the last word of a document
#  is glued to the first word of the next one and becomes a bogus token)

raw_corpus = " ".join(df.text.values)
raw_corpus[:1_000]

In [None]:
# process the corpus

corpus = process_text_1(raw_corpus)
display_tokens_info(corpus)

In [None]:
# value counts

tmp = pd.Series(corpus).value_counts()
tmp

In [None]:
# visualization

# sns.barplot(x=tmp.index, y=tmp.values)

In [None]:
# 30 most common tokens

tmp.head(30)

In [None]:
# 30 least common tokens

tmp.tail(30)

In [None]:
tmp.describe()

In [None]:
# sns.displot(tmp)

In [None]:
# sns.boxplot(tmp)

## 3.2 List rare tokens

In [None]:
# words that appear only once --> not useful

tmp = pd.Series(corpus).value_counts()
list_unique_words = tmp[tmp==1]
list_unique_words[:30]

In [None]:
len(list_unique_words)

In [None]:
list_unique_words = list(list_unique_words.index)
list_unique_words[:30]

In [None]:
# save it for later

tmp = pd.DataFrame({"words" : list_unique_words})
tmp.to_csv("data/cleaned/unique_words.csv", index=False)

In [None]:
# same for words that appear at most 5 times

tmp = pd.Series(corpus).value_counts()
list_min_5_words = tmp[tmp<=5]
list_min_5_words[:30]

In [None]:
len(list_min_5_words)

In [None]:
# save it 

list_min_5_words = list(list_min_5_words.index)
tmp = pd.DataFrame({"words" : list_min_5_words})
tmp.to_csv("data/cleaned/min_5_words.csv", index=False)

In [None]:
# same for words that appear at most 10 times

tmp = pd.Series(corpus).value_counts()
list_min_10_words = tmp[tmp<=10]
list_min_10_words[:30]

In [None]:
len(list_min_10_words)

In [None]:
# save it 

list_min_10_words = list(list_min_10_words.index)
tmp = pd.DataFrame({"words" : list_min_10_words})
tmp.to_csv("data/cleaned/min_10_words.csv", index=False)

## 3.3 2nd Cleaning function

In [None]:
def process_text_2(doc,
                   rejoin=False,
                   list_rare_words=None,
                   min_len_word=3,
                   force_is_alpha=True):
    """cf process_text_1 but with list_rare_words, min_len_word, and force_is_alpha

    positional arguments :
    -----------------------
    doc : str : the document (aka a text in str format) to process

    opt args :
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens
    list_rare_words : list : a list of rare words to exclude (None or empty : exclude nothing)
    min_len_word : int : the minimum length of words to not exclude
    force_is_alpha : bool : if True, keep only purely alphabetic tokens
                            (drops e.g. tokens containing a digit or an underscore)

    return :
    ------------------------
    a string (if rejoin is True) or a list of tokens
    """

    # a set makes each "is this word rare ?" test O(1) instead of a scan of the whole list
    rare_words = set(list_rare_words) if list_rare_words else set()

    # lower
    doc = doc.lower().strip()

    # tokenize : a token is a run of word characters
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)

    # single pass : drop stop words (module-level stop_words), rare words,
    # too short words and, if asked, non alphabetic tokens
    tokens = [
        w for w in raw_tokens_list
        if w not in stop_words
        and w not in rare_words
        and len(w) >= min_len_word
        and (not force_is_alpha or w.isalpha())
    ]

    # manage return type
    if rejoin:
        return " ".join(tokens)

    return tokens

In [None]:
display_tokens_info(corpus)

In [None]:
len(set(corpus))

In [None]:
corpus = process_text_2(raw_corpus, 
                        list_rare_words=list_unique_words, 
                        rejoin=False)
display_tokens_info(corpus)

In [None]:
len(set(corpus))

## 3.4 Stem and Lem

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item: its dictionary form, called the lemma. Lemmatization is similar to stemming, but it brings context to the words: it relies on a vocabulary (and on the part of speech of the word), so the result is always a real word (e.g. *dogs* → *dog*).

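Stemming is the process of reducing the morphological variants of a word to a common root/base form (the stem), usually by stripping suffixes; the stem is not necessarily a real word. Stemming programs are commonly referred to as stemming algorithms or stemmers.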

In [None]:
doc = "I have 3 dogs, they was all black. Now they are all white but one of my dog is my favorite"

In [None]:
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(doc.lower())
print(tokens)

In [None]:
trans = PorterStemmer()
trans_text = [trans.stem(i) for i in tokens ]
print(trans_text)

In [None]:
trans = WordNetLemmatizer()
trans_text = [trans.lemmatize(i) for i in tokens ]
print(trans_text)

## 3.5 3rd cleaning function

In [None]:
def process_text_3(doc,
                   rejoin=False,
                   lemm_or_stemm="stem",
                   list_rare_words=None,
                   min_len_word=3,
                   force_is_alpha=True):
    """cf process_text_2 but with stemming or lemmatization

    positional arguments :
    -----------------------
    doc : str : the document (aka a text in str format) to process

    opt args :
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens
    lemm_or_stemm : str : if "lem" lemmatize the tokens, any other value stems them
    list_rare_words : list : a list of rare words to exclude (None or empty : exclude nothing)
    min_len_word : int : the minimum length of words to not exclude
    force_is_alpha : bool : if True, keep only purely alphabetic tokens
                            (drops e.g. tokens containing a digit or an underscore)

    return :
    ------------------------
    a string (if rejoin is True) or a list of tokens
    """

    # a set makes each "is this word rare ?" test O(1) instead of a scan of the whole list
    rare_words = set(list_rare_words) if list_rare_words else set()

    # lower
    doc = doc.lower().strip()

    # tokenize : a token is a run of word characters
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)

    # single pass : drop stop words (module-level stop_words), rare words,
    # too short words and, if asked, non alphabetic tokens
    alpha_tokens = [
        w for w in raw_tokens_list
        if w not in stop_words
        and w not in rare_words
        and len(w) >= min_len_word
        and (not force_is_alpha or w.isalpha())
    ]

    ###########################################################
    ###########################################################

    # stem or lem : the filters above run on the raw tokens, the
    # transformation is applied last
    if lemm_or_stemm == "lem":
        transform = WordNetLemmatizer().lemmatize
    else:
        transform = PorterStemmer().stem
    trans_text = [transform(w) for w in alpha_tokens]

    ###########################################################
    ###########################################################

    # manage return type
    if rejoin:
        return " ".join(trans_text)

    return trans_text
    

In [None]:
# +/- 3s

corpus = process_text_3(raw_corpus, rejoin=False, list_rare_words=list_unique_words)
pd.Series(corpus).sample(30)

In [None]:
len(set(corpus))

In [None]:
pd.Series( words.words()).sample(30)

## 3.5 Only English words

In [None]:
len(set(words.words()))

In [None]:
eng_words = [i.lower() for i in words.words()]
eng_words[:30]

In [None]:
len(set(eng_words))

In [None]:
ps = PorterStemmer()
eng_words_stem = [ps.stem(i) for i in eng_words]
display_tokens_info(eng_words_stem)

In [None]:
len(set(eng_words_stem))

In [None]:
lm = WordNetLemmatizer()
eng_words_lem = [lm.lemmatize(i) for i in eng_words]
display_tokens_info(eng_words_lem)

In [None]:
# number of distinct lemmatized words (same measure as for the stemmed list)
len(set(eng_words_lem))

## 3.6 4th cleaning function

In [None]:
def process_text_4(doc,
                   rejoin=False,
                   lemm_or_stemm="stem",
                   list_rare_words=None,
                   min_len_word=3,
                   force_is_alpha=True,
                   eng_words=None):
    """cf process_text_3 but with selection of only english words

    positional arguments :
    -----------------------
    doc : str : the document (aka a text in str format) to process

    opt args :
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens
    lemm_or_stemm : str : if "lem" lemmatize the tokens, any other value stems them
    list_rare_words : list : a list of rare words to exclude (None or empty : exclude nothing)
    min_len_word : int : the minimum length of words to not exclude
    force_is_alpha : bool : if True, keep only purely alphabetic tokens
                            (drops e.g. tokens containing a digit or an underscore)
    eng_words : list : list of english words to keep (None or empty : keep everything).
                       It is compared with the tokens AFTER stemming / lemmatization,
                       so it must hold stemmed / lemmatized words as well

    return :
    ------------------------
    a string (if rejoin is True) or a list of tokens
    """

    # sets make every membership test O(1); with a plain list, each token
    # triggers a scan of the whole vocabulary
    rare_words = set(list_rare_words) if list_rare_words else set()
    english = set(eng_words) if eng_words else None

    # lower
    doc = doc.lower().strip()

    # tokenize : a token is a run of word characters
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)

    # single pass : drop stop words (module-level stop_words), rare words,
    # too short words and, if asked, non alphabetic tokens
    alpha_tokens = [
        w for w in raw_tokens_list
        if w not in stop_words
        and w not in rare_words
        and len(w) >= min_len_word
        and (not force_is_alpha or w.isalpha())
    ]

    # stem or lem
    if lemm_or_stemm == "lem":
        transform = WordNetLemmatizer().lemmatize
    else:
        transform = PorterStemmer().stem
    trans_text = [transform(w) for w in alpha_tokens]

    ###########################################################
    ###########################################################

    # in english : keep only the transformed tokens found in the vocabulary
    if english:
        engl_text = [w for w in trans_text if w in english]
    else:
        engl_text = trans_text

    ###########################################################
    ###########################################################

    #  return a list or a string
    if rejoin:
        return " ".join(engl_text)

    return engl_text
    

In [None]:
# +/- 50s
corpus = process_text_4(raw_corpus, 
                        rejoin=False, 
                        list_rare_words=list_unique_words, 
                        eng_words=eng_words_stem)
display_tokens_info(corpus)

In [None]:
len(set(corpus))

In [None]:
len(df)

In [None]:
list_unique_words[:30]

In [None]:
len(list_unique_words)

In [None]:
list_min_5_words[:30]

In [None]:
len(list_min_5_words)

In [None]:
# +/- 30 sec
corpus = process_text_4(raw_corpus, 
                        rejoin=False, 
                        list_rare_words=list_min_5_words, 
                        eng_words=eng_words_stem)
display_tokens_info(corpus)

In [None]:
len(set(corpus))

In [None]:
tmp = pd.Series(corpus).value_counts()
tmp

In [None]:
# sns.barplot(tmp.index, tmp.values)

## 3.7 Wordcloud

In [None]:
wordcloud = WordCloud(background_color='white', 
                      stopwords=[], 
                      max_words=50).generate(" ".join(corpus))
plt.imshow(wordcloud)
plt.axis("off")
plt.show();

# 4. Divide the corpus

## 4.1 Separate 0 / 1

In [None]:
# df_0 df_1

df_1 = df[df.target == 1]
df_0 = df[df.target == 0]

In [None]:
df_0.head()

In [None]:
df_1.head()

In [None]:
corpus_1 = " ".join(df_1.text)
corpus_0 = " ".join(df_0.text)

In [None]:
corpus_1[:1000]

In [None]:
corpus_0[:1000]

## 4.2 Process both of them

In [None]:
corpus_1 = process_text_4(corpus_1, 
                          rejoin=False, 
                          list_rare_words=list_min_5_words, 
                          eng_words=eng_words_stem)

corpus_0 = process_text_4(corpus_0, 
                          rejoin=False, 
                          list_rare_words=list_min_5_words, 
                          eng_words=eng_words_stem)

In [None]:
wordcloud = WordCloud(background_color='white', 
                      stopwords=[], 
                      max_words = 50).generate(" ".join(corpus_1))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
wordcloud = WordCloud(background_color='white', 
                      stopwords=[], max_words=50).generate(" ".join(corpus_0))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
pd.Series(corpus_1).value_counts().head(20)

In [None]:
pd.Series(corpus_0).value_counts().head(20)

In [None]:
# tokens ranked in the top n of BOTH classes
n = 10
top_n_1 = pd.Series(corpus_1).value_counts().head(n).index
# computed once (not once per candidate token), as a set for direct lookups
top_n_0 = set(pd.Series(corpus_0).value_counts().head(n).index)
doublons = [i for i in top_n_1 if i in top_n_0]
doublons

In [None]:
# tokens ranked in the top n of BOTH classes
n = 20
top_n_1 = pd.Series(corpus_1).value_counts().head(n).index
# computed once (not once per candidate token), as a set for direct lookups
top_n_0 = set(pd.Series(corpus_0).value_counts().head(n).index)
doublons = [i for i in top_n_1 if i in top_n_0]
doublons

## 4.3 5th cleaning function

In [None]:
def _as_lookup_set(words):
    """Return words as a set (an empty one if words is None or empty) for O(1) membership tests."""

    if not words:
        return set()

    # already a set : reuse it instead of copying it on every call
    if isinstance(words, (set, frozenset)):
        return words

    return set(words)


def process_text_5(doc,
                   rejoin=True,
                   lemm_or_stemm="stem",
                   list_rare_words=None,
                   min_len_word=3,
                   eng_words=None,
                   extra_words=None):
    """cf process_text_4 but also exclude an extra list of words

    Unlike process_text_4, there is no force_is_alpha option : non alphabetic
    tokens are always dropped, and rejoin defaults to True.

    positional arguments :
    -----------------------
    doc : str : the document (aka a text in str format) to process

    opt args :
    -----------------------
    rejoin : bool : if True return a string else return the list of tokens
    lemm_or_stemm : str : if "lem" lemmatize the tokens, any other value stems them
    list_rare_words : list : a list of rare words to exclude (None or empty : exclude nothing)
    min_len_word : int : the minimum length of words to not exclude
    eng_words : list : list of english words to keep (None or empty : keep everything).
                       It is compared with the tokens AFTER stemming / lemmatization
    extra_words : list : extra words to exclude (None or empty : exclude nothing).
                         It is compared with the tokens AFTER stemming / lemmatization
                         as well

    list_rare_words, eng_words and extra_words may also be given as sets, in which
    case they are used as they are (no conversion on each call).

    return :
    ------------------------
    a string (if rejoin is True) or a list of tokens ; in both cases the
    extra_words are removed
    """

    # sets make every membership test O(1) instead of a scan of a whole list
    rare_words = _as_lookup_set(list_rare_words)
    english = _as_lookup_set(eng_words)
    extra = _as_lookup_set(extra_words)

    # lower and strip
    doc = doc.lower().strip()

    # tokenize : a token is a run of word characters
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)

    # single pass : drop stop words (module-level stop_words), rare words,
    # words shorter than min_len_word and non alphabetic tokens
    alpha_tokens = [
        w for w in raw_tokens_list
        if w not in stop_words
        and w not in rare_words
        and len(w) >= min_len_word
        and w.isalpha()
    ]

    # stem or lem
    if lemm_or_stemm == "lem":
        transform = WordNetLemmatizer().lemmatize
    else:
        transform = PorterStemmer().stem
    trans_text = [transform(w) for w in alpha_tokens]

    # in english : keep only the transformed tokens found in the vocabulary
    if english:
        trans_text = [w for w in trans_text if w in english]

    ##########################################
    ##########################################

    # drop extra_words tokens
    final = [w for w in trans_text if w not in extra]

    ##########################################
    ##########################################

    #  return a list or a string (the extra words are removed in both cases)
    if rejoin:
        return " ".join(final)

    return final

# 5. Final clean

In [None]:
def final_clean(doc):
    """Apply the final cleaning to one document and return it as a string.

    The document is handed to process_text_5 with stemming, the module-level
    list_min_5_words as rare words, eng_words_stem as the vocabulary to keep
    and doublons as extra words to drop.

    positional arguments :
    -----------------------
    doc : str : the document (aka a text in str format) to process

    return :
    ------------------------
    the cleaned document, as returned by process_text_5 with rejoin=True
    """

    settings = {
        "rejoin": True,
        "lemm_or_stemm": "stem",
        "list_rare_words": list_min_5_words,
        "eng_words": eng_words_stem,
        "extra_words": doublons,
    }

    return process_text_5(doc, **settings)

In [None]:
# df["clean_text"] = df.text.apply(final_clean)

df["clean_text"] = df.text.parallel_apply(final_clean)

In [None]:
df.sample(20)

In [None]:
sub_df = df[df.target == 1]
for idx, ser in sub_df.sample(10).iterrows() : 
    print(ser['text'])
    print(ser['clean_text'])
    print()

In [None]:
sub_df = df[df.target == 0]
for idx, ser in sub_df.sample(10).iterrows() : 
    print(ser['text'])
    print(ser['clean_text'])
    print()

In [None]:
df.isna().mean()

In [None]:
df.to_csv("data/cleaned/final_df.csv", index=False)