In [None]:
## Notebook introducing the words_n_fun module
# Copyright (C) <2018-2022>  <Agence Data Services, DSI Pôle Emploi>
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

Error: Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.8.0 64-bit.

<h1>Introduction</h1>

<h2>Tutorial notebook of the preprocessing module.</h2>

This notebook highlights how to use preprocessing features of the words_n_fun module on a given text corpus. 

**Given sample file** : Sample of training course descriptions (Excel file `essai1_ocr_formacodes_output.xlsx`, sheet `ocr_formacodes_output`)

**Notebook parts** :
<ul>
    <li>Import required modules 
            *NB* : Don't forget the GIT  url</li>
    <li>Import the input data :  load the csv file containing the input data  
            *NB* : Don't forget to provide the file path</li>
    <li>Preprocessing</li>
        <ul>
            <li>Extend or limit the list of stopwords with use case specific words  
                    *NB* : Following an ad hoc analysis we can add words to the stopwords list (words that will be removed from the corpus)</li>
            <li>Preprocessing on the corpus  
                    As an example, we display the 3 first rows from the corpus to get a before/after picture of the data</li>
            <li>Analysis of each preprocessing step on a sample document 
                    As an illustration, we display the resulting text after each step
                </li>
        </ul>
</ul>

**GIT** : TO BE DEFINED

**Entrypoint** : words_n_fun.preprocessing.api

# Import required modules 

In [1]:
#import the preprocessing module :
#---------------------------------------
import sys
import os

from words_n_fun.preprocessing import api as preprocessing

  from pandas import Panel


In [2]:
#import pandas 
#---------------------------------------
import pandas as pd

# Import data to process

#### Import the sample dataset

Package xlrd is required -> pip install xlrd, then restart this notebook

In [3]:
# NB: adjust `dir_path` if the sample workbook is not stored in the working directory.
# The former `os.path.realpath('__file__')` passed the *string* '__file__', not the
# module attribute, so it only ever resolved to the current working directory.
# Ask for that directory explicitly to make the intent clear.
dir_path = os.getcwd()
file_path = os.path.join(dir_path, "essai1_ocr_formacodes_output.xlsx")
df = pd.read_excel(file_path, sheet_name='ocr_formacodes_output')

In [4]:
# Preview of the dataset: first 3 rows
df.head(n=3)

Unnamed: 0,id,titre,description,catégories,formacode 1,formacode 2,formacode 3,formacode 4,formacode 5,formacode 6,formacode 7,formacode 8,formacode 9,formacode 10
0,19980,Apprenez à programmer en C !,Le C est un langage incontournable qui en a in...,Développement pour l'entreprise,30882.0,30854.0,31088.0,,,,,,,
1,26832,Apprenez à programmer en Java,Java est un langage extrêmement populaire util...,Développement pour l'entreprise,30802.0,30854.0,31088.0,,,,,,,
2,43538,Reprenez le contrôle à l'aide de Linux !,Linux est un système d'exploitation qui fait t...,Systèmes et réseaux,31021.0,31032.0,31054.0,,,,,,,


In [5]:
# Dimensions of the dataset, as a (number of rows, number of columns) tuple
df.shape

(248, 14)

#### The preprocessing will be applied to the "description" column

In [6]:
# Text column on which the preprocessing is applied
docs = df.loc[:, "description"]

# Preprocessing

<p>Here we specify the preprocessing pipeline that we'll use.</p>
<p>These transformations will be applied in the same order in which they are specified, but 
    we can choose to apply only a subset of them (be mindful of the relevance of each step) :</p>
<ul>
    <li>**remove_non_string** : Removes non string characters</li>
    
    <li>**to_lower_except_singleletters** : Lowercase transformation except for single-letter tokens (such as the R or C in "R language" or "C language")</li>
    <li>**remove_punct** : Returns a text without any punctuation</li>
    <li>**remove_stopwords** :  returns a text without stopwords</li>
</ul>

### Extending the stopwords list

Before running the preprocessing pipeline it is advised to **extend the stopwords list with words that are irrelevant
for this use case**

In [25]:
from tqdm import tqdm
# Registers tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
# NOTE(review): presumably required for the progress bars displayed by the
# preprocessing steps below — confirm against the words_n_fun documentation.
tqdm.pandas()

In [26]:
# Count word occurrences over the whole corpus.
# The text is tidied up first (non-string removal, lowercasing, punctuation and
# stopword removal) before being handed to listing_count_words.
pipeline = [
    'remove_non_string',
    'to_lower_except_singleletters',
    'remove_punct',
    'remove_stopwords',
]
docs_preprocess_count = preprocessing.preprocess_pipeline(docs, pipeline=pipeline)
count_words = preprocessing.listing_count_words(docs_preprocess_count)

# Preview of the resulting DataFrame containing the words and their frequency
count_words.head(3)

[2019-09-20 10:33:47] - INFO: Preprocessing: étape remove_non_string


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 248/248 [00:00<00:00, 247958.85it/s]


[2019-09-20 10:33:47] - INFO: Preprocessing: étape to_lower_except_singleletters


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 248/248 [00:00<00:00, 61993.41it/s]


[2019-09-20 10:33:47] - INFO: Preprocessing: étape remove_punct
[2019-09-20 10:33:47] - INFO: Preprocessing: étape remove_stopwords


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 689/689 [00:00<00:00, 172231.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157/157 [00:00<00:00, 156973.95it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 689/689 [00:00<00:00, 229646.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157/157 [00:00<00:00, 78496.33it/s]


Unnamed: 0,word,count
0,0,1
1,1,1
2,10,3


In [27]:
# Show the 10 words with the highest frequency
count_words.sort_values(by="count", ascending=False).head(10)

Unnamed: 0,word,count
428,cours,92
528,découvrez,43
1846,web,42
509,données,37
440,créer,33
117,application,29
1371,projet,27
131,apprenez,26
949,langage,25
118,applications,22


### Preprocessing on the whole corpus

In [None]:
# Work on a small sample: the first 10 descriptions (positional selection)
docs = df["description"].iloc[:10]

<p>Here we define the desired pipeline.</p>
<p>Transformations are applied in the same order in which they are specified :</p>
<ul>
    <li>**remove_non_string** : Removes non string characters</li>  
    <li>**get_true_spaces** : Replaces all white spaces with a single space</li>
        <li>**to_lower_except_singleletters** : Lower case transformation except for single letters (such as language R or language C)</li>
        
    <li>**pe_matching** : Basic one to one substitution 
        *Example* : "permis b" (french driving licence) => "permisb"</li>
    <li>**remove_gender_synonyms** : Finds occurrences where both the male and female versions of a single word are used (e.g. Serveur/Serveuse) and keeps only the male version (language convention)</li>
        
    <li>**remove_punct_except_parenthesis** :  Replaces all non-alphanumeric characters, except parentheses, with whitespace</li>
    <li>**remove_numeric** : Returns a text without any numerical character</li>
    <li>**remove_stopwords** : Returns a text without stopwords</li>
    <li>**lemmatize** or **stemmatize** : Text lemmatization or stemming</li>
    <li>**remove_accents** : Returns a text without any accent</li>
    <li>**trim_string** : Replaces multiple white spaces by a single one</li>
    <li>**remove_leading_and_ending_spaces** : Removes leading and trailing white spaces</li>
</ul>

In [None]:
# Ordered list of preprocessing steps; they are applied from first to last
pipeline = [
    'remove_non_string',
    'get_true_spaces',
    'to_lower_except_singleletters',
    'pe_matching',
    'remove_gender_synonyms',
    'remove_punct_except_parenthesis',
    'remove_numeric',
    'remove_stopwords',
    'stemmatize',
    'remove_accents',
    'trim_string',
    'remove_leading_and_ending_spaces',
]

In [None]:
# Apply the pipeline to the sampled documents, then preview the first 3 results
docs_preprocess = preprocessing.preprocess_pipeline(docs, pipeline=pipeline)
docs_preprocess.head(n=3)

In [None]:
# Before/after comparison for the first 4 documents of the sample
for i in range(4):
    print("Document index n°", i, "before preprocessing :")
    print("'", docs[i], "'")
    print("  and after preprocessing ")
    print("'", docs_preprocess[i], "'")

###  Diving into each single step

We only consider the first row of our initial dataset

In [None]:
# Wrap the first document in a one-element Series so that it can be passed
# to the step-by-step helper below, then show its content
text = pd.Series(docs[0])
print(text.values)

In [None]:
# Steps applied one by one by preprocess_pipeline_detail (used as its default pipeline)
pipeline = ['notnull', 'remove_non_string', 'to_lower_except_singleletters', 'pe_matching', 'trim_string',
            'remove_gender_synonyms', 'remove_punct_except_parenthesis', 'remove_numeric',
            'remove_stopwords', 'lemmatize', 'remove_accents']


def preprocess_pipeline_detail(text, pipeline=pipeline, usage=None):
    """Apply each preprocessing step in turn and print the text after every step.

    Args:
        text: pandas Series holding the document(s) to transform; its ``.values``
            are printed before the first step and after each applied step.
        pipeline: ordered iterable of step names to apply.
        usage: optional mapping from step name to the callable implementing it.
            Defaults to ``preprocessing.USAGE`` when omitted.

    Returns:
        None. The function only prints the intermediate results.
    """
    if usage is None:
        usage = preprocessing.USAGE
    print ("Texte initial")
    print (text.values)
    for item in pipeline:
        if item not in usage:
            # Unknown steps used to be dropped silently, which made a typo in the
            # pipeline look like a step that "did nothing". Report them instead.
            print("\n")
            print("%s -> skipped: not a known preprocessing step" % item)
            continue
        print("\n")
        print(str(item))
        text = usage[item](text)
        print (text.values)

In [None]:
# Print the sample document after each step of the detailed pipeline
preprocess_pipeline_detail(text, pipeline=pipeline)