# 1) Data cleaning

### Read files into dataframe

In [53]:
import pandas as pd 
import os
import glob

path = os.getcwd()
folder_name = "data" 
xlsx_files = glob.glob(os.path.join(path+"/"+folder_name, "*.xlsx"))

li = []
col_names = ['score', 'username', 'review_fr', 'company', 'product', 'type', 'date_publication', 'date_exp', 'review_en', 'review_corrected_fr', 'review_corrected_en']
for f in xlsx_files:
    df = pd.read_excel(f, index_col=None, header=0, names=col_names)
    li.append(df)

dataframe = pd.concat(li, axis=0, ignore_index=True)
dataframe.head()


Unnamed: 0,score,username,review_fr,company,product,type,date_publication,date_exp,review_en,review_corrected_fr,review_corrected_en
0,,estelle-51227,j'ai quitté mon ancien contrat d'assurance che...,Néoliane Santé,sante,test,12/01/2017,01/01/2017,I left my former insurance contract at General...,,
1,,leadum-51107,j'ai souscrit à cette mutuelle l'année dernier...,Néoliane Santé,sante,test,09/01/2017,01/01/2017,I subscribed to this mutual a year last year a...,,
2,,enora-49520,"Impossible d'avoir le bon service , ils raccro...",Néoliane Santé,sante,test,24/11/2016,01/11/2016,"Impossible to have the right service, they han...",,
3,,bea-139295,Génération est une mutuelle très chère pour un...,Génération,sante,test,09/11/2021,01/11/2021,Generation is a very expensive mutual for a re...,,
4,,anna-139192,je viens d apprendre que je suis radié... j ap...,Génération,sante,test,08/11/2021,01/11/2021,I just learned that I am struck off ... I call...,,


### Data cleaning

In [54]:
print(dataframe.isna().sum())
total_rows = dataframe.shape[0]
print(f"Total number of rows: {total_rows}")

score                  10331
username                   1
review_fr                  0
company                    0
product                    0
type                       0
date_publication           0
date_exp                   0
review_en                  2
review_corrected_fr    34000
review_corrected_en    34004
dtype: int64
Total number of rows: 34435


Handling missing values

In [None]:
df.dropna(subset=['score', 'review_en'], inplace=True)

#remove columns not needed
df.drop(columns=['username', 'review_fr', 'company', 'product', 'type', 'date_publication', 'date_exp', 'review_corrected_fr', 'review_corrected_en'], inplace=True)

Lowercasing

In [57]:
def lowercase(frame):
    frame['review_en'] = frame['review_en'].map(lambda r: r.lower())

lowercase(df)

Remove punctuation

In [58]:
def remove_punctuation(df):
    df['review_en'] = df['review_en'].str.replace('[^\w\s]','')

remove_punctuation(df)

Spelling correction

In [59]:
# # pip install pyspellchecker
# from spellchecker import SpellChecker
# def spelling_correction(df):
#     spell = SpellChecker()
#     df['review_en'] = [[spell.correction(word) for word in row] for row in df['review_en'].str.split(" ").to_list()]
#     df['review_en'] = df['review_en'].apply(lambda x: " ".join(x))

# spelling_correction(df)

Tokenization

In [60]:
import nltk

def tokenize(df):
    # Adds a column 'tokenized_rev' that contains for each entry a python list of words contained in the review
    df['tokenized_rev'] = df.apply(lambda row: nltk.word_tokenize(row['review_en']), axis=1)

tokenize(df)

Stopwords removal

In [None]:
from nltk.corpus import stopwords

def remove_stopwords(df):
    stop = stopwords.words('english')
    df['review_without_stopwords'] = df['review_en'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

remove_stopwords(df)

Unnamed: 0,score,review_en,tokenized_rev,review_without_stopwords
0,4,new member and having been in contact with pop...,"[new, member, and, having, been, in, contact, ...","new member contact pope, completely satisfied ..."
1,1,"latable telephone service, impossibility of ha...","[latable, telephone, service, ,, impossibility...","latable telephone service, impossibility answe..."
2,5,i am very satisfied with all the services offe...,"[i, am, very, satisfied, with, all, the, servi...",satisfied services offered gmf. customer many ...
3,4,i am satisfied the price suits me very well i ...,"[i, am, satisfied, the, price, suits, me, very...",satisfied price suits well knew april thanks w...
4,4,"..i.... . nickel for the moment, no sinister w...","[.., i, ...., ., nickel, for, the, moment, ,, ...","..i.... . nickel moment, sinister see use..l h..."


Lemmatization