In [None]:
import pandas as pd
import pickle

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
df_raw = pd.read_csv('../data/raw/raw_x_y.csv')
df_raw.head()

Unnamed: 0,designation,description,productid,imageid,prdtypecode
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,10
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,2280
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,50
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,1280
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,2705


In [7]:
# Text preprocessing of the raw dataframe


# Merge 'designation' and 'description', handling NaN in 'description'
df_raw['text'] = df_raw['designation'] + ' ' + df_raw['description'].fillna('')

# Step 1: Remove special characters but keep letters, numbers, and spaces
df_raw['cleaned_text'] = df_raw['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x) if pd.notnull(x) else ''))

# Step 2: Convert cleaned text to lower case
df_raw['cleaned_text'] = df_raw['cleaned_text'].apply(lambda x: x.lower() if isinstance(x, str) else x)

# Get stopwords for English and French
stop_words_eng = set(stopwords.words('english'))
stop_words_fr = set(stopwords.words('french'))

# Set custom stopwords 
custom_stopwords = set(["chez", "der", "plu", "haut", "peut", "non", "100", "produit", "lot", "tout", "cet", "cest", "sou", "san"])

# Combine both sets of stopwords
stop_words = stop_words_eng.union(stop_words_fr).union(custom_stopwords)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize tokens
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

# Function to remove stopwords from tokens
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

# Tokenize, lowercase, lemmatize, and remove stopwords
df_raw['lemmatized_text'] = df_raw['cleaned_text'].apply(
    lambda x: remove_stopwords(lemmatize_tokens(word_tokenize(x.lower())))
)

In [8]:
df_raw.head()

Unnamed: 0,designation,description,productid,imageid,prdtypecode,text,cleaned_text,lemmatized_text
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,olivia personalisiertes notizbuch 150 seiten ...,"[olivia, personalisiertes, notizbuch, 150, sei..."
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,journal des arts le n 133 du 28092001 lart et...,"[journal, art, 133, 28092001, lart, marche, sa..."
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,grand stylet ergonomique bleu gamepad nintendo...,"[grand, stylet, ergonomique, bleu, gamepad, ni..."
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,peluche donald europe disneyland 2000 marion...,"[peluche, donald, europe, disneyland, 2000, ma..."
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,2705,La Guerre Des Tuques Luc a des id&eacute;es de...,la guerre des tuques luc a des ideacutees de g...,"[guerre, tuques, luc, ideacutees, grandeur, ve..."


In [9]:
df_processed = df_raw.drop(['designation', 'description', 'productid', 'text'], axis=1)

# I keep both cleaned_text and lemmatized_text as I want to experiment a little further with the best option

In [None]:
# I save the entire df_processed to pkl
df_processed.to_pickle('../data/processed/df_text_processed.pkl')