In [95]:
import re
import pickle
import nltk

import numpy as np
import pandas as pd

from pathlib import Path
from abc import ABC, abstractmethod
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# Fetch the NLTK stopword corpus used by the text-cleaning step.
# The check is an explicit `if` rather than an `assert`, so the download still
# runs (and a failure is still reported) when Python is started with -O, which
# strips assert statements together with their side effects.
if not nltk.download('stopwords'):
    raise RuntimeError("nltk.download('stopwords') failed")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fmore\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Sample frame used only to try out the transformers defined below;
# reading the first 10,000 rows is enough for that.
path_src_dataset = Path("./data/src/spotify_tracks.csv")

df = pd.read_csv(path_src_dataset, nrows=10_000)

In [5]:
class Transformer(ABC, BaseEstimator, TransformerMixin):
    """Abstract base class for the notebook's cleaning / processing steps.

    ``__init__``, ``fit`` and ``transform`` are all declared abstract, so a
    concrete subclass has to override the three of them before it can be
    instantiated.
    """

    @abstractmethod
    def __init__(self):
        """Forward to the next ``__init__`` in the MRO; subclasses must override."""
        super().__init__()

    @abstractmethod
    def fit(self, X: pd.DataFrame, y=None):
        """Learn the step's state from ``X``; no default behaviour is provided."""

    @abstractmethod
    def transform(self, X: pd.DataFrame):
        """Apply the step to ``X``; no default behaviour is provided."""

# Template to copy when writing a new cleaning / processing transformer
class NewTransformer(Transformer):
    """Skeleton step: stores nothing, learns nothing and returns ``X`` as received."""

    def __init__(self):
        # TODO: keep the step's parameters on ``self``
        pass

    def fit(self, X, y=None):
        # TODO: learn whatever state the step needs from X
        return self

    def transform(self, X):
        # TODO: apply the transformation to X
        return X

# Data Cleaning

Create a *Pipeline*, which is a series of *Transformers*.

## Transformers

In [None]:
class DropNaRate(Transformer):
    """Drop the columns whose fraction of missing values is above ``rate``.

    Parameters
    ----------
    rate : float
        Threshold on the per-column fraction of missing values. A column is
        dropped only when its fraction is strictly greater than this value.
    """

    def __init__(self, rate: float):
        self.rate = rate

    def fit(self, X, y=None):
        """Record (and print) the columns of ``X`` that exceed the threshold."""
        # Mean of the boolean NA mask == number of NAs / number of rows
        na_fraction = X.isna().mean()
        above_threshold = na_fraction > self.rate
        self.cols_to_drop: pd.Index = na_fraction[above_threshold].index

        print(f">> (Info) Droped columns : {self.cols_to_drop.to_list()}")

        return self

    def transform(self, X):
        """Return ``X`` without the columns recorded during ``fit``."""
        return X.drop(columns=self.cols_to_drop)
    
### TEST ###

# Try the step on the sample frame with a 2 % missing-value threshold
transfo = DropNaRate(rate=0.02)
transfo.fit_transform(df)


In [None]:
class CleanTextData(Transformer):
    """Lower-case text columns and strip punctuation and stopwords from them.

    Parameters
    ----------
    columns : list[str]
        Names of the text columns to clean.
    extended_stopwords_list : list, default=[]
        Extra stopwords removed in addition to the NLTK English, Spanish and
        French lists. The list is only read, never modified.
    extended_ponctuation_pattern : re.Pattern[str] | str, default=r''
        Regular-expression fragment appended verbatim to the built-in
        punctuation character class. Because it is concatenated, it has to
        start with ``|`` to add alternatives (e.g. ``r'|#'``). A compiled
        pattern is accepted as well: its source text is appended and its flags
        are reused.
    """

    def __init__(
            self,
            columns: list[str],
            extended_stopwords_list: list=[],
            extended_ponctuation_pattern: re.Pattern[str] | str=r'',
        ):
        self.columns = columns
        self.extended_stopwords_list = extended_stopwords_list
        self.extended_ponctuation_pattern = extended_ponctuation_pattern

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Each string cell of the configured columns is lower-cased, stripped of
        punctuation, and has its stopwords removed. Missing values are kept
        as they are.
        """
        # A set gives constant-time membership tests in the per-token filter below.
        stopwords = {
            word.replace("\n", "")
            for word in nltk.corpus.stopwords.words(fileids=('english', 'spanish', 'french'))
        }
        stopwords.update(self.extended_stopwords_list)

        base_pattern = r'[\[\]()\-:;",/\.\.\.‘\'’?!“&]'
        extension = self.extended_ponctuation_pattern
        if isinstance(extension, re.Pattern):
            # str + re.Pattern raises TypeError, so use the pattern's source text
            # and carry its flags over to the combined expression.
            punctuation_re = re.compile(base_pattern + extension.pattern, extension.flags)
        else:
            punctuation_re = re.compile(base_pattern + extension)

        def clean_cell(cell: str) -> str:
            # remove punctuation
            text = punctuation_re.sub("", cell.lower()).replace("  ", " ")
            # remove stopwords
            return " ".join(token for token in text.split(" ") if token not in stopwords)

        # Work on a copy so the caller's DataFrame is not modified in place.
        X = X.copy()
        for col in self.columns:
            # na_action="ignore" propagates missing values instead of calling
            # clean_cell on them (which would fail on `.lower()`).
            X[col] = X[col].map(clean_cell, na_action="ignore")

        print(f">> (Info) Punctuation and stopwords removed from columns {self.columns}")

        return X
    
### TEST ###

# Try the step on the "track_name" column of the sample frame
transfo = CleanTextData(columns=["track_name"])
transfo.fit_transform(df)

## Pipeline for cleaning

In [None]:
# Source file, plus the folders where the cleaned dataset and the pickled
# pipelines are written / read by the cells below.
path_src_dataset = Path("./data/src/spotify_tracks.csv")
out_folder_dataset = Path("./data/cleaned")
out_folder_config = Path("./data/cleaned/pipelines")

# Read the whole file (no row limit); this rebinds `df`.
df = pd.read_csv(path_src_dataset)

### Create a new pipeline

In [89]:
# Cleaning steps, run in the order listed
pipeline = Pipeline([
    ("DropNaRate", DropNaRate(rate=0.7)),
    ("CleanTextData", CleanTextData(columns=["track_name"])),
    # ... add other transformations here
])

# Fit every step on `df` and keep the transformed frame
cleaned_df = pipeline.fit_transform(df)
# cleaned_df.head()

>> (Info) Droped columns : []
>> (Info) Punctuation and stopwords removed from columns ['track_name']


### Load an existing pipeline

In [90]:
pipeline_name = "TODELETE"

# NOTE(review): unpickling runs code stored in the file, so only load
# pipelines that were saved from this project.
with (out_folder_config / f"{pipeline_name}.pkl").open('rb') as file:
    pipeline: Pipeline = pickle.load(file)


# fit_transform re-fits the loaded pipeline on `df` before transforming it
cleaned_df = pipeline.fit_transform(df)
# cleaned_df.head()

>> (Info) Droped columns : []
>> (Info) Punctuation and stopwords removed from columns ['track_name']


### Save Cleaned Dataset + Pipeline

In [None]:
cleaned_df_name = "TODELETE"

# Create both output folders if needed, so the writes below do not fail
# just because a directory is missing (e.g. on a fresh checkout).
out_folder_dataset.mkdir(parents=True, exist_ok=True)
out_folder_config.mkdir(parents=True, exist_ok=True)

# Save the cleaned dataset as CSV
cleaned_df.to_csv(out_folder_dataset / Path(cleaned_df_name + ".csv"))

# Save the pipeline as a pickle under the same base name
with open(out_folder_config / Path(cleaned_df_name + ".pkl"), "wb") as file:
    pickle.dump(pipeline, file)