In [None]:
import re
import pickle
import nltk

import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.svm import LinearSVC
from abc import ABC, abstractmethod
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler


assert nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fmore\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
path_src_dataset = Path("./data/cleaned/TODELETE.csv")

df = pd.read_csv(path_src_dataset, nrows=10000) # Dataframe used to test functions, we can only take few rows

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,7wrYBASu0OoxoDErd4Edxd,Bombay Jayashri,Rehnaa Hai Terre Dil Mein,zara zara,58,298266,False,0.643,0.268,...,-15.073,0,0.09,0.593,2e-06,0.316,0.62,143.813,4,classical
1,1,72HdutlIHBZJ7WT1xVAAZT,Shankar;Ehsaan;Loy;Alisha Chinai;Shankar Mahad...,,kajra,59,482586,False,0.484,0.898,...,-4.132,1,0.164,0.365,0.0,0.091,0.68,91.975,4,classical
2,2,7JGgKHHDgJCJkQCQxyHHdl,Bombay Jayashri;DJ Aftab,Hindi Slowed Reverb Bollywood Lofi,zara zara lofi,54,219437,False,0.608,0.638,...,-6.008,0,0.0292,0.581,0.0172,0.448,0.439,140.109,4,classical
3,3,3YRj4jmwois2ctPnhwSwFo,Bombay Jayashri,Minnalae,vaseegara,68,299146,False,0.695,0.293,...,-16.278,0,0.0431,0.596,0.0158,0.132,0.637,143.804,4,classical
4,4,3tp3ij9dtY3CacQgd1OvRf,Bombay Jayashri;Swattrex,Hindi LoFi Vibe,zara zara lofi chill,59,387716,False,0.583,0.308,...,-18.303,0,0.0465,0.581,0.0106,0.257,0.241,118.226,4,classical


In [6]:
class Transformer(ABC, BaseEstimator, TransformerMixin):

    @abstractmethod
    def __init__(self):
        super().__init__()

    @abstractmethod
    def fit(self, X: pd.DataFrame, y=None):
        pass

    @abstractmethod
    def transform(self, X: pd.DataFrame):
        pass

# EXAMPLE OF TRANSFORMER FOR CLEANING / PROCESSING
class NewTransformer(Transformer):
    def __init__(self):
        #TODO
        pass

    def fit(self, X, y=None):
        
        #TODO

        return self

    
    def transform(self, X):
        
        #TODO

        return X

# Data Processing

Create a *Pipeline* which is a series of *Tranformers*.

## Transformers

In [26]:
class SumCols(Transformer):
    def __init__(
            self,
            columns: list[str],
            weights: list[float]=[],
            new_col_name: str=None,
            remove_cols_in: bool=False,
        ):
        
        assert len(columns) > 1, ">> (ERROR - SumCols) 2 columns are required"
        self.columns = columns
        
        assert len(weights) == 0 or len(weights) == len(self.columns), ">> (ERROR - SumCols) columns and weights must have same dimensions."
        self.weights = weights if len(weights) == 0 else [1]*len(columns)

        self.new_col_name = new_col_name if new_col_name is not None else "_+_".join(self.columns)
        self.remove_cols_in = remove_cols_in

    def fit(self, X, y=None):
        return self

    def transform(self, X):

        X[self.new_col_name] = np.dot(X[self.columns], self.weights)

        print(f">> (INFO - SumCols) columns {self.columns} has been sumed in a new column : {self.new_col_name}")


        if self.remove_cols_in:
            X = X.drop(columns=self.columns)

        return X
    
### TEST ###

transfo = SumCols(["duration_ms", "popularity"], [0, 2], remove_cols_in=True)
df_test = transfo.fit_transform(df)
df_test.head()

>> (INFO - SumCols) columns ['duration_ms', 'popularity'] has been sumed in a new column : duration_ms_+_popularity


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,duration_ms_+_popularity
0,0,7wrYBASu0OoxoDErd4Edxd,Bombay Jayashri,Rehnaa Hai Terre Dil Mein,zara zara,False,0.643,0.268,11,-15.073,0,0.09,0.593,2e-06,0.316,0.62,143.813,4,classical,298324
1,1,72HdutlIHBZJ7WT1xVAAZT,Shankar;Ehsaan;Loy;Alisha Chinai;Shankar Mahad...,,kajra,False,0.484,0.898,0,-4.132,1,0.164,0.365,0.0,0.091,0.68,91.975,4,classical,482645
2,2,7JGgKHHDgJCJkQCQxyHHdl,Bombay Jayashri;DJ Aftab,Hindi Slowed Reverb Bollywood Lofi,zara zara lofi,False,0.608,0.638,11,-6.008,0,0.0292,0.581,0.0172,0.448,0.439,140.109,4,classical,219491
3,3,3YRj4jmwois2ctPnhwSwFo,Bombay Jayashri,Minnalae,vaseegara,False,0.695,0.293,11,-16.278,0,0.0431,0.596,0.0158,0.132,0.637,143.804,4,classical,299214
4,4,3tp3ij9dtY3CacQgd1OvRf,Bombay Jayashri;Swattrex,Hindi LoFi Vibe,zara zara lofi chill,False,0.583,0.308,7,-18.303,0,0.0465,0.581,0.0106,0.257,0.241,118.226,4,classical,387775


In [None]:
class PartialOneHotEncoder(Transformer):
    """partial because only some columns can be selected for encoding."""    
    
    def __init__(
            self,
            columns: list[str],
            *,
            categories="auto",
            drop='if_binary',
            handle_unknown="ignore",
            min_frequency=None,
            max_categories=None,
        ):
        self.columns = columns

        self.encoder = OneHotEncoder(
            categories=categories,
            drop=drop,
            sparse_output=False,
            handle_unknown=handle_unknown,
            min_frequency=min_frequency,
            max_categories=max_categories,
        )


    def fit(self, X, y=None):

        self.encoder = self.encoder.fit(X[self.columns])

        return self

    def transform(self, X):

        X_one_hot_encoded= self.encoder.transform(X[self.columns])

        X_one_hot_df = pd.DataFrame(X_one_hot_encoded, columns=self.encoder.get_feature_names_out())

        X = pd.concat([df.drop(self.columns, axis=1), X_one_hot_df], axis=1)

        print(f">> (INFO - PartialOneHotEncoder) {self.encoder.feature_names_in_} features one hot encoded as : {self.encoder.get_feature_names_out()}")

        return X
    
### TEST ###

transfo = PartialOneHotEncoder(columns=["explicit"])
df_test = transfo.fit_transform(df)
df_test.head()

>> (INFO - PartialOneHotEncoder) ['explicit'] features one hot encoded as : ['explicit_False' 'explicit_True' 'explicit_nan']


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,time_signature,track_genre,duration_ms_+_popularity,explicit_False,explicit_True,explicit_nan
0,0,7wrYBASu0OoxoDErd4Edxd,Bombay Jayashri,Rehnaa Hai Terre Dil Mein,zara zara,58,298266,0.643,0.268,11,...,2e-06,0.316,0.62,143.813,4,classical,298324,1.0,0.0,0.0
1,1,72HdutlIHBZJ7WT1xVAAZT,Shankar;Ehsaan;Loy;Alisha Chinai;Shankar Mahad...,,kajra,59,482586,0.484,0.898,0,...,0.0,0.091,0.68,91.975,4,classical,482645,1.0,0.0,0.0
2,2,7JGgKHHDgJCJkQCQxyHHdl,Bombay Jayashri;DJ Aftab,Hindi Slowed Reverb Bollywood Lofi,zara zara lofi,54,219437,0.608,0.638,11,...,0.0172,0.448,0.439,140.109,4,classical,219491,1.0,0.0,0.0
3,3,3YRj4jmwois2ctPnhwSwFo,Bombay Jayashri,Minnalae,vaseegara,68,299146,0.695,0.293,11,...,0.0158,0.132,0.637,143.804,4,classical,299214,1.0,0.0,0.0
4,4,3tp3ij9dtY3CacQgd1OvRf,Bombay Jayashri;Swattrex,Hindi LoFi Vibe,zara zara lofi chill,59,387716,0.583,0.308,7,...,0.0106,0.257,0.241,118.226,4,classical,387775,1.0,0.0,0.0


In [None]:
class PartialStandardScaler(Transformer):
    """partial because only some columns can be selected for standardiation."""    

    def __init__(
            self,
            columns: list[str],
            *,
            copy: bool = True,
            with_mean: bool = True,
            with_std: bool = True
        ):
        self.columns = columns
        self.standardizer = StandardScaler(
            copy=copy,
            with_mean=with_mean,
            with_std=with_std,
        )

    def fit(self, X, y=None):

        self.standardizer.fit(X[self.columns])

        return self

    
    def transform(self, X):
        
        X_standardized_np = self.standardizer.transform(X[self.columns])

        X_standardized = pd.DataFrame(X_standardized_np, columns=self.standardizer.get_feature_names_out())

        X = pd.concat([df.drop(self.columns, axis=1), X_standardized], axis=1)

        print(f">> (INFO - PartialStandardScaler) columns {self.columns} have bean standardized")


        return X
    
### TEST ###

transfo = PartialStandardScaler(columns=["energy"])
df_test = transfo.fit_transform(df)
df_test.head()

>> (INFO - PartialStandardScaler) columns ['energy'] have bean standardized


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,key,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,duration_ms_+_popularity,energy
0,0,7wrYBASu0OoxoDErd4Edxd,Bombay Jayashri,Rehnaa Hai Terre Dil Mein,zara zara,58,298266,False,0.643,11,...,0.09,0.593,2e-06,0.316,0.62,143.813,4,classical,298324,-1.087423
1,1,72HdutlIHBZJ7WT1xVAAZT,Shankar;Ehsaan;Loy;Alisha Chinai;Shankar Mahad...,,kajra,59,482586,False,0.484,0,...,0.164,0.365,0.0,0.091,0.68,91.975,4,classical,482645,1.123097
2,2,7JGgKHHDgJCJkQCQxyHHdl,Bombay Jayashri;DJ Aftab,Hindi Slowed Reverb Bollywood Lofi,zara zara lofi,54,219437,False,0.608,11,...,0.0292,0.581,0.0172,0.448,0.439,140.109,4,classical,219491,0.210819
3,3,3YRj4jmwois2ctPnhwSwFo,Bombay Jayashri,Minnalae,vaseegara,68,299146,False,0.695,11,...,0.0431,0.596,0.0158,0.132,0.637,143.804,4,classical,299214,-0.999703
4,4,3tp3ij9dtY3CacQgd1OvRf,Bombay Jayashri;Swattrex,Hindi LoFi Vibe,zara zara lofi chill,59,387716,False,0.583,7,...,0.0465,0.581,0.0106,0.257,0.241,118.226,4,classical,387775,-0.947072


In [13]:
class DropCols(Transformer):
    def __init__(self, columns: list[str]):
        self.columns = columns
        pass

    def fit(self, X, y=None):
        return self

    
    def transform(self, X):
        
        X = X.drop(columns=self.columns)

        print(f">> (INFO - DropCols) columns {self.columns} is/are droped.")

        return X
    
transfo = DropCols(columns=["explicit"])
df_test = transfo.fit_transform(df)
df_test.head()

>> (INFO - DropCols) columns ['explicit'] is/are droped.


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,7wrYBASu0OoxoDErd4Edxd,Bombay Jayashri,Rehnaa Hai Terre Dil Mein,zara zara,58,298266,0.643,0.268,11,-15.073,0,0.09,0.593,2e-06,0.316,0.62,143.813,4,classical
1,1,72HdutlIHBZJ7WT1xVAAZT,Shankar;Ehsaan;Loy;Alisha Chinai;Shankar Mahad...,,kajra,59,482586,0.484,0.898,0,-4.132,1,0.164,0.365,0.0,0.091,0.68,91.975,4,classical
2,2,7JGgKHHDgJCJkQCQxyHHdl,Bombay Jayashri;DJ Aftab,Hindi Slowed Reverb Bollywood Lofi,zara zara lofi,54,219437,0.608,0.638,11,-6.008,0,0.0292,0.581,0.0172,0.448,0.439,140.109,4,classical
3,3,3YRj4jmwois2ctPnhwSwFo,Bombay Jayashri,Minnalae,vaseegara,68,299146,0.695,0.293,11,-16.278,0,0.0431,0.596,0.0158,0.132,0.637,143.804,4,classical
4,4,3tp3ij9dtY3CacQgd1OvRf,Bombay Jayashri;Swattrex,Hindi LoFi Vibe,zara zara lofi chill,59,387716,0.583,0.308,7,-18.303,0,0.0465,0.581,0.0106,0.257,0.241,118.226,4,classical


## Pipeline for processing

In [None]:
path_src_dataset = Path("./data/cleaned/TODELETE.csv")
out_folder_dataset = Path("./data/processed")
out_folder_config = Path("./data/processed/pipelines")

df = pd.read_csv(path_src_dataset)

### Create a new pipeline

Split Continuous / Categorical features

In [18]:
categorical_features = df.select_dtypes(include=["object"]).columns.to_list()
num_categorical_features = ["key","mode","time_signature"] # Add numerical data but with a categorical meaning (ex: color of car  => red=0, blue=1, green=2)
categorical_features.extend(num_categorical_features)

numerical_features = df.drop(columns=categorical_features).columns.to_list()

In [None]:
pipeline = Pipeline([
    ('SumCols', SumCols(columns=["duration_ms", "popularity"], weights=[0, 2], remove_cols_in=True)),
    ('DropCols', DropCols(["key"])),
    ('OneHotEncoder', PartialOneHotEncoder(columns=["explicit"])),
    ('PartialStandardScaler', PartialStandardScaler(columns=["energy"])),
    # ... Add other Transformers
    ('FeatureSelection', SelectFromModel(LinearSVC(penalty="l1"), max_features=30)) # Feature selection
])

df_processed = pipeline.fit_transform(df)
df_processed.head()

>> (INFO - SumCols) columns ['duration_ms', 'popularity'] has been sumed in a new column : duration_ms_+_popularity
>> (INFO - DropCols) columns ['key'] is/are droped.
>> (INFO - PartialOneHotEncoder) ['explicit'] features one hot encoded as : ['explicit_False' 'explicit_True' 'explicit_nan']
>> (INFO - PartialStandardScaler) columns ['energy'] have bean standardized


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,key,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,duration_ms_+_popularity,energy
0,0,7wrYBASu0OoxoDErd4Edxd,Bombay Jayashri,Rehnaa Hai Terre Dil Mein,zara zara,58,298266,False,0.643,11,...,0.09,0.593,2e-06,0.316,0.62,143.813,4,classical,298324,-1.087423
1,1,72HdutlIHBZJ7WT1xVAAZT,Shankar;Ehsaan;Loy;Alisha Chinai;Shankar Mahad...,,kajra,59,482586,False,0.484,0,...,0.164,0.365,0.0,0.091,0.68,91.975,4,classical,482645,1.123097
2,2,7JGgKHHDgJCJkQCQxyHHdl,Bombay Jayashri;DJ Aftab,Hindi Slowed Reverb Bollywood Lofi,zara zara lofi,54,219437,False,0.608,11,...,0.0292,0.581,0.0172,0.448,0.439,140.109,4,classical,219491,0.210819
3,3,3YRj4jmwois2ctPnhwSwFo,Bombay Jayashri,Minnalae,vaseegara,68,299146,False,0.695,11,...,0.0431,0.596,0.0158,0.132,0.637,143.804,4,classical,299214,-0.999703
4,4,3tp3ij9dtY3CacQgd1OvRf,Bombay Jayashri;Swattrex,Hindi LoFi Vibe,zara zara lofi chill,59,387716,False,0.583,7,...,0.0465,0.581,0.0106,0.257,0.241,118.226,4,classical,387775,-0.947072


### Load an existing pipeline

In [None]:
pipeline_name = "TODELETE"

with open(out_folder_config / Path(pipeline_name + ".pkl"), 'rb') as file:
    pipeline: Pipeline = pickle.load(file)


df_processed = pipeline.fit_transform(df)
# df_processed.head()

>> (Info) Droped columns : []
>> (Info) Punctuation and stopwords removed from columns ['track_name']


### Save Processed Dataset + Pipeline

In [None]:
df_processed_name = "TODELETE"

df_processed.to_csv(out_folder_dataset / Path(df_processed_name + ".csv"))

# Writing to sample.json
with open(out_folder_config / Path(df_processed_name + ".pkl"), "wb") as file:
    pickle.dump(pipeline, file)