# Spotify Tracks Data Preprocessing Notebook

## Convert the Dataset to a pandas DataFrame

In [None]:
# Work on a copy of `dataset` (defined outside this section) rather than on
# the object itself.
# NOTE(review): whether `.copy()` is shallow or deep depends on the type of
# `dataset` -- confirm before relying on the copy being independent.
dataset_copy = dataset.copy()

In [None]:
# Build a pandas DataFrame from the 'train' entry of the copied dataset.
df_spotify = pd.DataFrame(dataset_copy['train'])

In [None]:
def create_track_lookup(data):
    """Build a lookup table mapping encoded track ids to track names.

    Args:
        data: DataFrame containing at least the ``track_id`` and
            ``track_name`` columns.

    Returns:
        A new DataFrame holding the distinct ``(track_id, track_name)`` pairs
        of ``data``, with ``track_id`` replaced by its numeric encoding
        (a float). ``data`` itself is left unmodified.
    """
    def value_encode(data_field):
        # Each value is encoded as the sum of the UTF-8 bytes of its string
        # form. NOTE(review): a byte sum ignores character order, so distinct
        # ids (e.g. 'ab' and 'ba') map to the same number -- the encoded id is
        # not guaranteed to be unique.
        return data_field.apply(lambda field: float(sum(str(field).encode('utf-8'))))

    # De-duplicate on the raw values first, then encode the ids.
    unique_tracks = data[['track_id', 'track_name']].drop_duplicates()

    # assign() returns a fresh frame instead of writing into a frame derived
    # from `data`, which avoids pandas' SettingWithCopyWarning.
    return unique_tracks.assign(track_id=value_encode(unique_tracks['track_id']))

## Data Preprocessing

In [None]:
def data_preprocessing(data):
    """Clean the raw tracks frame and encode its ``track_id`` column.

    Steps, in order:
      1. drop every row that contains a missing value,
      2. drop the ``'Unnamed: 0'`` column if it is present,
      3. replace ``track_id`` with its numeric encoding.

    Args:
        data: DataFrame containing at least a ``track_id`` column.

    Returns:
        A new, cleaned DataFrame; ``data`` itself is left unmodified.
    """
    def value_encode(data_field):
        # Each value is encoded as the sum of the UTF-8 bytes of its string
        # form (returned as a float).
        return data_field.apply(lambda field: float(sum(str(field).encode('utf-8'))))

    # dropna() and drop() both return new frames, so the input is never
    # mutated. errors='ignore' keeps the function usable on frames that do not
    # carry the 'Unnamed: 0' column (e.g. when it was already removed).
    cleaned = data.dropna().drop(columns='Unnamed: 0', errors='ignore')

    return cleaned.assign(track_id=value_encode(cleaned['track_id']))

In [None]:
def build_pipeline():
    """Create the preprocessing pipeline applied to the numerical columns.

    Returns:
        A ``Pipeline`` with three steps, run in this order: median imputation
        (``imputer``), a quantile transform to a normal output distribution
        (``quantile_transformer``), and standard scaling (``std_scaler``).
    """
    steps = [
        ('imputer', SimpleImputer(strategy="median")),
        ('quantile_transformer', QuantileTransformer(output_distribution='normal')),
        ('std_scaler', StandardScaler()),
    ]
    return Pipeline(steps=steps)

In [None]:
def train_test_split_processing(data: pd.DataFrame, target: str, test_size: float, shuffle: bool=True):
    """Split ``data`` into train/test sets and scale the numerical features.

    The categorical columns (``artists``, ``album_name``, ``track_name``,
    ``track_genre``) are never part of the returned features: only the
    remaining columns are passed through the ``build_pipeline()`` transformer.
    When ``target`` is one of the categorical columns it is label-encoded to
    integers.

    The transformer is fitted on the training rows only and then applied to
    the test rows, so no test-set statistics leak into the training features.

    Args:
        data: Input frame; it is not modified.
        target: Name of the column to predict.
        test_size: Forwarded to ``train_test_split``.
        shuffle: Forwarded to ``train_test_split``.

    Returns:
        ``(x_train, x_test, y_train, y_test)``. The feature frames keep the
        row labels of ``data`` so they stay aligned with the target series,
        and their columns follow the column order of ``data``.
    """
    categorical_columns = ['artists', 'album_name', 'track_name', 'track_genre']

    target_set = data[target]
    if target in categorical_columns:
        # Encode into a new Series so the caller's frame is left untouched.
        target_set = pd.Series(
            LabelEncoder().fit_transform(target_set),
            index=data.index,
            name=target,
        )

    # Keep the original column order: a set difference would give an order
    # that varies between interpreter runs.
    numerical_columns = [
        col for col in data.columns
        if col != target and col not in categorical_columns
    ]
    feature_set = data[numerical_columns]

    # Split first, so the transformer below only ever learns from train rows.
    x_train, x_test, y_train, y_test = train_test_split(
        feature_set, target_set, test_size=test_size, shuffle=shuffle
    )

    full_pipeline = ColumnTransformer([
        ("num", build_pipeline(), numerical_columns),
    ])

    # Re-wrap the transformed arrays with the original row labels so that
    # x_* and y_* share the same index.
    x_train = pd.DataFrame(
        full_pipeline.fit_transform(x_train),
        columns=numerical_columns,
        index=x_train.index,
    )
    x_test = pd.DataFrame(
        full_pipeline.transform(x_test),
        columns=numerical_columns,
        index=x_test.index,
    )

    return x_train, x_test, y_train, y_test

In [None]:
# Build the (encoded track_id, track_name) lookup table from the raw frame.
track_lookup = create_track_lookup(df_spotify)

In [None]:
# Clean the raw frame (drop rows with missing values, drop 'Unnamed: 0',
# encode track_id); the result is a new frame.
data_train = data_preprocessing(df_spotify)