In [6]:
import sys
import os
from dataclasses import dataclass

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.utils import resample

In [7]:
class DataTranformation:
    def __init__(self):
        pass
    
    def initiate_data_tranformation(self, raw_path):
        data=pd.read_csv(raw_path)

        data = data.dropna()

        label_encoder = LabelEncoder()
        data['track_genre_encoded'] = label_encoder.fit_transform(data['track_genre'])

        df_majority = data[data['explicit'] == False]
        df_minority = data[data['explicit'] == True]

        df_minority_upsampled = resample(df_minority, 
                                        replace=True,
                                        n_samples=len(df_majority),
                                        random_state=42)

        df_balanced = pd.concat([df_majority, df_minority_upsampled])

        data_resampled_time_signature = df_balanced.groupby('time_signature').apply(
            lambda x: x.sample(df_balanced['time_signature'].value_counts().max(), replace=True)).reset_index(drop=True)


        data_resampled = data_resampled_time_signature.drop(['Unnamed: 0','track_id','album_name','track_name','track_genre', 'artists'],axis =1)

        scaler = StandardScaler()
        continuous_features = ['popularity', 'duration_ms', 'danceability', 'energy', 'loudness', 
                    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
        data_resampled[continuous_features] = scaler.fit_transform(data_resampled[continuous_features])

        data_resampled = pd.get_dummies(data_resampled, columns=['key', 'mode', 'time_signature'], drop_first=True)



        return data_resampled

        
    def transform_input_data(self, dataframe, raw_path):

        data=pd.read_csv(raw_path)


        data = data.dropna()

        label_encoder = LabelEncoder()
        data['track_genre_encoded'] = label_encoder.fit_transform(data['track_genre'])

        df_majority = data[data['explicit'] == False]
        df_minority = data[data['explicit'] == True]

        df_minority_upsampled = resample(df_minority, 
                                        replace=True,
                                        n_samples=len(df_majority),
                                        random_state=42)

        df_balanced = pd.concat([df_majority, df_minority_upsampled])

        data_resampled_time_signature = df_balanced.groupby('time_signature').apply(
            lambda x: x.sample(df_balanced['time_signature'].value_counts().max(), replace=True)).reset_index(drop=True)


        data_resampled = data_resampled_time_signature.drop(['Unnamed: 0','track_id','album_name','track_name','track_genre', 'artists'],axis =1)

        scaler = StandardScaler()
        continuous_features = ['popularity', 'duration_ms', 'danceability', 'energy', 'loudness', 
                    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
        scaler.fit(data_resampled[continuous_features])
        print(dataframe)
        dataframe = scaler.transform(dataframe[continuous_features])
        
        print(dataframe)


        return dataframe



In [44]:
from sklearn.model_selection import train_test_split
def get_feature_columns():
    raw_data = '/Users/colemak/Documents/spotifymodel/artifact/data.csv'
    data_transformation = DataTranformation()
    data = data_transformation.initiate_data_tranformation(raw_data)
    X = data.drop('track_genre_encoded', axis=1) 
    y = data['track_genre_encoded']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model_features = X_train.columns

    return model_features


In [101]:
class DataTranformation:
    def __init__(self):
        pass
    
    def initiate_data_tranformation(self, raw_path):
        data=pd.read_csv(raw_path)

        data = data.dropna()

        label_encoder = LabelEncoder()
        data['track_genre_encoded'] = label_encoder.fit_transform(data['track_genre'])

        df_majority = data[data['explicit'] == False]
        df_minority = data[data['explicit'] == True]

        df_minority_upsampled = resample(df_minority, 
                                        replace=True,
                                        n_samples=len(df_majority),
                                        random_state=42)

        df_balanced = pd.concat([df_majority, df_minority_upsampled])

        data_resampled_time_signature = df_balanced.groupby('time_signature').apply(
            lambda x: x.sample(df_balanced['time_signature'].value_counts().max(), replace=True)).reset_index(drop=True)

        data_resampled = data_resampled_time_signature.drop(['Unnamed: 0','track_id','album_name','track_name','track_genre', 'artists'],axis =1)

        scaler = StandardScaler()
        continuous_features = ['popularity', 'duration_ms', 'danceability', 'energy', 'loudness', 
                    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
        data_resampled[continuous_features] = scaler.fit_transform(data_resampled[continuous_features])

        data_resampled = pd.get_dummies(data_resampled, columns=['key', 'mode', 'time_signature'], drop_first=True)


        return data_resampled
        
    def transform_input_data(self, dataframe, raw_path):
        data=pd.read_csv(raw_path)

        data = data.dropna()

        label_encoder = LabelEncoder()
        data['track_genre_encoded'] = label_encoder.fit_transform(data['track_genre'])

        df_majority = data[data['explicit'] == False]
        df_minority = data[data['explicit'] == True]

        df_minority_upsampled = resample(df_minority, 
                                        replace=True,
                                        n_samples=len(df_majority),
                                        random_state=42)

        df_balanced = pd.concat([df_majority, df_minority_upsampled])

        data_resampled_time_signature = df_balanced.groupby('time_signature').apply(
            lambda x: x.sample(df_balanced['time_signature'].value_counts().max(), replace=True)).reset_index(drop=True)

        data_resampled = data_resampled_time_signature.drop(['Unnamed: 0','track_id','album_name','track_name','track_genre', 'artists'],axis =1)

        scaler = StandardScaler()
        continuous_features = ['popularity', 'duration_ms', 'danceability', 'energy', 'loudness', 
                    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
        cat_features = ['mode','key','time_signature', 'explicit']
        scaler.fit(data_resampled[continuous_features])
        cat_dataframe = dataframe[cat_features]
        dataframe = scaler.transform(dataframe[continuous_features])
        dataframe = pd.DataFrame(dataframe, columns=continuous_features)
        dataframe = pd.concat([dataframe, cat_dataframe.reset_index(drop=True)], axis=1)
        dataframe = dataframe[[
                'popularity', 
                'duration_ms', 
                'explicit',
                'danceability', 
                'energy',
                'key', 
                'loudness', 
                'mode',
                'speechiness',
                'acousticness',
                'instrumentalness',
                'liveness',
                'valence',
                'tempo',
                'time_signature'
                ]]
        feature_columns = get_feature_columns()
        print(dataframe)
        categorical_features = ['key','mode','time_signature']
        data_dummy = pd.get_dummies(dataframe[categorical_features], drop_first=False)
        training_columns = ['key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
                    'key_11', 'mode_1', 'time_signature_1', 'time_signature_3', 'time_signature_4', 'time_signature_5']
        print(data_dummy)
        data_dummy = data_dummy.reindex(columns=training_columns, fill_value=False)
        dataframe = dataframe.drop(columns=training_columns)
        dataframe = pd.concat([dataframe, data_dummy], axis=1)
        print(dataframe)
        # dataframe = pd.get_dummies(dataframe, columns=['key', 'mode', 'time_signature'], drop_first=True)
        # print(dataframe)

        return dataframe


In [102]:
song_df = pd.read_csv('/Users/colemak/Documents/spotifymodel/artifact/test.csv')
data = pd.read_csv('/Users/colemak/Documents/spotifymodel/artifact/data.csv')

In [64]:
song_df

Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0,58,179147,False,0.674,0.932,2,-2.921,1,0.0417,0.137,1.1e-05,0.0934,0.469,130.986,4


In [103]:
pd.set_option('display.max_columns', None)
data_transformation = DataTranformation()
song_df = data_transformation.transform_input_data(song_df, '/Users/colemak/Documents/spotifymodel/artifact/data.csv')
song_df

  data_resampled_time_signature = df_balanced.groupby('time_signature').apply(
  data_resampled_time_signature = df_balanced.groupby('time_signature').apply(


   popularity  duration_ms  explicit  danceability    energy  key  loudness  \
0    1.319693    -0.103709     False      0.927616  1.334808    2  1.044015   

   mode  speechiness  acousticness  instrumentalness  liveness   valence  \
0     1    -0.535115      -0.81509         -0.639112 -0.762512  0.492385   

      tempo  time_signature  
0  0.696437               4  
   key  mode  time_signature
0    2     1               4


KeyError: "['key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'mode_1', 'time_signature_1', 'time_signature_3', 'time_signature_4', 'time_signature_5'] not found in axis"