In [1]:
import mlflow

mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('spotify_predictions')

<Experiment: artifact_location='/Users/davidgillespie/projects/MLops_project/mlruns/1', creation_time=1720533340194, experiment_id='1', last_update_time=1720533340194, lifecycle_stage='active', name='spotify_predictions', tags={}>

In [2]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
def load_data(filename):
    
    return pd.read_csv(filename)

In [4]:
def encode_genre(df):
    label_encoder = LabelEncoder()
    label_encoder.fit(df['genre'])
    df['genre_label'] = label_encoder.fit_transform(df['genre'])
    
    return df, label_encoder

In [5]:
def get_data_from_year(df:pd.DataFrame, year=None):
    #function to get data from a certain year
    if year is not None:
        return df[df['year']==year]
    return df

In [6]:
def get_features(df:pd.DataFrame, features = ['genre_label', 'danceability', 'loudness', 'speechiness', 'tempo']):
    
    return df[features]

In [7]:
def create_features(df:pd.DataFrame):
    #drop frames with the popularity score of 0
    df = df[df['popularity'] !=0]
    #create the duration in mins
    df['duration'] = df['duration_ms'].values /1000 / 60
    #encode the genre
    df, label_encoder = encode_genre(df)
    return df, label_encoder

In [8]:
def create_score(df:pd.DataFrame, thresh = 50):
    score = []
    for v in df['popularity']:
        if v >=thresh:
            score.append('popular')
        else:
            score.append('low')
    df['score'] = score
    return df

In [9]:
def create_dataset(filename:str, target = 'score'):
    df = load_data(filename)
    df, label_encoder  = create_features(df)
    df = create_score(df)
    X = get_features(df)
    y = df[target]
    return X,y,label_encoder

In [10]:
X,y,label_encoder =create_dataset('data/spotify_data.csv')
scaler = StandardScaler().fit(X)
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['duration'] = df['duration_ms'].values /1000 / 60
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['genre_label'] = label_encoder.fit_transform(df['genre'])


In [11]:
len(X_train), len(X_test)

(751029, 250344)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

In [13]:
with mlflow.start_run(run_name='Random Forest') as run:
    mlflow.log_param('training data', 'data/spotify_data.csv')
    
    mlflow.sklearn.autolog()
    rf = RandomForestClassifier(n_jobs = 8, n_estimators = 100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    

    mlflow.log_metric('f1_score_low_class', f1_score(y_test, y_pred, pos_label='low'))
    mlflow.log_metric('precision_low_class', precision_score(y_test, y_pred, pos_label='low'))
    mlflow.log_metric('recall_low_class', recall_score(y_test, y_pred, pos_label='low'))
                      
    mlflow.log_metric('f1_score_popular_class', f1_score(y_test, y_pred, pos_label='popular'))
    mlflow.log_metric('precision_popular_class', precision_score(y_test, y_pred, pos_label='popular'))
    mlflow.log_metric('recall_popular_class', recall_score(y_test, y_pred, pos_label='popular'))
                                     

