# Random Forests and Decision Trees
- Classify the Spotify.csv dataset 
- Predict the Genre 
- The dataset has 23 Columns and The Output feature is **playlist_genre** so all others can be used as input features. Decide on what can be used and what cannot be used. 
- Do the Preprocessing below
- Do 70:30 Train test split 

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


df = pd.read_csv('spotify.csv')
X = df.drop(columns=['playlist_genre'])
y = df['playlist_genre']
columns_to_drop = [
    'track_album_id', 'track_album_name', 'track_album_release_date',
    'playlist_name', 'playlist_id', 'track_name', 'track_artist','track_popularity']
X= X.drop(columns=columns_to_drop)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all())
#numeric or not playlist_subgenre    False
#danceability         False
#energy               False
#key                  False
#loudness              True
#mode                  True
#speechiness           True
#acousticness          True
#insttrack_id          True
#rumentalness          True
#liveness              True
#valence               True
#tempo                 True
#duration_ms           True
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X1_train = X_train.loc[:,["loudness","mode","speechiness","acousticness","insttrack_id","rumentalness","liveness","valence","tempo","duration_ms"]]
scaler = StandardScaler()
X1_train = pd.DataFrame(scaler.fit_transform(X1_train), columns=X1_train.columns)
X_train = X_train.drop(["loudness","mode","speechiness","acousticness","insttrack_id","rumentalness","liveness","valence","tempo","duration_ms"],axis=1)
X2_train = pd.concat([X_train,X1_train],axis=1)
for col in ['playlist_subgenre', 'key','danceability', 'energy']:
    X2_train[col] = label_encoder.fit_transform(X2_train[col])
X2_train

Unnamed: 0,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,insttrack_id,rumentalness,liveness,valence,tempo,duration_ms
26210,325,136,3,13,,,,,,,,,,
14615,234,72,5,15,0.778562,-0.247959,-1.213971,1.354641,-1.136007,-0.374288,-0.536933,1.141738,1.234829,-1.343402
21824,361,5,3,23,0.599293,0.253453,1.277014,0.147626,-1.136007,-0.374288,0.198889,-0.112104,0.523537,1.263229
2781,368,147,2,5,-1.786375,0.881596,-1.213971,0.871835,-1.136007,-0.374288,-0.549956,-1.297243,1.970826,-0.012461
19283,314,236,1,19,1.006097,-0.815492,0.723462,-0.140577,0.880276,-0.373703,-0.458793,0.635048,-0.848005,-0.669166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22969,448,470,6,24,0.013218,1.526269,-0.106866,0.658040,-1.136007,0.061670,1.025874,-1.580645,0.261240,-0.939874
22976,448,470,6,24,-0.607331,-0.055108,1.000238,-1.280928,0.880276,1.542306,-0.445769,0.489053,-0.777821,0.692932
22978,448,470,6,24,0.406233,0.666705,-1.490747,1.314239,0.880276,-0.374288,1.637973,0.042479,-0.777228,-0.920249
22981,448,470,6,24,-0.455641,-0.578561,0.446686,-0.389050,-1.136007,-0.374288,-0.833866,-0.700379,0.750520,-0.200349


## Frame a Decision Tree Model 
- Use Any Parameters
- You can use any library

In [98]:
import random
import pandas as pd
import numpy as np
from collections import Counter

# Define a Decision Tree class
class DecisionTree:
    def __init__(self, max_depth, min_samples_split):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = {}

    def entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = sum(probabilities * -np.log2(probabilities))
        return entropy

    def information_gain(self, X_column, y, threshold):
        left_idxs, right_idxs = self.split(X_column, threshold)
        left_y = y[left_idxs]
        right_y = y[right_idxs]
        child_entropy = self.entropy(left_y) + self.entropy(right_y)
        ig = self.entropy(y) - child_entropy
        return ig

    def split(self, X_column, threshold):
        left_idxs = np.argwhere(X_column <= threshold).flatten()
        right_idxs = np.argwhere(X_column > threshold).flatten()
        return left_idxs, right_idxs

    def most_common_label(self, y):
        counter = counter(y)
        most_common = counter.most_common(1)
        return most_common[0][0]

    def find_split_point(self, X, y):
        
        best_gain = -1
        best_column = None
        best_threshold = None

        n_samples, n_features = X.shape

        for column in range(1,n_features):
            
            X_column = X.iloc[:, column]
            thresholds = np.unique(X_column)
                
        for threshold in thresholds:
            ig = self.information_gain(X_column, y, threshold)

            if ig > best_gain:
                best_gain = ig
                best_column = column
                best_threshold = threshold

        return best_column, best_threshold

    def build_tree(self, X, y, depth=0):
        if depth == self.max_depth or len(y) < self.min_samples_split:
            self.tree['label'] = self.most_common_label(y)
            return
        column, threshold = self.find_split_point(X, y)
        self.tree['feature_idx'] = column
        self.tree['split_point'] = threshold
        left_idxs, right_idxs = self.split(X[:, column], threshold)
        self.tree['left_split'] = {}
        self.tree['right_split'] = {}
        self.build_tree(X[left_idxs], y[left_idxs], depth=depth+1)
        self.build_tree(X[right_idxs], y[right_idxs], depth=depth+1)

    def predict_tree(self, X_test):
        feature_idx = self.tree['feature_idx']
        if X_test[feature_idx] <= self.tree['split_point']:
            if 'left_split' in self.tree:
                return self.predict_tree(self.tree['left_split'], X_test)
            else:
                return self.tree['label']
        else:
            if 'right_split' in self.tree:
                return self.predict_tree(self.tree['right_split'], X_test)
            else:
                return self.tree['label']

## Frame a Random Forests Model 
- Use Any Parameters
- You can use any library

In [99]:
# Define a Random Forest class
class RandomForest:
    def __init__(self, n_trees, max_depth, min_samples_split, max_features):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []

    def build_trees(self, X, y):
        for i in range(self.n_trees):
            tree = DecisionTree(self.max_depth, self.min_samples_split)
            column_idxs = np.random.choice(X.shape[1], self.max_features, replace=False)
            print(column_idxs)
            X_column_subset = X.iloc[:, column_idxs]
            tree.build_tree(X_column_subset, y)
            self.trees.append(tree)

    def predict_rf(self, X_test):
        ensemble_preds = [tree.predict_tree(X_test) for tree in self.trees]
        final_pred = max(ensemble_preds)

## Train the Models and Report the Accuracy

In [100]:
import math
rf = RandomForest(n_trees =100,max_features=3,max_depth=5,min_samples_split=2)
rf.build_trees(X2_train,y_train)
X1_test = X_test.loc[:,["loudness","mode","speechiness","acousticness","insttrack_id","rumentalness","liveness","valence","tempo","duration_ms"]]
X1_test = pd.DataFrame(scaler.transform(X1_test), columns=X1_test.columns)
X_test = X_test.drop(["loudness","mode","speechiness","acousticness","insttrack_id","rumentalness","liveness","valence","tempo","duration_ms"],axis=1)
X2_test = pd.concat([X_test,X1_test],axis=1)
for col in ['playlist_subgenre', 'key','danceability', 'energy']:
    X2_test[col] = label_encoder.transform(X2_train[col])
X2_test

[13  9  5]


IndexError: index 22983 is out of bounds for axis 0 with size 22983

In [57]:
X2_train.shape[1]

14

In [58]:
14**0.5

3.7416573867739413