In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the music-genre dataset from a CSV file (path is relative to the current working directory).
df = pd.read_csv('./music_genre.csv')

In [None]:
# Remove the columns that are not kept for analysis:
# instance_id, artist_name, track_name, key, mode, and obtained_date
df = df.drop(columns=['instance_id', 'artist_name', 'track_name', 'key', 'mode', 'obtained_date'])

# Remove songs whose duration is negative
negative_lengths = df.index[df['duration_ms'] < 0]
df = df.drop(index=negative_lengths)

# Remove songs whose tempo is recorded as the '?' placeholder
unknown_tempo = df.index[df['tempo'] == '?']
df = df.drop(index=unknown_tempo)

# Remove songs that have a NaN in any of the remaining columns of interest
df = df.dropna(subset=[
    'popularity', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'music_genre',
])

# Round tempos
def df_value_round(x):
    """Convert ``x`` to a float and round it to the nearest integer.

    Rounding is done with Python's built-in ``round``, so exact halves
    go to the nearest even integer (e.g. 0.5 -> 0, 1.5 -> 2).
    """
    return round(float(x))

# Round every tempo entry element-wise (entries are converted to float inside df_value_round).
df['tempo'] = df['tempo'].map(df_value_round)

# convert duration_ms to duration_secs
def ms_to_s(x):
    """Convert a duration in milliseconds to seconds.

    ``x`` is first converted to a float, so numeric strings are accepted.
    """
    return float(x) / 1000

# Overwrite the millisecond durations with seconds, then rename the column to match its new unit.
df = (
    df.assign(duration_ms=df['duration_ms'].map(ms_to_s))
      .rename(columns={'duration_ms': 'duration_secs'})
)

In [None]:
# histograms of each genre
# The music_genre column is passed as `by`, so the data is grouped by genre before plotting.
# The trailing semicolon suppresses the cell's text output so only the figure is shown.
df.hist(by=df['music_genre'], figsize = (20,20));

In [None]:
# split data into test and train (25% of the rows go to the test set)
# random_state pins the shuffle so the same split -- and therefore the same
# downstream metrics -- is reproduced on every Restart & Run All.
train_df, test_df = train_test_split(df, test_size=.25, shuffle=True, random_state=42)

In [None]:
# histograms of training data
# The return value of hist() is kept in `histograms`, which also stops it
# from being echoed as the cell's text output.
histograms = train_df.hist(figsize=(20, 20))

In [None]:
# Descriptive Statistics of the training features.
# Built from train_df (not the full df) so the statistics describe only the
# training split and do not include rows held out for testing.
train_features = train_df.drop(['music_genre'], axis=1)
print("*****************************************************************")
for col in train_features:
    column_values = train_features[col]
    print("{} mean = {}".format(col, np.mean(column_values)))
    print("{} median = {}".format(col, np.median(column_values)))
    # np.std uses ddof=0 by default, i.e. the population standard deviation.
    print("{} standard deviation = {}".format(col, np.std(column_values)))
    print("*****************************************************************")

In [None]:
# TODO: add the k-nearest neighbors (KNN) classifier here

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [None]:
# Gaussian Naive Bayes: fit on the training split, then report on both splits.
training_labels = train_df['music_genre']
training_features = train_df.drop(columns='music_genre')
model = GaussianNB().fit(training_features, training_labels)

# Report on the same data the model was fitted on.
predicted = model.predict(training_features)
print('training set:')
print(metrics.classification_report(training_labels, predicted))
print(metrics.confusion_matrix(training_labels, predicted))

# Report on the held-out test split.
print('testing set:')
testing_labels = test_df['music_genre']
testing_features = test_df.drop(columns='music_genre')
predicted = model.predict(testing_features)
print(metrics.classification_report(testing_labels, predicted))
print(metrics.confusion_matrix(testing_labels, predicted))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest: fit on the training split, then report on both splits.
# random_state fixes the randomness used while building the forest, so the
# reported metrics are the same on every run.
forest_model = RandomForestClassifier(random_state=42)

training_features = train_df.drop(['music_genre'], axis=1)
training_labels = train_df['music_genre']
forest_model.fit(training_features, training_labels)

# Predictions on the data the forest was fitted on.
forest_predicted = forest_model.predict(training_features)

print('training set:')
print(metrics.classification_report(training_labels, forest_predicted))
print(metrics.confusion_matrix(training_labels, forest_predicted))

print('testing set:')
testing_features = test_df.drop(['music_genre'], axis=1)
testing_labels = test_df['music_genre']

# Predictions on the held-out test split (stored in `predicted`, as in the original cell).
predicted = forest_model.predict(testing_features)
print(metrics.classification_report(testing_labels, predicted))
print(metrics.confusion_matrix(testing_labels, predicted))