In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw track table from the CSV that sits next to the notebook.
csv_path = './music_genre.csv'
df = pd.read_csv(csv_path)

In [None]:
# Remove the columns that are not used as features: identifiers, free-text
# names, key, mode, the retrieval date and (also) popularity.
unused_columns = ['instance_id', 'artist_name', 'track_name', 'key', 'mode', 'obtained_date', 'popularity']
df = df.drop(columns=unused_columns)

# Discard songs whose duration is negative. Rows with a missing duration are
# kept here (the comparison is False for NaN) and removed by dropna() below.
has_negative_duration = df['duration_ms'] < 0
df = df[~has_negative_duration]

# Discard songs whose tempo is the '?' placeholder (unknown tempo).
has_unknown_tempo = df['tempo'] == '?'
df = df[~has_unknown_tempo]

# Discard songs with any missing value, then renumber the rows 0..n-1 so the
# index is contiguous for the steps that follow.
df = df.dropna().reset_index(drop=True)

# Round tempos
def df_value_round(x):
    """Return ``x`` converted to a float and rounded to the nearest integer.

    ``x`` may be anything ``float()`` accepts (a number or a numeric string).
    The result is an ``int``; exact halves follow Python's built-in ``round``
    (round-half-to-even).
    """
    return round(float(x))

# Replace every tempo value with its rounded integer form, element by element.
rounded_tempo = df['tempo'].map(df_value_round)
df['tempo'] = rounded_tempo

# convert duration_ms to duration_secs
def ms_to_s(x):
    """Convert a duration in milliseconds to seconds.

    ``x`` may be anything ``float()`` accepts (a number or a numeric string).
    Returns a ``float``.
    """
    milliseconds = float(x)
    return milliseconds / 1000

# Overwrite the millisecond durations with seconds, then rename the column so
# its name matches the new unit (the column keeps its position in the frame).
duration_in_seconds = df['duration_ms'].map(ms_to_s)
df['duration_ms'] = duration_in_seconds
df = df.rename(columns={'duration_ms': 'duration_secs'})

# Keep the genre labels aside; they are re-attached after the features are scaled.
results = df['music_genre']

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scale every feature column; the label column is excluded here and
# re-attached afterwards.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima influence the scaling. Consider fitting on
# the training split only.
feature_df = df.drop('music_genre', axis=1)

normalize = MinMaxScaler()
# fit_transform() already fits the scaler, so a separate fit() call is redundant.
normalized = normalize.fit_transform(feature_df)

# Carry the original column names and row index over, so that assigning the
# saved label Series back later aligns row-for-row.
normalized_df = pd.DataFrame(normalized, columns=feature_df.columns, index=feature_df.index)
df = normalized_df

In [None]:
# Re-attach the genre labels saved before scaling (aligned on the row index),
# then display the resulting frame.
label_column = 'music_genre'
df[label_column] = results
df

In [None]:
# Histograms of each feature, one figure per feature with one panel per genre
genre_labels = df['music_genre']
training_features = df.drop(columns=['music_genre'])
for feature_name in training_features.columns:
    hist = df.hist(column=feature_name, by=genre_labels, figsize=(10, 10), legend=True)

In [None]:
# Split data into train (75%) and test (25%).
# A fixed random_state makes the shuffle, and therefore every metric reported
# by the cells that follow, reproducible across kernel restarts.
train_df, test_df = train_test_split(df, test_size=.25, shuffle=True, random_state=42)

In [None]:
# Overview histograms of the training split, drawn on one large figure
hist_figsize = (20, 20)
histograms = train_df.hist(figsize=hist_figsize)

In [None]:
# Descriptive Stats - Mean of every feature, per genre
df.groupby(by=['music_genre']).agg('mean')

In [None]:
# Descriptive Stats - Median of every feature, per genre
df.groupby(by=['music_genre']).agg('median')

In [None]:
# Descriptive Stats - Standard Deviation of every feature, per genre
# (this cell previously called .median(), which only repeated the median table)
df.groupby(['music_genre']).std()

In [None]:
# Boxplots: one figure per feature, grouped by genre, with outliers hidden and
# the mean marked.
# NOTE(review): the feature names come from train_df but the plots are drawn
# from df (the full dataset) - confirm which of the two is intended.
training_features = train_df.drop(columns=['music_genre'])
for feature_name in training_features.columns:
    boxplot = df.boxplot(
        column=[feature_name],
        by=['music_genre'],
        figsize=(8, 8),
        showfliers=False,
        showmeans=True,
    )

In [None]:
# Correlation Matrix
# df still contains the string-valued 'music_genre' column. Calling corr() on
# it raises on pandas >= 2.0 (and only worked through silent column dropping
# before that), so restrict the computation to numeric columns explicitly.
feature_correlations = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10,10))
sns.heatmap(feature_correlations, annot=True, cmap=plt.cm.Blues)
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [None]:
# k-nearest-neighbours classifier with k = 5.
# Separate the feature columns from the genre label for both splits.
training_labels = train_df['music_genre']
training_features = train_df.drop(columns=['music_genre'])
testing_labels = test_df['music_genre']
testing_features = test_df.drop(columns=['music_genre'])

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(training_features, training_labels)

# Report on the training split first, then on the held-out test split.
evaluation_splits = (
    ('training set when k = 5:', training_features, training_labels),
    ('testing set when k = 5:', testing_features, testing_labels),
)
for heading, split_features, split_labels in evaluation_splits:
    predicted = knn_model.predict(split_features)
    print(heading)
    print(metrics.classification_report(split_labels, predicted))
    print(metrics.confusion_matrix(split_labels, predicted))




In [None]:
# k-nearest-neighbours classifier with k = 25.
# Separate the feature columns from the genre label for both splits.
training_labels = train_df['music_genre']
training_features = train_df.drop(columns=['music_genre'])
testing_labels = test_df['music_genre']
testing_features = test_df.drop(columns=['music_genre'])

knn_model = KNeighborsClassifier(n_neighbors=25)
knn_model.fit(training_features, training_labels)

# Report on the training split first, then on the held-out test split.
evaluation_splits = (
    ('training set when k = 25:', training_features, training_labels),
    ('testing set when k = 25:', testing_features, testing_labels),
)
for heading, split_features, split_labels in evaluation_splits:
    predicted = knn_model.predict(split_features)
    print(heading)
    print(metrics.classification_report(split_labels, predicted))
    print(metrics.confusion_matrix(split_labels, predicted))


In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes classifier.
# Separate the feature columns from the genre label for both splits.
training_labels = train_df['music_genre']
training_features = train_df.drop(columns=['music_genre'])
testing_labels = test_df['music_genre']
testing_features = test_df.drop(columns=['music_genre'])

model = GaussianNB()
model.fit(training_features, training_labels)

# Scores on the data the model was fitted on.
predicted = model.predict(training_features)
training_report = metrics.classification_report(training_labels, predicted)
training_confusion = metrics.confusion_matrix(training_labels, predicted)
print('training set:')
print(training_report)
print(training_confusion)

# Scores on the held-out test split.
predicted = model.predict(testing_features)
testing_report = metrics.classification_report(testing_labels, predicted)
testing_confusion = metrics.confusion_matrix(testing_labels, predicted)
print('testing set:')
print(testing_report)
print(testing_confusion)

In [None]:
from sklearn.neighbors import NearestCentroid

In [None]:
# Nearest-centroid classifier.
# Separate the feature columns from the genre label for both splits.
training_labels = train_df['music_genre']
training_features = train_df.drop(columns=['music_genre'])
testing_labels = test_df['music_genre']
testing_features = test_df.drop(columns=['music_genre'])

centroid_model = NearestCentroid()
centroid_model.fit(training_features, training_labels)

# Report on the training split first, then on the held-out test split.
for heading, split_features, split_labels in (
    ('training set:', training_features, training_labels),
    ('testing set:', testing_features, testing_labels),
):
    predicted = centroid_model.predict(split_features)
    print(heading)
    print(metrics.classification_report(split_labels, predicted))
    print(metrics.confusion_matrix(split_labels, predicted))