### Import required libraries

In [None]:

import pymysql
from cryptography.fernet import Fernet
import json
from collections import namedtuple
import time
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine 
from sklearn import model_selection
from sklearn.metrics import classification_report as cr
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from yellowbrick.classifier import ClassificationReport
from yellowbrick.features import FeatureImportances

%matplotlib inline

### Increase the pandas display size (rows, columns and width)

In [None]:
# Widen pandas' output limits so wide or long frames are not truncated when
# they are displayed.
# The former 'display.height' option is intentionally not set: it was removed
# from pandas (setting it raises OptionError on pandas >= 1.0), and the row
# limit is already controlled by 'display.max_rows'.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Decrypt Credentials and Connect to Database

In [None]:
# Decode the secret license file
def unencrypt(path='/usr/local/etc/musicmood_bytes.bin'):
    """Decrypt the credentials file and return its JSON content as a namedtuple.

    The last line of the file at ``path`` is decrypted with Fernet, decoded as
    UTF-8 and parsed as JSON; every JSON object becomes a namedtuple whose
    fields are the object's keys.

    Args:
        path: Location of the encrypted credentials file. Defaults to the
            path this notebook has always used.

    Returns:
        The parsed namedtuple on success. On any failure the exception
        message is printed and the string ``"Error"`` is returned, so callers
        must check for that sentinel before reading attributes.
    """
    try:
        # SECURITY NOTE(review): the Fernet key is stored in the source file,
        # so anyone who can read this notebook can decrypt the credentials.
        # Consider loading it from an environment variable or a secret store.
        key = b'IXx5rHfP15FqP4ahx2pwcud-XmcBzU553Ri6p-nVhnc='  # Fernet.generate_key()
        cipher_suite = Fernet(key)

        encryptedpwd = None
        with open(path, 'rb') as file_object:
            # Only the last line of the file is used as the token.
            for line in file_object:
                encryptedpwd = line
        if encryptedpwd is None:
            # Report an empty file explicitly instead of failing later with an
            # unrelated "referenced before assignment" message.
            raise ValueError('credentials file %s is empty' % path)

        plain_text = bytes(cipher_suite.decrypt(encryptedpwd)).decode("utf-8")
        return json.loads(
            plain_text,
            object_hook=lambda d: namedtuple('X', d.keys())(*d.values()),
        )
    except Exception as e:
        # Best-effort contract kept for existing callers: report and return
        # the "Error" sentinel rather than propagating the exception.
        print(str(e))
        return "Error"

# Set up the database connection
credentials = unencrypt()
if isinstance(credentials, str):
    # unencrypt() reports failure by returning the string "Error" (after
    # printing the cause). Stop here with a clear message instead of failing
    # below with "'str' object has no attribute 'user'".
    raise RuntimeError('Could not load the database credentials; see the message printed above.')

user_id = credentials.user
user_password = credentials.password
dbname = credentials.dbname
server = credentials.server

# Keyword arguments are used because PyMySQL >= 1.0 no longer accepts the
# connection parameters positionally.
conn = pymysql.connect(host=server, user=user_id, password=user_password, database=dbname)

### Load the normalized table

In [None]:
# Read the whole normalized songs table into a DataFrame and report how long
# the query took.
query = 'SELECT * FROM songs_instance_normalized'
start_time = time.time()
df = pd.read_sql(query, con=conn)
elapsed = time.time() - start_time
print('--- %s seconds ---' % elapsed)

# The table carries an 'index' column that is not used as a feature; remove
# it in place before looking at the first rows.
df.drop(columns=['index'], inplace=True)
df.head(15)

# Optional inspection helpers:
#df.describe()
#print(df.columns)

### Visualize only important features 

In [None]:
# Columns kept for the exploration and modelling cells. The target,
# 'is_top40', stays last because the train/validation split takes the target
# from the final column.
# (An earlier, narrower selection also included 'decade' and only the
# 'gnr_Metal' genre flag.)
selected_columns = [
    'duration', 'key_song', 'loudness', 'mode', 'tempo', 'time_signature',
    'words_song_u', 'words_song', 'words_song_r',
    'words_genre_u', 'words_genre', 'words_genre_r',
    'words_second', 'speed_general',
    'artist_familiarity', 'artist_hotttnesss',
    'gnr_Blues', 'gnr_Country', 'gnr_Folk', 'gnr_Funk',
    'gnr_House Electronic Trance', 'gnr_Jazz', 'gnr_Latin', 'gnr_Metal',
    'gnr_Pop', 'gnr_Pop Standards', 'gnr_Punk', 'gnr_Rap Hip Hop',
    'gnr_Rhythm and Blues', 'gnr_Rock', 'gnr_Rock and Roll',
    'gnr_Ska Reggae Dancehall',
    'is_top40',
]
songs = df[selected_columns]

### Correlation Matrix Plot

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise correlations between all selected columns.
corr = songs.corr()
labels = corr.columns.values
fig, ax = plt.subplots(figsize=(25, 15))
ax = sns.heatmap(
    corr,
    xticklabels=labels,
    yticklabels=labels,
    annot=True,
    linewidths=.5,
    cmap="Greens",
    ax=ax,  # the axes just created (also the current axes)
)
plt.show()

### Scatterplot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks", color_codes=True)

# Pairwise regression plots for the first group of columns, coloured by
# 'is_top40'.
# `height` is used instead of `size`: seaborn renamed the keyword in 0.9 and
# later releases no longer accept `size`.
pair_vars = ['duration', 'key_song', 'loudness', 'mode', 'tempo',
             'time_signature', 'is_top40']
g = sns.pairplot(songs, height=3, hue='is_top40', vars=pair_vars, kind="reg")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks", color_codes=True)

# Pairwise regression plots for the word-count columns, coloured by
# 'is_top40'.
# `height` is used instead of `size`: seaborn renamed the keyword in 0.9 and
# later releases no longer accept `size`.
pair_vars = ['words_song_u', 'words_song', 'words_song_r', 'words_genre_u',
             'words_genre', 'words_genre_r', 'words_second', 'is_top40']
g = sns.pairplot(songs, height=3, hue='is_top40', vars=pair_vars, kind="reg")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks", color_codes=True)

# Pairwise regression plots for the remaining columns of interest, coloured
# by 'is_top40'.
# `height` is used instead of `size`: seaborn renamed the keyword in 0.9 and
# later releases no longer accept `size`.
pair_vars = ['speed_general', 'artist_familiarity', 'artist_hotttnesss',
             'gnr_Metal', 'is_top40']
g = sns.pairplot(songs, hue='is_top40', height=3, vars=pair_vars, kind="reg")

### Scatter Matrix

In [None]:
# Scatter plot of every pair of selected columns, with a kernel density
# estimate of each column on the diagonal.
from pandas.plotting import scatter_matrix

scatter_options = dict(alpha=0.5, figsize=(22, 22), diagonal='kde')
scatter_matrix(songs, **scatter_options)
plt.show()

### Kernel Density Estimation Plots

In [None]:
# One kernel density estimate per column, on a 5x7 grid with independent axes.
songs.plot.kde(
    subplots=True,
    layout=(5, 7),
    sharex=False,
    sharey=False,
    figsize=(25, 25),
)
plt.show()

### Box Plots 

In [None]:
# One box plot per column, on a 5x7 grid with independent axes.
songs.plot.box(
    subplots=True,
    layout=(5, 7),
    sharex=False,
    sharey=False,
    figsize=(30, 30),
)
plt.show()

### Parallel Coordinates

In [None]:
#from pandas.plotting import parallel_coordinates
#plt.figure(figsize=(20,5))
#dataset = df[['loudness','mode','duration','time_signature','tempo','key_song','is_top40']]
#parallel_coordinates(songs, 'is_top40',color=['r','b'], alpha=0.45)
#plt.show()
#songs.columns
from yellowbrick.features import ParallelCoordinates

# Legend labels must follow the sorted order of the target values.
# NOTE(review): assumes is_top40 is encoded 0 = no hit, 1 = Top 40, so the
# "No Hit" label has to come first — confirm against the table. The previous
# order labelled the two classes the wrong way round under that encoding.
classes = ["No Hit", "Top40"]

# Alternative, wider feature list kept for reference:
# features = ['duration', 'key_song', 'loudness', 'mode', 'tempo', 'time_signature', 'words_song_u',
#             'words_song', 'words_song_r', 'words_genre_u', 'words_genre', 'words_genre_r', 'words_second',
#             'speed_general', 'artist_familiarity', 'artist_hotttnesss', 'gnr_Metal']
features = ['gnr_Metal', 'duration', 'loudness', 'mode', 'tempo', 'words_song', 'gnr_Rock',
            'words_song_r', 'words_genre_u', 'words_genre_r',
            'words_second', 'gnr_Rock and Roll', 'artist_hotttnesss']
X = songs[features]
y = songs.is_top40

# Create the figure first so the visualizer draws on a 20x8 canvas.
_, ax = plt.subplots(figsize=(20, 8))

# Instantiate the visualizer: min-max normalise each feature and draw a
# shuffled 50% sample of the rows.
visualizer = ParallelCoordinates(
    classes=classes, features=features, normalize='minmax', sample=0.5, shuffle=True
)

# Fit and transform the data to the visualizer
visualizer.fit_transform(X, y)

# Finalize the title and axes then display the visualization
visualizer.poof()

### Rad Viz

In [None]:
# Class labels for the legend, in the sorted order of the target values.
# NOTE(review): assumes is_top40 is encoded 0 = no hit, 1 = Top 40, so
# "NoHit" has to come first — confirm against the table.
classes = ["NoHit", "Top40"]

# Extract the numpy arrays from the data frame. `.values` replaces
# `.as_matrix()`, which was removed in pandas 1.0. `features` is the list
# defined in the parallel-coordinates cell.
X = songs[features].values
y = songs.is_top40.values

# Import the visualizer
from yellowbrick.features import RadViz

# Instantiate the visualizer on a 20x20 canvas
_, ax = plt.subplots(figsize=(20, 20))
visualizer = RadViz(classes=classes, features=features)

visualizer.fit(X, y)      # Fit the data to the visualizer
visualizer.transform(X)   # Transform the data
visualizer.poof()         # Draw/show/poof the data

# Initial Model Comparison

### Split-Out validation dataset

In [None]:
# Split the selected columns into a feature matrix (every column but the
# last) and the target vector (the last column, is_top40).
array = songs.values
X, Y = array[:, :-1], array[:, -1]

# Hold out 20% of the rows for validation, with a fixed seed so the split is
# reproducible.
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

  ### Test options and evaluation metric

In [None]:
# Random seed and the scoring metric handed to cross_val_score.
seed, scoring = 7, 'accuracy'

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import RadiusNeighborsClassifier

from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.svm import LinearSVC

from sklearn.ensemble import AdaBoostClassifier

### Selected Algorithms

In [None]:
# (name, estimator) pairs evaluated by the following cells.
models = []

# Candidate classifiers that can be re-enabled for a broader comparison:
# #Naive Bayes
# models.append(('NB', GaussianNB(priors=None)))
# models.append(('BNB', BernoulliNB(alpha=1.0)))
# #Decision Tree
# models.append(('CART',DecisionTreeClassifier(criterion = 'entropy', splitter='random', max_depth = 10)))
# models.append(('RFC', RandomForestClassifier(n_estimators=10,max_depth=10)))
# #Discriminant Analysis
# models.append(('LDA', LinearDiscriminantAnalysis()))
# models.append(('QDA', QuadraticDiscriminantAnalysis()))
# #LogisticRegression
# models.append(('LRG', LogisticRegression()))
# models.append(('LCV', LogisticRegressionCV()))
# #KNN
# models.append(('KNN', KNeighborsClassifier()))
# models.append(('NCT', NearestCentroid()))
# #models.append(('KNN', NearestNeighbors()))
# models.append(('RNC', RadiusNeighborsClassifier(radius=5.0, outlier_label=1)))
# #Support Vector Machine
# models.append(('SVM', SVC()))
# models.append(('NuSVM', NuSVC()))
# models.append(('LinearSVC', LinearSVC())) #kernel="linear", C=0.025)))
# models.append(('ABC',AdaBoostClassifier()))

# K-nearest-neighbours sweep: one classifier per neighbour count 1..12, named
# 'KNC1' .. 'KNC12'. The values are iterated directly rather than through an
# index, and each float from linspace is converted to int once.
n_neighbors = np.linspace(1, 12, num=12, endpoint=True)
for k in map(int, n_neighbors):
    knn = KNeighborsClassifier(algorithm='auto', metric='minkowski', n_jobs=-1, n_neighbors=k)
    models.append(('KNC' + str(k), knn))
                                                          


# import numpy as np
# from yellowbrick.model_selection import ValidationCurve

# pr = np.linspace(1, 5, num=5, endpoint=True)
# print(np.linspace(1, 5, num=5, endpoint=True))
# model = ValidationCurve(KNeighborsClassifier(), param_name="n_neighbors", param_range=pr)
# model.fit(X_train, Y_train)
# model.poof()


### Initial Evaluation of Each Model in Turn

In [None]:
# Cross-validate every candidate model on the training split, collecting the
# per-fold scores (results) and the matching model names (names) for the
# comparison plot.
results = []
names = []

# 12 consecutive (unshuffled) folds, shared by all models.
# random_state is deliberately not passed: with shuffle left at its default
# (False) it never influenced the splits, and scikit-learn >= 0.24 raises a
# ValueError when random_state is set without shuffle=True.
kfold = model_selection.KFold(n_splits=12)

for name, model in models:
    start_time = time.time()
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report mean and standard deviation of the fold scores, then the time
    # this model took.
    msg = '%s: %f (%f)'% (name, cv_results.mean(), cv_results.std())
    print(msg)
    print('--- %s seconds ---' % (time.time() - start_time))

### Compare Algorithms

In [None]:
# Box plot of the per-fold cross-validation scores, one box per model.
fig, ax = plt.subplots(figsize=(10, 8))
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

### Applying the Visualization to the Models

In [None]:
from yellowbrick.classifier import ClassificationReport
# Legend labels must follow the sorted order of the target values.
# NOTE(review): assumes is_top40 is encoded 0 = no hit, 1 = Top 40, so
# "NoHit" has to come first — confirm against the table. The previous order
# attached each label to the other class's scores under that encoding.
classes = ["NoHit", "IsTop40"]


def plotting(name, model):
    """Fit one model, print its validation metrics and draw its report.

    The model is fitted on the training split; its accuracy and confusion
    matrix on the validation split are printed, followed by a yellowbrick
    ClassificationReport (with support counts) scored on the validation split.

    Args:
        name: Label printed before the metrics.
        model: Estimator to fit and evaluate; it is fitted in place.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    print(name)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))

    visualizer = ClassificationReport(model, classes=classes, support=True)
    visualizer.fit(X_train, Y_train)  # Fit the visualizer and the model
    visualizer.score(X_validation, Y_validation)  # Evaluate the model on the test data
    visualizer.poof()  # Draw/show/poof the data


for name, model in models:
    plotting(name, model)

In [None]:
from sklearn.linear_model import Lasso
from yellowbrick.classifier import ClassPredictionError

def vizerror(name, model):
    """Draw yellowbrick's class prediction error chart for one model.

    The visualizer is fitted on the training split and scored on the
    validation split. ``name`` is accepted so the function can be called with
    the (name, model) pairs from ``models``; it is not used in the body.
    """
    error_chart = ClassPredictionError(model, classes=classes)
    error_chart.fit(X_train, Y_train)
    error_chart.score(X_validation, Y_validation)
    error_chart.poof()


# One chart per candidate model.
for name, model in models:
    vizerror(name, model)