-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifier_functions.py
168 lines (141 loc) · 5.94 KB
/
classifier_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import roc_curve, auc, plot_roc_curve
def clf_fit(clfs, names, xtrain, xtest, ytrain, ytest):
    '''
    Fits each classifier in turn and tabulates its train and test scores.

    Parameters:
    clfs - A list of classifiers; each one is fit in place on the train data.
    names - A list of display names, matched to clfs by position.
    xtrain - Train features.
    xtest - Test features.
    ytrain - Train labels.
    ytest - Test labels.

    Returns:
    A dataframe with one row per classifier and the columns 'classifier',
    'train_score' and 'test_score' (the values returned by clf.score).
    '''
    columns = ['classifier', 'train_score', 'test_score']
    rows = []
    for position, model in enumerate(clfs):
        # Resolve the name first so a too-short names list fails before fitting.
        label = names[position]
        model.fit(xtrain, ytrain)
        rows.append({
            columns[0]: label,
            columns[1]: model.score(xtrain, ytrain),
            columns[2]: model.score(xtest, ytest),
        })
    return pd.DataFrame(rows, columns=columns)
def grid_fit(clf, params, xtrain, ytrain, scoring='accuracy', cv=3, verbose=0):
    '''
    Fits a GridSearchCV to a classifier and prints the best parameters and best score.

    Parameters:
    clf - A classifier instance.
    params - A dictionary containing the parameter grid to search.
    xtrain - A dataframe of train data.
    ytrain - An array of label terms for train data.
    scoring - Scoring passed to GridSearchCV (default 'accuracy').
    cv - Cross-validation setting passed to GridSearchCV (default 3).
    verbose - Verbosity passed to GridSearchCV (default 0).

    Returns:
    gridsearch - A fit GridSearchCV instance.
    '''
    gridsearch = GridSearchCV(
        clf,
        params,
        scoring=scoring,
        cv=cv,
        verbose=verbose,
    )
    gridsearch.fit(xtrain, ytrain)

    # best_score_ is the mean cross-validated value of `scoring` for best_params_.
    print(f"Best params: {gridsearch.best_params_}")
    print(f"Best score: {gridsearch.best_score_}")
    return gridsearch
def classifier_results(clf, xtrain, xtest, ytrain, ytest, train_preds, test_preds):
    '''
    Prints and plots diagnostics for a classifier.

    Output, in order: classification reports for train and test, the train
    and test AUC, normalized confusion-matrix plots for train and test, and a
    bar chart of the four test confusion-matrix cells.

    Parameters:
    clf - A classifier handed to plot_confusion_matrix (presumably already
          fit; confirm with the caller).
    xtrain - A dataframe of training data.
    xtest - A dataframe of testing data.
    ytrain - An array of label terms for training data.
    ytest - An array of label terms for testing data.
    train_preds - Predictions for the training data.
    test_preds - Predictions for the testing data.

    Returns:
    auc_train - Area under the ROC curve for train data, rounded to 3 places.
    auc_test - Area under the ROC curve for test data, rounded to 3 places.
    '''
    print('Classification Report - Training')
    print(classification_report(ytrain, train_preds))
    print('Classification Report - Testing')
    print(classification_report(ytest, test_preds))

    # roc_curve's signature is (y_true, y_score): true labels must come first.
    # Passing them the other way round scores the labels against the
    # predictions and yields a different, meaningless AUC.
    fpr, tpr, _ = roc_curve(ytrain, train_preds)
    auc_train = round(auc(fpr, tpr), 3)
    fpr, tpr, _ = roc_curve(ytest, test_preds)
    auc_test = round(auc(fpr, tpr), 3)
    print(f'Training AUC: {auc_train}, Testing AUC: {auc_test}')

    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
    # (ConfusionMatrixDisplay.from_estimator replaces it); kept here because
    # the file-level import still targets the older API.
    print('Confusion Matrix - Training')
    plot_confusion_matrix(clf, xtrain, ytrain, normalize='true')
    plt.show()
    print('Confusion Matrix - Testing')
    plot_confusion_matrix(clf, xtest, ytest, normalize='true')
    plt.show()

    # Assumes a binary problem whose first (lowest-sorting) label is "flop",
    # so row/column 0 is flops and row/column 1 is hits - TODO confirm.
    cm = confusion_matrix(ytest, test_preds)
    counts = [cm[0, 0], cm[0, 1], cm[1, 1], cm[1, 0]]
    labels = ['Identified Flops', 'Misidentified Flops', 'Identified Hits', 'Misidentified Hits']
    plt.figure(figsize=(10, 6))
    plt.title('Test Results')
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.barplot(x=labels, y=counts, palette=['#d01c8b', '#f1b6da', '#4dac26', '#b8e186'])
    plt.show()
    return auc_train, auc_test
def _align_to_model(model, features):
    '''Returns features with columns in the model's fitted order, when that order is known.'''
    fitted_names = getattr(model, 'feature_names_in_', None)
    if fitted_names is None:
        return features
    fitted_names = list(fitted_names)
    same_names = (
        len(fitted_names) == len(features.columns)
        and set(fitted_names) == set(features.columns)
    )
    # Only reorder when the names match exactly; otherwise leave the frame
    # untouched and let the model report the mismatch itself.
    return features[fitted_names] if same_names else features


def _print_predictions(model, features, df_stats, positions):
    '''Prints "<track> by <artist>: hit|flop" for every selected song.'''
    predictions = model.predict(_align_to_model(model, features))
    for position, result in zip(positions, predictions):
        row = df_stats.iloc[position]
        verdict = 'hit' if result == 1 else 'flop'
        print(f"{row['track']} by {row['artist']}: {verdict}")


def demo(song_list, df_stats, df_data, scaler, svm_model, forest_model):
    '''
    Tests a set of songs to see if they are hits or flops.

    Parameters:
    song_list - List of song names to be tested.
    df_stats - dataframe with 'track' and 'artist' columns to pull song names from.
    df_data - dataframe of processed data to pull from; rows are selected by
              position, so it is assumed to be row-aligned with df_stats
              (TODO confirm with caller).
    scaler - fit data scaler to transform test songs.
    svm_model - model to be tested.
    forest_model - model to be tested.

    Returns:
    The feature dataframe built for the test songs. The name of each test
    song and whether it is a hit or flop is printed for both models.
    '''
    # reset index to account for dropped tracks
    df_stats = df_stats.reset_index()

    # Retrieve positional index numbers from df_stats, then retrieve the
    # cleaned data for those rows from df_data.
    idx = []
    for song in song_list:
        idx.extend(df_stats.index[df_stats.track == song].tolist())
    songs = df_data.iloc[idx]

    # One-hot encode time_signature, keep mode as is, scale everything else.
    dummies = pd.get_dummies(songs['time_signature'], prefix='time_signature')
    cat = pd.concat([dummies, songs['mode']], axis=1)
    cont_var = songs.drop(['time_signature', 'mode'], axis=1)
    scaled = pd.DataFrame(
        scaler.transform(cont_var),
        index=cont_var.index,
        columns=cont_var.columns,
    )
    sample_test = pd.concat([scaled, cat], axis=1)

    # A small sample may not contain every time signature; add the missing
    # dummy columns as all-zero so the expected feature set is present.
    for signature in (1, 3, 4, 5):
        column = f'time_signature_{signature}'
        if column not in sample_test.columns:
            sample_test[column] = 0
    if 'time_signature_2' in sample_test.columns:
        sample_test = sample_test.drop(['time_signature_2'], axis=1)

    print('SVM Results:')
    _print_predictions(svm_model, sample_test, df_stats, idx)
    print('\n')
    print("Random Forest Results:")
    _print_predictions(forest_model, sample_test, df_stats, idx)
    return sample_test