# Bach Chorales Project 
-----------------------

In [1]:
# Import Corner for Libraries ::In Alphabetical Order::
import pandas as pds
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

## Loading and Viewing Bach CSV

In [2]:
# Load and Show Bach Data Provided by Professor
chorus=pds.read_csv("https://raw.githubusercontent.com/AgentTailhawk/PotatoVerse1_ml-class/master/Student%20Responses/projects/proj_data/bach.csv")
chorus

Unnamed: 0,choral_ID,event_number,C,C#,D,D#,E,F,F#,G,G#,A,A#,B,bass,meter,chord_label
0,000106b_,1,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,NO,NO,F,3,F_M
1,000106b_,2,YES,NO,NO,NO,YES,NO,NO,YES,NO,NO,NO,NO,E,5,C_M
2,000106b_,3,YES,NO,NO,NO,YES,NO,NO,YES,NO,NO,NO,NO,E,2,C_M
3,000106b_,4,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,NO,NO,F,3,F_M
4,000106b_,5,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,NO,NO,F,2,F_M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5660,015505b_,105,NO,NO,YES,NO,NO,NO,NO,YES,NO,NO,YES,NO,G,4,G_m
5661,015505b_,106,NO,NO,YES,NO,NO,NO,NO,YES,NO,YES,NO,NO,G,3,G_m
5662,015505b_,107,YES,NO,NO,NO,YES,NO,NO,YES,NO,NO,NO,NO,C,5,C_M
5663,015505b_,108,YES,NO,NO,NO,YES,NO,NO,YES,NO,NO,YES,NO,C,3,C_M


## Forming the Training and Testing Datasets

In [3]:
# Identify Column Names
colNames=list(chorus.columns.values.tolist())
colNames

['choral_ID',
 'event_number',
 'C',
 'C#',
 'D',
 'D#',
 'E',
 'F',
 'F#',
 'G',
 'G#',
 'A',
 'A#',
 'B',
 'bass',
 'meter',
 'chord_label']

Based on the Chorus Table only the Twelve Western Scale Notes, Bass, and Meter will be needed to determine the correct chord. Therefore, the rest of the columns are deemed unnecessary.

In [4]:
# Remove Unnecessary Column Names
colNames.remove('choral_ID')
colNames.remove('event_number')
colNames.remove('chord_label')
colNames

['C',
 'C#',
 'D',
 'D#',
 'E',
 'F',
 'F#',
 'G',
 'G#',
 'A',
 'A#',
 'B',
 'bass',
 'meter']

In addition, some chords have only one instance causing issues when splitting data into test and training data. Therefore, those chords will be removed to prevent such issues.

In [5]:
# Viewing Chords with only 1 instance
minChords = chorus['chord_label'].value_counts(ascending=True)[:8].index.to_list()
chorus['chord_label'].value_counts(ascending=True)[:8]

G#M     1
F#d7    1
F_d7    1
Abd     1
Ebd     1
D#d6    1
EbM7    1
DbM7    1
Name: chord_label, dtype: int64

In [6]:
# Finding and Removing indices of Chords with only 1 Instance
for i in minChords:
  chorus.drop(chorus.loc[chorus['chord_label'] == i].index[0], inplace=True)
# Reset Index
chorus = chorus.reset_index()

In [7]:
# Double Check Chords are Gone
chorus['chord_label'].value_counts(ascending=True)[:8]

Dbd     2
C_d6    2
A_M6    2
C_d7    2
Dbd7    2
Abm     2
A_m4    2
B_m6    2
Name: chord_label, dtype: int64

In [8]:
# Seperate Needed Columns and Data into Features and Labels
c_Features=chorus[colNames]
c_Labels=chorus['chord_label']

In [9]:
# View Features
c_Features

Unnamed: 0,C,C#,D,D#,E,F,F#,G,G#,A,A#,B,bass,meter
0,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,NO,NO,F,3
1,YES,NO,NO,NO,YES,NO,NO,YES,NO,NO,NO,NO,E,5
2,YES,NO,NO,NO,YES,NO,NO,YES,NO,NO,NO,NO,E,2
3,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,NO,NO,F,3
4,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,NO,NO,F,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5652,NO,NO,YES,NO,NO,NO,NO,YES,NO,NO,YES,NO,G,4
5653,NO,NO,YES,NO,NO,NO,NO,YES,NO,YES,NO,NO,G,3
5654,YES,NO,NO,NO,YES,NO,NO,YES,NO,NO,NO,NO,C,5
5655,YES,NO,NO,NO,YES,NO,NO,YES,NO,NO,YES,NO,C,3


In [10]:
# View Labels
c_Labels

0       F_M
1       C_M
2       C_M
3       F_M
4       F_M
       ... 
5652    G_m
5653    G_m
5654    C_M
5655    C_M
5656    F_M
Name: chord_label, Length: 5657, dtype: object

In [11]:
# One Hot Encode Features
abzu = LabelEncoder()
# - Manually Encode Notes
for i in colNames[0:12]:
  c_Features[i] = c_Features[i].apply(lambda x: 1 if x == 'YES' else 0)
# - Auto Encode Bass
c_Features[colNames[12]] = abzu.fit_transform(c_Features[colNames[12]])
# View Features
c_Features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_Features[i] = c_Features[i].apply(lambda x: 1 if x == 'YES' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_Features[colNames[12]] = abzu.fit_transform(c_Features[colNames[12]])


Unnamed: 0,C,C#,D,D#,E,F,F#,G,G#,A,A#,B,bass,meter
0,1,0,0,0,0,1,0,0,0,1,0,0,12,3
1,1,0,0,0,1,0,0,1,0,0,0,0,10,5
2,1,0,0,0,1,0,0,1,0,0,0,0,10,2
3,1,0,0,0,0,1,0,0,0,1,0,0,12,3
4,1,0,0,0,0,1,0,0,0,1,0,0,12,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5652,0,0,1,0,0,0,0,1,0,0,1,0,14,4
5653,0,0,1,0,0,0,0,1,0,1,0,0,14,3
5654,1,0,0,0,1,0,0,1,0,0,0,0,5,5
5655,1,0,0,0,1,0,0,1,0,0,1,0,5,3


In [12]:
# One Hot Encode Labels
flyff=LabelEncoder()
c_Labels=flyff.fit_transform(c_Labels)
# View New Labels
c_Labels

array([75, 34, 34, ..., 34, 34, 75])

To accomdate for chords that only have 2 instances, stratification occurs during train_test_split.

In [13]:
# Seperate into Training and Testing
c_F_Train, c_F_Test, c_L_Train, c_L_Test = train_test_split(c_Features, c_Labels, test_size=0.2, random_state=42, stratify=c_Labels)

## Finding the Right Classifier

For this project, I'll be using 4 Different Approaches to determine which classifier works best for this dataset:
* DecisionTreeClassifier()
  * 0: Gini Criteria
  * 1: Entropy Criteria
* 2: KNNClassifier()
* BaggingClassifier()
  * 3: Bagging[Bootstrap=True]
  * 4: Pasting[Bootstrap=False]
* 5: XGBoostClassifier()


In [14]:
# Array to Hold Accuracy Values
accArray = []

### Decision Tree Classifier

In [15]:
# Range of Hyper Paramters
hyperparam_grid = [
    {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 
     'min_samples_split': [2, 3, 4, 5, 6, 7, 8 ,9 ,10]}
  ]

#### Gini Criterion

In [16]:
# Create
clf=tree.DecisionTreeClassifier()
grid_search = GridSearchCV(clf, hyperparam_grid, cv=2)

In [17]:
# Fit
%%time
grid_search.fit(c_F_Train,c_L_Train)

CPU times: user 2.07 s, sys: 11.6 ms, total: 2.08 s
Wall time: 2.35 s


In [18]:
# Best Fit
pine=grid_search.best_params_
pine

{'max_depth': 12, 'min_samples_split': 2}

In [19]:
# Prediction and Accuracy
predict=grid_search.best_estimator_.predict(c_F_Test)
acc = accuracy_score(c_L_Test, predict)
print("Accuracy: %f Percent" % (acc*100))

Accuracy: 72.526502 Percent


In [20]:
# Add to Accuracy List
accArray.append(acc)

#### Entropy Criterion

In [21]:
# Create
clf2=tree.DecisionTreeClassifier(criterion='entropy')
grid_search = GridSearchCV(clf2, hyperparam_grid, cv=2)

In [22]:
# Fit
%%time
grid_search.fit(c_F_Train,c_L_Train)

CPU times: user 2.86 s, sys: 56.4 ms, total: 2.92 s
Wall time: 5.14 s


In [23]:
# Best Fit
grid_search.best_params_

{'max_depth': 12, 'min_samples_split': 2}

In [24]:
# Prediction and Accuracy
predict = grid_search.best_estimator_.predict(c_F_Test)
acc = accuracy_score(c_L_Test, predict)
print("Accuracy: %f Percent" % (acc*100))

Accuracy: 72.703180 Percent


In [25]:
# Add to Accuracy List
accArray.append(acc)

### K Nearest Neighbor(s) Classifier

In [26]:
# Range of Hyper Paramters
hyperparam_grid = [
    {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
  ]

In [27]:
# Create
knn=KNeighborsClassifier()
grid_search = GridSearchCV(knn, hyperparam_grid, cv=2)

In [28]:
# Fit
%%time
grid_search.fit(c_F_Train,c_L_Train)

CPU times: user 2.16 s, sys: 18.4 ms, total: 2.18 s
Wall time: 2.99 s


In [29]:
# Best Fit
grid_search.best_params_

{'n_neighbors': 4}

In [30]:
# Prediction and Accuracy
predict=grid_search.best_estimator_.predict(c_F_Test)
acc = accuracy_score(c_L_Test, predict)
print("Accuracy: %f Percent" % (acc*100))

Accuracy: 70.229682 Percent


In [31]:
# Add to Accuracy List
accArray.append(acc)

### Bagging and Pasting Classifier

Uses Decision Tree with Criterion Entropy and it's best paramaters since it has the highest accuracy most of the time.

In [32]:
# Range of Hyper Paramters
hyperparam_grid = [
    {'n_estimators': [50, 100, 125, 150, 175, 200],
     'max_samples': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 
     'max_features': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
  ]

####Bagging
Takes ~8 Mins

In [33]:
# Create
clf3 = tree.DecisionTreeClassifier(max_depth=pine['max_depth'], min_samples_split=pine['min_samples_split'])
bgc = BaggingClassifier(clf3, bootstrap=True)
grid_search = GridSearchCV(bgc, hyperparam_grid, cv=2, n_jobs=-1)

In [34]:
# Fit
%%time
grid_search.fit(c_F_Train,c_L_Train)

CPU times: user 5.3 s, sys: 627 ms, total: 5.92 s
Wall time: 7min 54s


In [35]:
# Best Fit
grid_search.best_params_

{'max_features': 0.9, 'max_samples': 0.3, 'n_estimators': 150}

In [36]:
# Prediction and Accuracy
predict=grid_search.best_estimator_.predict(c_F_Test)
acc = accuracy_score(c_L_Test, predict)
print("Accuracy: %f Percent" % (acc*100))

Accuracy: 75.088339 Percent


In [37]:
# Add to Accuracy List
accArray.append(acc)

#### Pasting
Takes ~8 Mins

In [38]:
# Create
clf4 = tree.DecisionTreeClassifier(max_depth=pine['max_depth'], min_samples_split=pine['min_samples_split'])
pgc = BaggingClassifier(clf4, bootstrap=False)
grid_search = GridSearchCV(pgc, hyperparam_grid, cv=2, n_jobs=-1)

In [39]:
# Fit
%%time
grid_search.fit(c_F_Train,c_L_Train)

CPU times: user 5.26 s, sys: 598 ms, total: 5.86 s
Wall time: 8min 13s


In [40]:
# Best Fit
grid_search.best_params_

{'max_features': 0.8, 'max_samples': 0.4, 'n_estimators': 100}

In [41]:
# Prediction and Accuracy
predict=grid_search.best_estimator_.predict(c_F_Test)
acc = accuracy_score(c_L_Test, predict)
print("Accuracy: %f Percent" % (acc*100))

Accuracy: 74.823322 Percent


In [42]:
# Add to Accuracy List
accArray.append(acc)

###XGBoost Classifier

In [43]:
# Range of Hyper Paramters
hyperparam_grid = [
    {'n_estimators': [10, 15, 25, 35, 45, 50, 75, 100, 125, 150, 175, 200],
     'max_depth': [1, 2, 3, 4, 5, 6, 7, 8]}
  ]

In [44]:
# Create Classifier
model = XGBClassifier(tree_method='gpu_hist', predictor='gpu_predictor')
model

In [45]:
# Find Best Parameters
param_comb = 5
folds=2
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
random_search = RandomizedSearchCV(model, param_distributions=hyperparam_grid, n_iter=param_comb,  n_jobs=-1, cv=skf.split(c_F_Train,c_L_Train), verbose=3)

In [46]:
# Fit
%%time 
grid_result = random_search.fit(c_F_Train, c_L_Train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits




CPU times: user 4.32 s, sys: 388 ms, total: 4.71 s
Wall time: 1min 29s


In [47]:
# Best Parameter
random_search.best_params_

{'n_estimators': 50, 'max_depth': 5}

In [48]:
# Prediction and Accuracy
predictions = random_search.best_estimator_.predict(c_F_Test)
acc = accuracy_score(c_L_Test, predictions)
print("Accuracy: %f Percent" % (acc*100))

Accuracy: 73.851590 Percent


In [49]:
# Add to Accuracy List
accArray.append(acc)

# Best Classifier

In [53]:
for index, i in enumerate(accArray):
  print(f"{index}: {i}")

0: 0.7252650176678446
1: 0.7270318021201413
2: 0.7022968197879859
3: 0.7508833922261484
4: 0.7482332155477032
5: 0.7385159010600707


In [50]:
# Determines which classifier had the best accuracy
# j stores highest accuracy
j = accArray[0]

# cl stores index of highest accuracy
cl = 0

# Checks if any value of i is greater than j value
# Then saves higher value to j and current index to cl
for index, i in enumerate(accArray):
  if i > j:
    j = i
    cl = index

# Outputs Best Classifier and Accuracy based on Index
if cl == 0:
  print("DecisionTree(Gini) Accuracy: %f Percent" % (j*100))
elif cl == 1:
  print("DecisionTree(Entropy) Accuracy: %f Percent" % (j*100))
elif cl == 2:
  print("KNNeighbors() Accuracy: %f Percent" % (j*100))
elif cl == 3:
  print("BaggingClassifier(Bootstrap=True) Accuracy: %f Percent" % (j*100))
elif cl == 4:
  print("BaggingClassifier(Bootstrap=False) Accuracy: %f Percent" % (j*100))
elif cl == 5:
  print("XGBClassifier() Accuracy: %f Percent" % (j*100))

BaggingClassifier(Bootstrap=True) Accuracy: 75.088339 Percent
