# <center> GeNNus </center>
### <center> Logistic Regression on Waveforms </center>

This notebook contains the code for step 1, in which we experimented with Logistic Regression on waveforms.

See the project report and the presentation for a deeper discussion of this stage.

In [None]:
import torch
import tqdm
import seaborn as sns
import matplotlib.pyplot  as plt
import os
import numpy as np
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import random 
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [None]:
# Audio format of the pre-processed clips.
DATASET_NUM_CHANNELS = 1
DATASET_NUM_SAMPLES_PER_SECOND = 8000

# Which representation and which FMA subset to load.
DATASET_TYPE = "waveform"
DATASET_SIZE = "xs"

# Everything below is derived from the four settings above.
DATASET_FOLDER = f"./data/{DATASET_TYPE}"
DATASET_NAME = (
    f"fma_{DATASET_SIZE}"
    f"_resampled_{DATASET_NUM_SAMPLES_PER_SECOND}"
    f"_rechanneled_{DATASET_NUM_CHANNELS}"
)

# Directory that is walked to collect the saved tensors.
dataset_path = f"{DATASET_FOLDER}/{DATASET_NAME}"

In [None]:
# Collect the path of every file below `dataset_path`, echoing each path as it
# is visited and leaving out macOS '.DS_Store' entries.
audio_path_list = []

for root, _subdirs, filenames in os.walk(dataset_path):
    for filename in filenames:
        candidate = os.path.join(root, filename)
        print(candidate)

        if filename == '.DS_Store':
            continue
        audio_path_list.append(candidate)

# Reverse lexicographic order gives a deterministic file ordering.
audio_path_list.sort(reverse=True)

In [None]:
# Load every saved tensor and use the name of the directory containing the
# file as its label (assumes a <dataset>/<genre>/<file> layout — TODO confirm).
single_tensors = []
labels = []
for p in audio_path_list:
    single_tensors.append(torch.load(p))
    # The paths were assembled with os.path.join, so take the parent directory
    # through os.path instead of splitting on "/": on a platform whose
    # separator is not "/" the split would return the wrong path component.
    labels.append(os.path.basename(os.path.dirname(p)))


# torch.cat joins along dim 0. The labels are later attached one per row, so
# each loaded tensor presumably contributes exactly one row — TODO confirm.
stacked_single_tensors = torch.cat(single_tensors).numpy()

In [None]:
# Wrap the stacked samples in a DataFrame and put the labels in front as
# column 0, named "Label".
data = pd.DataFrame(data=stacked_single_tensors)
data.insert(loc=0, column="Label", value=labels, allow_duplicates=True)

In [None]:
def train_test_split( dataset , perc_train , set_seed = 69):
    """Randomly split a DataFrame into a training part and a test part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to split.
    perc_train : float
        Fraction of the rows sampled into the training part.
    set_seed : int
        Seed passed to ``DataFrame.sample`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(train, test)``: the sampled rows, and the remaining rows in their
        original order.
    """
    training_rows = dataset.sample(frac=perc_train, random_state=set_seed)
    # Whatever was not sampled (matched by index label) becomes the test part.
    held_out_rows = dataset.drop(index=training_rows.index)
    return training_rows, held_out_rows
# Keep 85% of the rows for training and hold out the rest for testing; the
# split uses the function's default seed.
train_dataset, test_dataset = train_test_split(dataset=data, perc_train=0.85)

In [None]:
X = train_dataset.iloc[:, 1:]       # feature columns (everything after the label)
# 1-D label vector: scikit-learn estimators expect y of shape (n_samples,);
# the previous one-column DataFrame raised a DataConversionWarning on each fit.
y = train_dataset.iloc[:, 0]

std_slc = StandardScaler()          # standardise every feature

pca = decomposition.PCA()           # n_components is chosen by the grid search

# Logistic regression. The grid below tries both 'l1' and 'l2' penalties, and
# the default 'lbfgs' solver only supports 'l2', so every 'l1' candidate used
# to fail during the search. 'saga' supports both penalties.
# NOTE(review): saga may need a larger max_iter to converge — confirm.
logistic_Reg = linear_model.LogisticRegression(solver='saga')

# Scale -> project -> classify, tuned as one estimator.
pipe = Pipeline(steps=[('std_slc', std_slc),
                ('pca', pca),
                ('logistic_Reg', logistic_Reg)])


# Candidate numbers of principal components: from 1% up to (but excluding) 25%
# of the number of training rows. The lower bound is clamped to 1 because
# int(.01 * n) is 0 when there are fewer than 100 rows.
# We tried different combinations and found that a few principal components
# were enough, so only the first 25% of the possible values are tested.
n_components = list(range(max(1, int(.01 * X.shape[0])), int(.25 * X.shape[0]), 1))

C = [2., 20.]      # candidate values for the regularisation parameter C

penalty = ['l1', 'l2']    # candidate penalties

# Parameter grid, keyed as <pipeline step name>__<parameter name>.
parameters = dict(pca__n_components=n_components,
                    logistic_Reg__C=C,
                    logistic_Reg__penalty=penalty)


In [None]:
# Exhaustive search over the parameter grid with 3-fold cross-validation,
# two parallel jobs and verbose progress output.
clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=3, n_jobs=2, verbose=3)
clf.fit(X, y)

In [None]:
# Report the hyper-parameters of the best pipeline found by the search.
best_settings = clf.best_estimator_.get_params()
print('Best Penalty:', best_settings['logistic_Reg__penalty'])
print('Best C:', best_settings['logistic_Reg__C'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['logistic_Reg'])

In [None]:
# WE DON'T KNOW WHY THIS IS HERE
#CV_result = cross_val_score(clf , X , y , cv = 3 , n_jobs=2, verbose=3)

## Run the Logistic Regression with the best hyperparameters

In [None]:
X_train = train_dataset.iloc[:, 1:]      # features of the training set
y_train = train_dataset.iloc[:, :1]      # labels of the training set (one-column frame)

X_test = test_dataset.iloc[:, 1:]        # features of the test set
y_test = test_dataset.iloc[:, :1]        # labels of the test set (one-column frame)

# TODO: take these values from the grid-search result instead of hard-coding them.

pca = PCA(n_components=11)               # PCA with the chosen number of components
# Fit the projection on the training data only ...
X_train_pca = pd.DataFrame(pca.fit_transform(X_train))
# ... and reuse that same projection for the test data. Calling fit_transform
# here would re-fit the PCA on the test split, so the test features would be
# expressed in a different basis than the one the classifier was trained on.
X_test_pca = pd.DataFrame(pca.transform(X_test))

### RUN THE LOGISTIC REGRESSION

logistic = LogisticRegression(penalty='l2', C=2)
# fit expects a 1-D label vector; flatten the one-column frame.
logistic.fit(X_train_pca, y_train.values.ravel())

### TEST THE MODEL
LogisticPredictions = logistic.predict(X_test_pca)

In [None]:
# Mean accuracy on the projected test set, rendered as a whole-number
# percentage; the name ends up bound to the formatted string.
Logistic_Accuracy = f"{logistic.score(X_test_pca, y_test): .0%}"
print(f"The accuracy of the logistic model is:{Logistic_Accuracy}")

In [None]:
### CONFUSION MATRIX
# Rows are the actual labels, columns the predicted ones; each cell shows its count.
confusion = metrics.confusion_matrix(y_test, LogisticPredictions)
confusion_fig, confusion_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(confusion, annot=True, fmt='d', cmap="viridis", ax=confusion_ax)
confusion_ax.set_xlabel("Recognized Genres")
confusion_ax.set_ylabel("Actual Genre")
plt.show()

In [None]:

 
# Compare the dimensionality of the training data before and after projection.
# `pca_features` was never defined in the visible cells (NameError); the
# PCA-projected training data is held in `X_train_pca`.
print('Shape before PCA: ', X.shape)
print('Shape after PCA: ', X_train_pca.shape)

In [None]:
import matplotlib.pyplot as plt 

# Plot the cumulative explained-variance ratio of the fitted `pca`.
plt.rcParams["figure.figsize"] = (12,8)

fig, ax = plt.subplots()
# Stored under its own name so the label vector `y` is not overwritten.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# One x position per fitted component. The axis used to be hard-coded to 60
# points, which makes plt.plot fail whenever the PCA has a different number of
# components.
xi = np.arange(1, len(cumulative_variance) + 1, step=1)

plt.ylim(0.0,1.1)
plt.plot(xi, cumulative_variance, marker='o', linestyle='-', color='red')

plt.xlabel('Number of Components')
plt.xticks(xi, rotation = 90) 
plt.ylabel('Cumulative variance (%)')
plt.title('The number of components needed to explain variance')

# Horizontal reference line at 70% cumulative variance.
plt.axhline(y=0.7, color='darkgreen', linestyle='--')
plt.text(1.1, 1, '70% cut-off threshold', color = 'darkgreen', fontsize=16)

ax.grid()
plt.tight_layout()
plt.show()

In [None]:
# Display the feature columns of the test split (every column after column 0).
test_dataset.iloc[: , 1:]

In [None]:
# Fit a 7-component PCA on the training features (column 0 excluded) and keep
# the projected training data. This rebinds the name `pca`.
pca = PCA(n_components= 7  )
dataset_pca = pca.fit_transform(train_dataset.iloc[: , 1:])


In [None]:
# First column of the training split, kept as a one-column DataFrame.
label = train_dataset.iloc[ : , :1 ]

In [None]:
# One-hot encode the label found in the first field of every row of `label`.
labels_train = [_label_from_str_to_one_hot(row[0]) for row in label.values]
    

In [None]:
# Wrap the projected training data in a DataFrame (rebinds `dataset_pca`).
dataset_pca = pd.DataFrame(dataset_pca)

In [None]:
# NOTE(review): `pca_test` is created but not used in the visible cells; it is
# kept only so that any code referencing the name still finds it.
pca_test = PCA(n_components= 52 )
# Project the test features with the PCA that is already fitted (`pca`).
# The previous `pca.fit_transform(...)` re-fitted `pca` on the test split, so
# the test data ended up in a different basis than the training data.
test_dataset_pca = pca.transform(test_dataset.iloc[: , 1:])

In [None]:
# Indicator (dummy) columns built from the first column of the test split.
lab_test = pd.get_dummies(test_dataset.iloc[ : , :1 ])

In [None]:
# Display the current value of `lab_test`.
lab_test

In [None]:
# Display the predictions of `logistic` for the projected test rows.
# NOTE(review): when the cells run top to bottom, `logistic` was fitted on an
# 11-component projection while `test_dataset_pca` comes from the 7-component
# `pca` — confirm that the feature counts match before relying on this.
logistic.predict(test_dataset_pca)


In [None]:
def _label_from_str_to_one_hot(label_str: str): 

    if label_str == "Pop":
        return [1, 0, 0, 0, 0, 0]
    
    if label_str == "Hip-Hop":
        return [0, 1, 0, 0, 0, 0]
    
    if label_str == "Electronic":
        return [0, 0, 1, 0, 0, 0]
    
    if label_str == "Rock":
        return [0, 0, 0, 1, 0, 0]

    if label_str == "Folk":
        return [0, 0, 0, 0, 1, 0]

    if label_str == "Jazz":
        return [0, 0, 0, 0, 0, 1]

In [None]:
# Display `results`.
# NOTE(review): in file order `results` is only assigned in a later cell.
results 

    

In [None]:

# Predict the test genres and one-hot encode the predictions.
LogisticPredictions = logistic.predict( test_dataset_pca )
results = [_label_from_str_to_one_hot(x) for x in LogisticPredictions]

# `label_test` was not defined in the visible cells; take the true labels from
# the first column of the test split, the same way `label` is taken from the
# training split.
label_test = test_dataset.iloc[:, :1]
lab_test = [_label_from_str_to_one_hot(x[0]) for x in label_test.values]


# Fraction of test rows whose predicted one-hot vector equals the true one.
# `logistic.score(X, y)` expects a feature matrix as its first argument and
# would try to predict from the one-hot vectors, so it cannot be used to
# compare two label lists.
Logistic_Accuracy = float(np.mean(
    [predicted == actual for predicted, actual in zip(results, lab_test)]
))

In [None]:
# Display the current predictions.
LogisticPredictions

In [None]:
# `label_test` was not defined in the visible cells; take the true labels from
# the first column of the test split.
label_test = test_dataset.iloc[:, :1]
# Flatten the one-column frame into a plain list of label values.
lab_test = [row[0] for row in label_test.values]

In [None]:
# Display the current value of `lab_test`.
lab_test

In [None]:
# Display the current value of `Logistic_Accuracy`.
Logistic_Accuracy

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true test labels against the predictions.
# `Y_test` is not defined in the visible cells (NameError); the test labels
# are held in `y_test`.
confusion_matrix( y_test , LogisticPredictions , labels = None, sample_weight=None, normalize=None)

In [None]:
from sklearn.model_selection import KFold
# 4-fold splitter; the last line displays the number of splits it reports.
key_fold = KFold( n_splits=4 ) 
key_fold.get_n_splits(X_train)

In [None]:
import pandas as pd
# Rebuild a feature frame straight from the stacked tensors (no label column).
df = pd.DataFrame(stacked_single_tensors)
# NOTE(review): `df` has no label column, so `:-1` drops the last feature
# column — confirm this is intended.
X = df.iloc[:,:-1]
# Labels as a plain Python list.
y = labels


In [None]:
#Importing required libraries
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import utils
from tqdm import tqdm

# `set.seed(69)` is R syntax: Python's built-in `set` has no `seed` attribute,
# so the call raised AttributeError. Seed NumPy's global generator instead.
np.random.seed(69)


# NOTE(review): `df` has no label column, so `:-1` drops the last feature
# column — confirm this is intended.
X = df.iloc[:,:-1]

# Labels as a 1-D Series, so that `fit` receives y of shape (n_samples,).
y = pd.Series(labels)

# Manual k-fold cross-validation of a default logistic regression on the raw features.
k = 3
# Fixed random_state so the shuffled folds are the same on every run.
kf = KFold(n_splits=k, random_state=69, shuffle=True)
model = LogisticRegression()

acc_score = []

# kf.split returns a generator, so tqdm needs the total to show progress.
for train_index, test_index in tqdm(kf.split(X), total=k):

    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # accuracy_score takes (y_true, y_pred).
    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

# Mean accuracy over the k folds.
avg_acc_score = sum(acc_score) / k
