In Train_Arabic_Digit.txt there are 660 blocks for each spoken digit .The first 330 blocks
represent male speakers and the second 330 blocks represent the female speakers. Blocks 1-660
represent the spoken digit "0" (10 utterances of /0/ from 66 speakers), blocks 661-1320 represent
the spoken digit "1" (10 utterances of /1/ from the same 66 speakers 33 males and 33 females
), and so on up to digit 9.

In Test_Arabic_Digit.txt, digits 0 to 9 have 220 blocks for each one. The first 110 blocks
represent male speakers and the second 110 blocks represent the female speakers. Therefore,
blocks 1-220 represent digit "0" (10 utterances of /0/ from the 22 speakers ), blocks
221-440 represent digit "1" (10 utterances of /1/ from the same 22 speakers 11 males and 11
females ), and so on.

Import Libraries:
=

In [15]:
import pandas as pd
import os
from io import StringIO
import matplotlib.pyplot as plt
import numpy as np
import scipy
from scipy import stats
from scipy.stats import beta
import math
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from scipy.stats import multivariate_normal
from sklearn import svm
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import seaborn as sns
from scipy.spatial.distance import cdist #elbow plot
from sklearn.decomposition import PCA
from operator import truediv #metrics

Import Data:
=

In [16]:
# read in data that is locally stored
notebook_path = os.path.abspath("final_project.ipynb")
train_text = os.path.join(os.path.dirname(notebook_path), "data/Train_Arabic_Digit.txt")
test_text = os.path.join(os.path.dirname(notebook_path), "data/Test_Arabic_Digit.txt")

In [17]:
# read in txt files as csv data frames
train_df = pd.read_csv(train_text, sep=" ")
train_df.columns = ['mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13']
test_df = pd.read_csv(test_text, sep=" ")
test_df.columns = ['mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13']

#in train, row 38 has empty (NaN) row (separating blocks)

In [18]:
#create a col that separates blocks 
train_df["group_no"] = train_df.isnull().all(axis=1).cumsum()
#remove NaN rows, reset index
train_df_split = train_df.dropna().reset_index(drop=True) 

In [19]:
test_df["group_no"] = test_df.isnull().all(axis=1).cumsum()
test_df_split = test_df.dropna().reset_index(drop=True)

Data Parsing:
=

In [6]:
def get_block_data(data, digit_num, test = False, num_blocks = 1):
    if test:
        size = 220*num_blocks
    else:
        size = 660*num_blocks
    return (data.loc[data['group_no'] == ((size*digit_num) + 1)]).iloc[:, 0:13]

In [7]:
def get_1block_data(data, block_number = 1):
    return (data.loc[data['group_no'] == block_number]).iloc[:, 0:13]

In [8]:
def get_digit_data(data, digit_num, test = False, num_blocks = 1):
    if test:
        size = 220*num_blocks
    else:
        size = 660*num_blocks
        
    subset = data.loc[(data['group_no'] >= size*digit_num) & (data['group_no'] < (size*(digit_num+1)))]
    return subset.iloc[:, 0:13] #removes group_no col

In [9]:
def get_gender_data(data, test = False):
    female_list = []
    male_list = []
    
    if test:
        block_size = 110
    else:
        block_size = 330
    
    for i in range(0, 19):
        male_curr = train_df_split.loc[(train_df_split['group_no'] >= (block_size*i)) & (train_df_split['group_no'] < (block_size*(i+1)))]
        male_list.append(male_curr)
        i += 2
    
    for i in range(1, 20):
        fem_curr = train_df_split.loc[(train_df_split['group_no'] >= (block_size*i)) & (train_df_split['group_no'] < (block_size*(i+1)))]
        female_list.append(fem_curr)
        i += 2
        
    males_all = pd.concat(male_list, ignore_index = True, axis = 0)
    females_all = pd.concat(female_list, ignore_index = True, axis = 0)
    
    return males_all, females_all
        

In [10]:
#concatenate
males_train_all = pd.concat([male_0, male_1, male_2, male_3, male_4, male_5, male_6, male_7, male_8, male_9], ignore_index=True,axis=0)
females_train_all = pd.concat([female_0, female_1, female_2, female_3, female_4, female_5, female_6, female_7, female_8, female_9], ignore_index=True,axis=0)
males_train = males_train_all.iloc[:, 0:13]
females_train = females_train_all.iloc[:, 0:13]

NameError: name 'male_0' is not defined

In [11]:
males_train_all = get_gender_data(train_df_split)[0]

In [None]:
#generalize data plotting MFCC vs. window index

def plot_MFCC_window(data, digit_num, flatten = False):
    block = get_block_data(data, digit_num)
    if flatten:
        plt.plot(block.to_numpy().flatten())
    else:
        plt.plot(block)
    plt.title(f"Digit: {digit_num}")
    plt.xlabel("Window Index")
    plt.ylabel("MFCC Value")
    # plt.legend(block, bbox_to_anchor=(1, 1))

In [None]:
for i in range(10):
    plot_MFCC_window(train_df_split, i)
    plt.show()

In [None]:
plt.figure(figsize=(15, 6))

rows = 2
cols = 5
counter = 1

for i in range(10):
    plt.subplot(rows, cols, counter) 
    plt.subplots_adjust(top = 1.5, bottom = 0.1, left = 0.5, right = 1.9)
    plot_MFCC_window(train_df_split, i)
    counter += 1
    
#plt.legend(train_df_split, bbox_to_anchor=(1, 1))

In [None]:
# pairwise plot
sns.pairplot(train_df_split.drop(columns = 'group_no'), hue='label')

K-Means:
=

In [None]:
def get_kmeans_model(data, num_clusters, digit_num):
    digit_data = get_digit_data(data, digit_num)
    return KMeans(n_clusters = num_clusters).fit(digit_data)

In [None]:
def plot_GMM_kmeans_scatter(data, num_clusters, digit_num, mfcc_x, mfcc_y):
    digit_data = get_digit_data(data, digit_num)
    
    kmeans = get_kmeans_model(data, num_clusters, digit_num)
    pred = kmeans.predict(digit_data)
    
    plt.scatter(digit_data.to_numpy()[:,mfcc_x], digit_data.to_numpy()[:,mfcc_y], c = pred, cmap = "RdBu")
    plt.title(f"Digit {digit_num}")
    plt.xlabel(f"MFCC {mfcc_x + 1}") # try based on 2 MFCCs from visualization that are most different 
    plt.ylabel(f"MFCC {mfcc_y + 1}")

In [None]:
plt.figure(figsize=(15, 6))

rows = 2
cols = 5
counter = 1

for i in range(10):
    plt.subplot(rows, cols, counter) 
    plt.subplots_adjust(top = 1.5, bottom = 0.1, left = 0.3, right = 1.9)
    if (i == 1) or (i == 2) or (i == 4) or (i == 6): 
        num_clusters = 3
    elif (i == 8) or (i == 9):
        num_clusters = 5
    else:
        num_clusters = 4
    plot_GMM_kmeans_scatter(train_df_split, num_clusters, i, 0, 1)
    counter += 1

In [None]:
def calculate_cluster_characteristics_2MFCCs(data, num_clusters, digit_num, mfcc_x, mfcc_y):
    digit_data = get_digit_data(data, digit_num)
    clusters = list(zip(digit_data.to_numpy()[:,mfcc_x], digit_data.to_numpy()[:,mfcc_y]))
    
    kmeans = get_kmeans_model(data, num_clusters, digit_num)
    pred = kmeans.predict(clusters)
    
    res = []
    means = []
    covars = []
    for i in range(num_clusters):
        pts = data[pred == i]
        means.append(np.mean(pts, axis = 0))
        covars.append(np.cov(pts.T))
        res.append(len(pts)/len(pred))
    
    return means, covars, res

In [None]:
def calculate_cluster_characteristics(data, num_clusters, i):
    kmeans = get_kmeans_model(data, num_clusters, digit_num)
    digit_data = get_digit_data(data, digit_num)
    pred = kmeans.predict(digit_data)
    
    res = []
    means = []
    covars = []
    for i in range(num_clusters):
        pts = data[pred == i]
        means.append(np.mean(pts, axis = 0))
        covars.append(np.cov(pts.T))
        res.append(len(pts)/len(pred))
    
    return means, covars, res

In [None]:
def prob(x, means, covs, res):
    ans = 0
    for i in range(4):
        ans += res[i] * multivariate_normal.pdf(x, means[i], covs[i])
    return ans

In [None]:
kmeans_models_new = []
for i in range(10):
    if i == 2:
        num_clusters = 3
    elif (i == 4) or (i == 8) or (i == 9) or (i == 7):
        num_clusters = 5
    else:
        num_clusters = 4
    
    kmeans_current = get_kmeans_model(train_df_split, num_clusters, i)
    kmeans_models_new.append(kmeans_current)

In [None]:
def plot_GMM_kmeans_contour(data, num_clusters, digit_num, mfcc_x, mfcc_y):
    
    x, y = np.mgrid[-10:10:1, -10:10:1]
    pos = np.dstack((x, y))
    
    means, covs, res = calculate_cluster_characteristics(data, num_clusters, digit_num, mfcc_x, mfcc_y)

    z = []
    for p in pos:
        z.append(prob(p, means, covs, res))
    z = np.array(z).reshape(x.shape)

    plt.contourf(x, y, z)
    plt.title(f"Contour plot: pdf of K-Means on Digit {digit_num}")
    plt.xlabel(f"MFCC {mfcc_x + 1}") 
    plt.ylabel(f"MFCC {mfcc_y + 1}")

EM:
=

In [None]:
def get_em_model(data, num_clusters, digit_num, mfcc_x, mfcc_y): # change to mfcc_lists
    digit_data = get_digit_data(data, digit_num)[['mfcc_1', 'mfcc_2']]
    # EXPLORE: GM max_iter (100 was ok for hw 5 but need more here)
    return GaussianMixture(n_components = num_clusters, max_iter = 10000, verbose = 1, verbose_interval = 1000).fit(digit_data)

In [None]:
gm_verb = GaussianMixture(n_components = 4, max_iter = 100, verbose = 1, verbose_interval = 1000)
gm_verb.fit(digit_data)

In [None]:
gmm = GaussianMixture(n_components=4, tol=1e-8,verbose=2,verbose_interval=1)
df = gmm.fit(digit_data)

In [None]:
def get_em_model_new(data, num_clusters, digit_num): # change to mfcc_lists
    digit_data = get_digit_data(data, digit_num)
    return GaussianMixture(n_components = num_clusters, max_iter = 10000000).fit(digit_data)

In [None]:
def predict_label_diag(block_data, diag_models):

    score_list = []
    for i in range(10):
        diag = diag_models[i]
        means = diag.means_ 
        covs = diag.covariances_
        res =  diag.weights_ 
        num_clusters = res.shape[0]
        score = prob_test(block_data, means, covs, res, num_clusters) 
        score_list.append(score)

    label_predict = score_list.index(max(score_list))
    return label_predict

In [None]:
# FIX FOR ALL 13 INSTEAD OF 2

def plot_GMM_EM_scatter(data, num_clusters, digit_num, mfcc_x, mfcc_y):
    digit_data = get_digit_data(data, digit_num)
    clusters = list(zip(digit_data.to_numpy()[:,0], digit_data.to_numpy()[:,1], digit_data.to_numpy()[:,2], digit_data.to_numpy()[:,3], digit_data.to_numpy()[:,4], digit_data.to_numpy()[:,5], digit_data.to_numpy()[:,6], digit_data.to_numpy()[:,7], digit_data.to_numpy()[:,8], digit_data.to_numpy()[:,9], digit_data.to_numpy()[:,10], digit_data.to_numpy()[:,11], digit_data.to_numpy()[:,12]))
    gm = get_em_model_new(data, num_clusters, digit_num)
    pred = gm.predict_proba(clusters)
    
    color_labels = [np.array([255,0,0])/255, np.array([0,255,0])/255, np.array([0,0,255])/255, np.array([0,255,255])/255]
    
    #make rgba matrix
    for num in range(num_clusters):
        #initialize rgba
        rgba = np.zeros((len(clusters), 4))
        rgba[:,0] = color_labels[num][0]
        rgba[:,1] = color_labels[num][1]
        rgba[:,2] = color_labels[num][2]
        rgba[:,3] = pred[:,num]
        plt.scatter(digit_data.to_numpy()[:,mfcc_x], digit_data.to_numpy()[:,mfcc_y], c = rgba, cmap = 'RdBu')
    
    plt.title(f"Scatter plot: EM on Digit {digit_num}")
    plt.xlabel(f"MFCC {mfcc_x + 1}") 
    plt.ylabel(f"MFCC {mfcc_y + 1}")

In [None]:
for i in range(10):
    if (i == 1) or (i == 2) or (i == 4) or (i == 6): 
        num_clusters = 3
    elif (i == 8) or (i == 9):
        num_clusters = 5
    else:
        num_clusters = 4
    plot_GMM_EM_scatter(train_df_split, num_clusters, i, 0, 1)
    plt.show()

In [None]:
def plot_GMM_EM_contour(data, num_clusters, digit_num, mfcc_x, mfcc_y):
    digit_data = get_digit_data(data, digit_num)
    # clusters = list(zip(digit_data.to_numpy()[:,mfcc_x], digit_data.to_numpy()[:,mfcc_y]))
    gm = get_em_model_new(data, num_clusters, digit_num)
    
    x, y = np.mgrid[-10:10:.01, -15:8:.01] #increased resolution
    pos = np.dstack((x, y))
    z = []
    for p in pos:
        z.append(gm.score_samples(p))
    z = np.array(z).reshape(x.shape)

    plt.contourf(x, y, z)
    plt.title(f"Contour plot: pdf of EM on Digit {digit_num}")
    plt.xlabel(f"MFCC {mfcc_x + 1}") 
    plt.ylabel(f"MFCC {mfcc_y + 1}")

In [None]:
for i in range(10):
    
    if (i == 1) or (i == 2) or (i == 4) or (i == 6): 
        num_clusters = 3
    elif (i == 8) or (i == 9):
        num_clusters = 5
    else:
        num_clusters = 4
    
    plot_GMM_EM_contour(train_df_split, num_clusters, i, 0, 1)
    plt.show()

Model Exploration (different covariance matrices):
=

In [None]:
def get_diag_model(data, num_clusters, digit_num): 
    digit_data = get_digit_data(data, digit_num)
    return GaussianMixture(n_components = num_clusters, max_iter = 10000000, covariance_type = "diag").fit(digit_data)

In [None]:
diag_models = []
for i in range(10):
    if (i == 1) or (i == 2) or (i == 4) or (i == 6): 
        num_clusters = 3
    elif (i == 8) or (i == 9):
        num_clusters = 5
    else:
        num_clusters = 4
    diag_current = get_diag_model(train_df_split, num_clusters, i)
    diag_models.append(diag_current)

In [None]:
predictions_diag = []
for i in tqdm(range(2200)):
    block = get_1block_data(test_df_split, block_number = i).iloc[:, 0:13] #just use 1st 2 coeff -- CHANGE all 13
    predictions_diag.append(predict_label_diag(block, diag_models))

In [None]:
plot_confusion_get_metrics(label_true, predictions_diag, "Diagonal Covariance")

In [None]:
#Tied full

for i in range(10):
    means = calculate_cluster_characteristics(train_df_split, 4, i, 1, 2)[0]
    covs = calculate_cluster_characteristics(train_df_split, 4, i, 1, 2)[1]
    res = calculate_cluster_characteristics(train_df_split, 4, i, 1, 2)[2]

    weighted_sum = np.zeros((2,2))

    for j in range(len(res)):
        weighted_sum += res[j]*covs[j]

    cov_full = [weighted_sum, weighted_sum, weighted_sum, weighted_sum]

    x, y = np.mgrid[-10:10:1, -8:8:1]
    pos = np.dstack((x, y))
 
    z = []
    for p in pos:
        z.append(prob(p, means, cov_full, res))
    z = np.array(z).reshape(x.shape)

    plt.contourf(x, y, z)
    plt.title(f"Digit {i}")
    plt.xlabel("MFCC 1")
    plt.ylabel("MFCC 2")
    plt.show()

In [None]:
for i in range(10):
    means = calculate_cluster_characteristics(train_df_split, 4, i, 1, 2)[0]
    covs = calculate_cluster_characteristics(train_df_split, 4, i, 1, 2)[1]
    res = calculate_cluster_characteristics(train_df_split, 4, i, 1, 2)[2]

    x, y = np.mgrid[-10:10:1, -8:8:1]
    pos = np.dstack((x, y))
 
    z = []
    for p in pos:
        z.append(prob(p, means, covs, res))
    z = np.array(z).reshape(x.shape)

    plt.contourf(x, y, z)
    plt.title(f"Digit {i}")
    plt.xlabel("MFCC 1")
    plt.ylabel("MFCC 2")
    plt.show()

In [None]:
# DIAGONAL COVARIANCE

def diag_cov(data, num_clusters, digit_num, mfcc_x, mfcc_y):
    digit_data = get_digit_data(data, digit_num)
    clusters = list(zip(digit_data.to_numpy()[:,mfcc_x], digit_data.to_numpy()[:,mfcc_y]))
    
    kmeans = get_kmeans_model(data, num_clusters, digit_num)
    pred = kmeans.predict(digit_data)

    pts_0 = []
    pts_1 = []
    pts_2 = []
    pts_3 = []
    for i in range(len(pred)):
        if pred[i] == 0:
            pts_0.append(clusters[i])
        if pred[i] == 1:
            pts_1.append(clusters[i])
        if pred[i] == 2:
            pts_2.append(clusters[i])
        if pred[i] == 3:
            pts_3.append(clusters[i])
        
    #calculate means, covariances, and responsibilities of each
    pts_0 = np.array(pts_0)
    mean_0 = np.mean(pts_0, axis = 0)
    covar_0 = np.cov(pts_0.T)
    res_0 = len(pts_0)/len(pred) #prior

    pts_1 = np.array(pts_1)
    mean_1 = np.mean(pts_1, axis = 0)
    covar_1 = np.cov(pts_1.T)
    res_1 = len(pts_1)/len(pred)

    pts_2 = np.array(pts_2)
    mean_2 = np.mean(pts_2, axis = 0)
    covar_2 = np.cov(pts_2.T)
    res_2 = len(pts_2)/len(pred)

    pts_3 = np.array(pts_3)
    mean_3 = np.mean(pts_3, axis = 0)
    covar_3 = np.cov(pts_3.T)
    res_3 = len(pts_3)/len(pred)

    means = [mean_0, mean_1, mean_2, mean_3]
    covs = [covar_0, covar_1, covar_2, covar_3]
    res = [res_0, res_1, res_2, res_3]
    
    covs_diag_0 = covar_0.copy()
    covs_diag_0[0,1] = 0
    covs_diag_0[1,0] = 0

    covs_diag_1 = covar_1.copy()
    covs_diag_1[0,1] = 0
    covs_diag_1[1,0] = 0

    covs_diag_2 = covar_2.copy()
    covs_diag_2[0,1] = 0
    covs_diag_2[1,0] = 0

    covs_diag_3 = covar_3.copy()
    covs_diag_3[0,1] = 0
    covs_diag_3[1,0] = 0

    covs_diags = [covs_diag_0, covs_diag_1, covs_diag_2, covs_diag_3]
    
    x, y = np.mgrid[-10:10:1, -10:10:1]
    pos = np.dstack((x, y))

    z = []
    for p in pos:
        z.append(prob(p, means, covs_diags, res))
    z = np.array(z).reshape(x.shape)

    plt.contourf(x, y, z)
    plt.title(f"Digit {digit_num}")
    plt.xlabel("MFCC 1")
    plt.ylabel("MFCC 2")
    plt.show()

In [None]:
for i in range(10):
    diag_cov(train_df_split, 4, i, 0, 1)

ML Classification (k-means):
=

In [None]:
# add labels to train
train_df_label = train_df_split
train_df_label['label'] = [0]*len(train_df_split)

size = 660
for i in range(len(train_df_label)):
    value = train_df_label.at[i, 'group_no']
    for digit_num in range(10):
        if ((value >= size*digit_num) & (value < (size*(digit_num+1)))) :
            train_df_label.at[i, 'label'] = digit_num

In [None]:
# add labels to test
test_df_label = test_df_split
test_df_label['label'] = [0]*len(test_df_split)

size = 220
for i in range(len(test_df_label)):
    value = test_df_label.at[i, 'group_no']
    for digit_num in range(10):
        if ((value >= size*digit_num) & (value < (size*(digit_num+1)))) :
            test_df_label.at[i, 'label'] = digit_num

In [None]:
test_data = test_df_split.iloc[:, 0:13]

In [None]:
# explore changing # clusters

kmeans_0 = get_kmeans_model(train_df_split, 4, 0)
kmeans_1 = get_kmeans_model(train_df_split, 4, 1)
kmeans_2 = get_kmeans_model(train_df_split, 4, 2)
kmeans_2_2 = get_kmeans_model(train_df_split, 2, 2) # changed 2 to 2 clusters
kmeans_3 = get_kmeans_model(train_df_split, 4, 3)
kmeans_4 = get_kmeans_model(train_df_split, 4, 4)
kmeans_5 = get_kmeans_model(train_df_split, 4, 5)
kmeans_6 = get_kmeans_model(train_df_split, 4, 6)
kmeans_7 = get_kmeans_model(train_df_split, 4, 7)
kmeans_7_2 = get_kmeans_model(train_df_split, 2, 7) # changed 7 to 2 clusters
kmeans_8 = get_kmeans_model(train_df_split, 4, 8)
kmeans_8_3 = get_kmeans_model(train_df_split, 3, 8) # changed 8 to 3 clusters
kmeans_9 = get_kmeans_model(train_df_split, 4, 9)

In [None]:
kmeans_models = [kmeans_0, kmeans_1, kmeans_2, kmeans_3, kmeans_4, kmeans_5, kmeans_6, kmeans_7, kmeans_8, kmeans_9]

test_data = test_df_split.iloc[:, 0:13]
correct_labels = test_df_label['label']
label_true = list((test_df_label.groupby('group_no').mean()['label']))

In [None]:
def calculate_cov_mean_res(data, kmeans):
    num_clusters = kmeans.cluster_centers_.shape[0]
    pred = kmeans.labels_
    
    res = []
    means = []
    covars = []
    for i in range(num_clusters):
        pts = data[pred == i]
        means.append(np.mean(pts, axis = 0))
        covars.append(np.cov(pts.T))
        res.append(len(pts)/len(pred))
    
    return means, covars, res

In [None]:
def prob_test(x, means, covs, res, num_clusters):
    ans = 0
    for i in range(num_clusters):
        ans += res[i] * multivariate_normal.pdf(x, means[i], covs[i])
    
    return np.prod(ans)

In [None]:
def predict_label(block_data, kmeans_models):

    score_list = []
    for i in range(10):
        kmeans = kmeans_models[i]
        train_data = get_digit_data(train_df_split, i)
        means, covs, res = calculate_cov_mean_res(train_data, kmeans) #GMM already has these -- change for EM
        score = prob_test(block_data, means, covs, res, kmeans.cluster_centers_.shape[0])
        score_list.append(score)

    label_predict = score_list.index(max(score_list))
    return label_predict

In [None]:
predictions = []
for i in tqdm(range(2200)):
    block = get_1block_data(test_df_split, block_number = i)
    predictions.append(predict_label(block, kmeans_models))
    

In [None]:
kmeans_models_2 = [kmeans_0, kmeans_1, kmeans_2_2, kmeans_3, kmeans_4, kmeans_5, kmeans_6, kmeans_7_2, kmeans_8_3, kmeans_9]


In [None]:
predictions_custom_clus = [] # changing # of clusters for 2, 7, and 8
for i in tqdm(range(2200)):
    block = get_1block_data(test_df_split, block_number = i)
    predictions_custom_clus.append(predict_label(block, kmeans_models_2))

In [None]:
plot_confusion_get_metrics(label_true, predictions_custom_clus, "K-Means, tuned clusters")

In [None]:
kmeans_new = []
for i in range(10):
    if i == 2:
        num_clusters = 3
    elif (i == 4) or (i == 8) or (i == 9) or (i == 7):
        num_clusters = 5
    else:
        num_clusters = 4
    kmeans_current = get_kmeans_model(train_df_split, num_clusters, i)
    kmeans_new.append(kmeans_current)

In [None]:
predictions_new = []
for i in tqdm(range(2200)):
    block = get_1block_data(test_df_split, block_number = i)
    predictions_new.append(predict_label(block, kmeans_new))
    

## Elbow plot to determine # of clusters

In [None]:
def plot_elbows(data, digit_num):
    distortions = []
    inertias = []
    mapping1 = {}
    mapping2 = {}
    K = range(1, 10)

    # data = get_digit_data(train_df_split, digit_num)
  
    for k in K:
        # Building and fitting the model
        kmeanModel = KMeans(n_clusters=k)
        kmeanModel.fit(data)
  
        distortions.append(sum(np.min(cdist(data, kmeanModel.cluster_centers_,
                                        'euclidean'), axis=1)) / data.shape[0])
        inertias.append(kmeanModel.inertia_)
  
        mapping1[k] = sum(np.min(cdist(data, kmeanModel.cluster_centers_,
                                   'euclidean'), axis=1)) / data.shape[0]
        mapping2[k] = kmeanModel.inertia_
        
    #plt.figure(figsize=(10, 4))
    plt.plot(K, distortions, 'bx-')
    plt.plot()
    plt.xlabel('Number of Clusters')
    plt.ylabel('Distortion')
    plt.title(f'Digit {digit_num}')
    plt.show()

In [None]:
#Title: Elbow Method for Optimal Number of Clusters
for i in range(6,10):
    data = get_digit_data(train_df_split, i)
    plot_elbows(data, i)

In [None]:
plot_elbows(get_digit_data(train_df_split, 5), 5)

## Explore KMeans classification for Gender split data

In [None]:
kmeans_male_models = []

for i in range(10):
    digit_data = males_train_all[males_train_all['label'] == i].iloc[:, 0:13] 
    kmeans_male = KMeans(n_clusters = 4).fit(digit_data)
    kmeans_male_models.append(kmeans_male)

In [None]:
kmeans_female_models = []

for i in range(10):
    kmeans_female = get_kmeans_model(females_train_all, 4, i)
    kmeans_female_models.append(kmeans_female)

In [None]:
test_data_male = males_test.iloc[:, 0:13]
label_true_male = list((males_test.groupby('group_no').mean()['label']))

test_data_female = females_test.iloc[:, 0:13]
label_true_female = list((females_test.groupby('group_no').mean()['label']))

In [None]:
def predict_label_male(block_data, all_train_data, kmeans_models):

    score_list = []
    for i in range(10):
        kmeans = kmeans_male_models[i]
        train_data = all_train_data[all_train_data['label'] == i].iloc[:, 0:13] 
        means, covs, res = calculate_cov_mean_res(train_data, kmeans) 
        score = prob_test(block_data, means, covs, res, kmeans.cluster_centers_.shape[0])
        score_list.append(score)

    label_predict = score_list.index(max(score_list))
    return label_predict

In [None]:
predictions_male = []
for i in tqdm(range(1100)):
    block = get_1block_data(males_test, block_number = i)
    predictions_male.append(predict_label_male(block, males_train_all, kmeans_male_models))
    

In [None]:
accuracy_male = (np.sum(np.array(label_true_male) == np.array(predictions_male)) / len(label_true_male))*100
print(f"The accuracy of the k-means model on the male test set is {accuracy_male}.")

In [None]:
## CAN USE SAME FNCTN, GEN

def predict_label_female(block_data, kmeans_models):

    score_list = []
    for i in range(10):
        kmeans = kmeans_female_models[i]
        train_data = get_digit_data(females_train_all, i)
        means, covs, res = calculate_cov_mean_res(train_data, kmeans) #GMM already has these -- change for EM
        score = prob_test(block_data, means, covs, res, kmeans.cluster_centers_.shape[0])
        score_list.append(score)

    label_predict = score_list.index(max(score_list))
    return label_predict

In [None]:
predictions_female = []
for i in tqdm(range(1100)):
    block = get_1block_data(females_test, block_number = i)
    predictions_female.append(predict_label_female(block, kmeans_female_models))
    

## Explore EM classification for gender split data

In [None]:
# find if #clusters changes

for i in range(10):
    data = females_train_all[females_train_all["label"] == i]
    plot_elbows(data, i)

In [None]:
male_em_models = []

for i in range(10):
    digit_data = males_train_all[males_train_all["label"] == i].iloc[:, 0:13]
    em_current_male = GaussianMixture(n_components = 4, max_iter = 10000000).fit(digit_data)
    male_em_models.append(em_current_male)

In [None]:
def predict_label_em_gen(block_data, em_models):

    score_list = []
    for i in range(10):
        em = em_models[i]
        means = em.means_ 
        covs = em.covariances_
        res =  em.weights_ 
        num_clusters = res.shape[0]
        score = prob_test(block_data, means, covs, res, num_clusters) 
        score_list.append(score)

    label_predict = score_list.index(max(score_list))
    return label_predict

In [None]:
males_blocks = males_test['group_no']
males_blocks_unique = np.unique(males_blocks)

type(males_blocks)

In [None]:
predictions_male_new = []

#use group_no from males and females_test to index
for i in tqdm(males_blocks_unique): #it's every other 
    block = get_1block_data(males_test, block_number = i)
    predictions_male_new.append(predict_label_em_gen(block, male_em_models))

In [None]:
male_label_true = list((males_test.groupby('group_no').mean()['label'])) 

In [None]:
plot_confusion_get_metrics(male_label_true, predictions_male_new, "EM, Males", "split")

In [None]:
female_em_models = []
for i in range(10):
    digit_data = females_train_all[females_train_all["label"] == i].iloc[:, 0:13]
    em_current_female = GaussianMixture(n_components = 4, max_iter = 10000000).fit(digit_data)
    female_em_models.append(em_current_female)

females_blocks = females_test['group_no']
females_blocks_unique = np.unique(females_blocks)

female_label_true = list((females_test.groupby('group_no').mean()['label'])) 

In [None]:
predictions_female_new = []

#use group_no from males and females_test to index
for i in tqdm(females_blocks_unique): #it's every other 
    block = get_1block_data(females_test, block_number = i)
    predictions_female_new.append(predict_label_em_gen(block, female_em_models))

## PCA exploration

In [None]:
pca = PCA(n_components=4)
X = get_digit_data(train_df_split, 0)
pca.fit(X)

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA

pca = PCA()
components = pca.fit_transform(train_df_split.iloc[:, :13])
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(2),
    color=train_df_split["label"]
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
pca = PCA(n_components=13)
pca.fit(train_df_split.iloc[:, :13])
pca

In [None]:
pca = PCA(2)  # project from 64 to 2 dimensions
projected = pca.fit_transform(train_df_split.iloc[:, :13])
print(train_df_split.iloc[:, :13].shape)
print(projected.shape)

In [None]:
plt.figure(figsize=(40, 20))
plt.scatter(projected[:, 0], projected[:, 1],
            c=train_df_split.label, edgecolor='none', alpha=0.5, cmap = 'tab10')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title("PCA on entire dataset")
plt.colorbar();

ML Classification (EM):
=

In [None]:
em_models_old = []
for i in range(10):
    current = get_em_model(train_df_split, 4, i, 1, 2)
    em_models_old.append(current)

In [None]:
predictions_em_old = []
for i in tqdm(range(2200)):
    block = get_1block_data(test_df_split, block_number = i).iloc[:, 0:2] #just use 1st 2 coeff -- CHANGE all 13
    predictions_em_old.append(predict_label_em(block, em_models_new))

In [None]:
em_models_new = []
for i in range(10):
    if (i == 1) or (i == 2) or (i == 4): 
        num_clusters = 3
    elif (i == 8) or (i == 9):
        num_clusters = 5
    else:
        num_clusters = 4
    em_current = get_em_model_new(train_df_split, num_clusters, i)
    em_models_new.append(em_current)

In [None]:
def predict_label_em(block_data, em_models):

    score_list = []
    for i in range(10):
        em = em_models[i]
        means = em.means_ 
        covs = em.covariances_
        res =  em.weights_ 
        num_clusters = res.shape[0]
        score = prob_test(block_data, means, covs, res, num_clusters) 
        score_list.append(score)

    label_predict = score_list.index(max(score_list))
    return label_predict

In [None]:
predictions_em = []
for i in tqdm(range(2200)):
    block = get_1block_data(test_df_split, block_number = i).iloc[:, 0:13] 
    predictions_em.append(predict_label_em(block, em_models_new))

In [None]:
predictions_em_all = []
for i in tqdm(range(2200)):
    block = get_1block_data(test_df_split, block_number = i).iloc[:, 0:13] 
    predictions_em_all.append(predict_label_em_gen(block, em_models))

Test Performance Metrics:
=

In [None]:
tp = np.diag(conf_mat)
prec = np.average(list(map(truediv, tp, np.sum(conf_mat, axis=0))))
rec = np.average(list(map(truediv, tp, np.sum(conf_mat, axis=1))))
f1 = 2*((prec*rec)/(prec+rec))
print ('Precision: {}\nRecall: {}\nF1: {}'.format(prec, rec, f1))

In [None]:
def addlabels(x,y):
    for i in range(len(x)):
        plt.text(i,y[i],y[i])

In [None]:
def plot_confusion_get_metrics(true_label, preds, title, data_type = "full"):
    conf_mat = confusion_matrix(true_label, preds)
    plt.figure(figsize=(11, 8))
    sns.heatmap(conf_mat, cmap='RdBu', annot=True, fmt='g')
    plt.title(f"Predicted vs. True labels ({title})")
    plt.xlabel("True Digits")
    plt.ylabel("Predicted Digits")
    plt.show()
    
    accuracy = (np.sum(np.array(true_label) == np.array(preds)) / len(true_label))*100
    
    tp = np.diag(conf_mat)
    prec = np.average(list(map(truediv, tp, np.sum(conf_mat, axis=0))))
    rec = np.average(list(map(truediv, tp, np.sum(conf_mat, axis=1))))
    f1 = 2*((prec*rec)/(prec+rec))
    print ('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1: {}'.format(accuracy, prec, rec, f1))
    
    if data_type == "split":
        tot = 110
    else:
        tot = 220
    
    digit_accuracies = []
    for i in range(10):
        print(conf_mat[i][i])
        digit_accuracies.append((conf_mat[i][i] / tot) * 100 )
    
    print(digit_accuracies)
    
    plt.bar(range(10), digit_accuracies)
    
    for i in range(10):
        plt.text(i, round(digit_accuracies[i], 1), round(digit_accuracies[i],1), ha = 'center')
    
    plt.title("Accuracy per Digit")
    plt.xticks(np.arange(0, 10, 1))
    plt.xlabel("Digit")
    plt.ylabel("Accuracy (%)")
    plt.show()

In [None]:
plot_confusion_get_metrics(label_true, predictions_em, "EM, tuned clusters")

In [None]:
plot_confusion_get_metrics(female_label_true, predictions_female_new, "EM, female", "split")

In [None]:
plot_confusion_get_metrics(male_label_true, predictions_male_new, "EM, male")

## SLIDEOC


$$ \huge J = \Sigma_{i=1}^{k} \Sigma_{j=1}^{n} \| x_i - c_j \|^2 $$    

$$\huge \hat{\theta} = argmaxlog(\Sigma_{z} p(x, z | \theta) )$$    

$$ \huge max(log (\Sigma_{k=1}^{K} r_k p (x_i | \mu_k, \Sigma_k )) $$

$$\huge L(X|f) = \Pi_{i=1}^{F} (\Sigma_{k=1}^{K} r_k p (x_i | \mu_k, \Sigma_k ))$$

$$\huge L(X|f) = \frac{1}{F} \Sigma_{i=1}^{F} log (\Sigma_{k=1}^{K} r_k p (x_i | \mu_k, \Sigma_k ))$$

$$ \huge P(x|\lambda) = \Sigma_{k=1}^{M} \pi_k \mathcal{N}(x|\mu_k, \Sigma_{k}) $$

$$\frac{TP}{TP + FN} $$

$$\frac{2 * precision * recall}{precision + recall} $$

$$ \begin{bmatrix}
\sigma_i^2 & 0\\
0 & \sigma_j^2
\end{bmatrix} $$