In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

#dataset: http://archive.ics.uci.edu/ml/datasets/Forest+Fires
#0-5: X, Y, month, day, FFMC, DMC
#6-11: DC, ISI, temp, RH, wind, rain
#12: area

In [2]:
#Load and edit datasets
#Load Forest Fire
file_dir_ff = "/Users/dantongzhu/Documents/Spring 2019/Machine Learning/project 3/forestfires.csv"
df_ff = pd.read_csv(file_dir_ff, header=None)
#Load WIne Quality
file_dir_wq = "/Users/dantongzhu/Documents/Spring 2019/Machine Learning/project 3/winequality-red.csv"
df_wq = pd.read_csv(file_dir_wq, sep = ';', header = None)

#Edit Forest Fire
df_ff.columns = df_ff.iloc[0]
df_ff = df_ff.drop([0])
map_month = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6, 'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12}
df_ff['month'] = df_ff['month'].map(map_month)
map_day = {'mon':1, 'tue':2, 'wed':3, 'thu':4, 'fri':5, 'sat':6, 'sun':7}
df_ff['day'] = df_ff['day'].map(map_day)
df_ff['area'] = pd.to_numeric(df_ff['area'])
df_ff['label'] = (df_ff['area'] > 5).astype(int)  #1 if area > 5, 0 otherwise
X_ff = df_ff.loc[:,'X':'rain']
labels_true_ff = df_ff['label'].values
#Edit Wine Quality
df_wq.columns = df_wq.iloc[0]
df_wq = df_wq.drop([0])
df_wq['label'] = df_wq['quality']
X_wq = df_wq.loc[:,'fixed acidity':'alcohol']
labels_true_wq = df_wq['label'].values

#Two datasets together
dataset_names = ["Forest Fire", "Wine Quality"]
df_set = [df_ff, df_wq]
X_set = [X_ff, X_wq]
labels_true_set = [labels_true_ff, labels_true_wq]

In [4]:
#Task 1
#Clustering: K means and Expectation Maximization
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn import metrics

k = 5 #number of trials
n_range = range(1,41)

ARI_kmeans_array_list = []
MI_kmeans_array_list = []
ARI_em_array_list = []
MI_em_array_list = []

for j in range(0, 2):
    df = df_set[j]
    X = X_set[j]
    labels_true = labels_true_set[j]

    ARI_kmeans_array = np.array([])
    MI_kmeans_array = np.array([])
    ARI_em_array = np.array([])
    MI_em_array = np.array([])

    for n in n_range:
        tot_ARI_kmeans = 0
        tot_MI_kmeans = 0
        tot_ARI_em = 0
        tot_MI_em = 0
        
        for i in range(0, k):
            #train
            kmeans = KMeans(n_clusters=n, random_state=0).fit(X)
            em =  GaussianMixture(n_components= n, max_iter = 500, random_state=0).fit(X)
            #predict
            labels_pred_kmeans = kmeans.predict(X)
            labels_pred_em = em.predict(X)
            #evaluation
            ARI_kmeans = metrics.adjusted_rand_score(labels_true, labels_pred_kmeans)
            MI_kmeans = metrics.adjusted_mutual_info_score(labels_true, labels_pred_kmeans, average_method='arithmetic')
            ARI_em = metrics.adjusted_rand_score(labels_true, labels_pred_em)
            MI_em = metrics.adjusted_mutual_info_score(labels_true, labels_pred_em, average_method='arithmetic')

            tot_ARI_kmeans = tot_ARI_kmeans + ARI_kmeans
            tot_MI_kmeans = tot_MI_kmeans + MI_kmeans
            tot_ARI_em = tot_ARI_em + ARI_em
            tot_MI_em = tot_MI_em + MI_em

        ARI_kmeans_array = np.append(ARI_kmeans_array, tot_ARI_kmeans/k)
        MI_kmeans_array = np.append(MI_kmeans_array, tot_MI_kmeans/k)
        ARI_em_array = np.append(ARI_em_array, tot_ARI_em/k)
        MI_em_array = np.append(MI_em_array, tot_MI_em/k)
        print ("dataset " + str(j+1) + ": n = " + str(n) + ", ",
               str(tot_ARI_kmeans/k), str(tot_MI_kmeans/k), str(tot_ARI_em/k), str(tot_MI_em/k))
        
    ARI_kmeans_array_list.append(ARI_kmeans_array)
    MI_kmeans_array_list.append(MI_kmeans_array)
    ARI_em_array_list.append(ARI_em_array)
    MI_em_array_list.append(MI_em_array)

dataset 1: n = 1,  0.0 0.0 0.0 0.0
dataset 1: n = 2,  0.0011941418684816418 -0.001659052871171824 -0.0053793634539512185 -0.002166085291688866
dataset 1: n = 3,  -0.0023073416831620487 -0.00037618706696917293 -0.010229800298327766 -0.0022476578027251647
dataset 1: n = 4,  0.0001389404340863122 -0.0018727446625084597 0.007137633873153912 -0.001848144544056776
dataset 1: n = 5,  0.0011956582928114945 0.0002919385152548323 0.0004791698950912424 -0.0002880562549549173
dataset 1: n = 6,  0.0012129080030807014 0.00013227881325848507 0.003021108216271848 -0.002258191094315635
dataset 1: n = 7,  0.0026409855571749067 -0.0013014659650810113 0.005390775243614017 0.0009585991124069028
dataset 1: n = 8,  0.00011469340484574985 -0.0020566604505987364 0.006423783057297361 -0.00026323298198213234
dataset 1: n = 9,  0.0009236429427588837 -0.001940647508285803 -0.0001849449553925356 0.001090828399093662
dataset 1: n = 10,  -4.501816322117948e-06 -0.0024721524066315275 0.0005974551950105497 -0.002674003

dataset 2: n = 40,  -0.0007038575819627548 0.029350155058108666 0.010735400790019337 0.04486893655200981


In [5]:
#Task 1 continue
#Clustering Plot
%matplotlib qt
#%matplotlib inline

for j in range(0, 2):
    ARI_kmeans_array = ARI_kmeans_array_list[j]
    MI_kmeans_array = MI_kmeans_array_list[j]
    ARI_em_array = ARI_em_array_list[j]
    MI_em_array = MI_em_array_list[j]
    dataset_name = dataset_names[j]

    plt.figure()
    plt.title(dataset_name + " Clustering")
    plt.xlabel("Number of Clusters")
    plt.ylabel("Evaluation of Performance")
    
    plt.plot(n_range, ARI_kmeans_array, marker='', linewidth=2, color = "red", label="K means ARI")
    plt.plot(n_range, MI_kmeans_array, marker='', linestyle='dashed', linewidth=2, color = "red", label="K means MI")
    plt.plot(n_range, ARI_em_array, marker='', linewidth=2, color = "blue", label="EM ARI")
    plt.plot(n_range, MI_em_array, marker='', linestyle='dashed', linewidth=2, color = "blue", label="EM MI")

    plt.legend()
plt.show()

#based on the result, we will choose to do 5 clusters for both datasets for the rest of the experiments 


In [8]:
#Task 1 continue
#Plot Histogram of Clustering Results
%matplotlib qt

n = 5 #number of clusters for both datasets
for j in range(0, 2):
    df = df_set[j]
    X = X_set[j]
    labels_true = labels_true_set[j]
    dataset_name = dataset_names[j]

    #train
    kmeans = KMeans(n_clusters=n, random_state=0).fit(X)
    em =  GaussianMixture(n_components= n, max_iter = 500, random_state=0).fit(X)
    #predict
    labels_pred_kmeans = kmeans.predict(X)
    labels_pred_em = em.predict(X)
    
    #count occurance
    #a = numpy.array([0, 3, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 3, 4])
    unique_KM, counts_KM = np.unique(labels_pred_kmeans, return_counts=True)
    unique_EM, counts_EM = np.unique(labels_pred_em, return_counts=True)
    print ("K means prediction distribution: " + str(dict(zip(unique_KM, counts_KM))))
    print ("EM prediction distribution: " + str(dict(zip(unique_EM, counts_EM))))
    
    #evaluate
    ARI_kmeans = metrics.adjusted_rand_score(labels_true, labels_pred_kmeans)
    MI_kmeans = metrics.adjusted_mutual_info_score(labels_true, labels_pred_kmeans, average_method='arithmetic')
    ARI_em = metrics.adjusted_rand_score(labels_true, labels_pred_em)
    MI_em = metrics.adjusted_mutual_info_score(labels_true, labels_pred_em, average_method='arithmetic')
    #print evaluation
    print (dataset_name + " n = 5 clustering")
    print ("K means ARI = " + str(ARI_kmeans) + ", MI = " + str(MI_kmeans))
    print ("EM ARI = " + str(ARI_em) + ", MI = " + str(MI_em))
    
    #plot k means
    plt.figure()
    plt.title(dataset_name + " K means Histogram, n = " + str(n))
    plt.hist(labels_pred_kmeans, rwidth=0.8, bins=n)
    plt.xlabel("Components")

    #plot EM
    plt.figure()
    plt.title(dataset_name + " Expectation Maximization Histogram, n = " + str(n))
    plt.hist(labels_pred_em, rwidth=0.8, bins=n)
    plt.xlabel("Components")
plt.show()


K means prediction distribution: {0: 134, 1: 89, 2: 190, 3: 60, 4: 44}
EM prediction distribution: {0: 12, 1: 86, 2: 163, 3: 213, 4: 43}
Forest Fire n = 5 clustering
K means ARI = 0.0011956582928114945, MI = 0.0002919385152548323
EM ARI = 0.0004791698950912424, MI = -0.0002880562549549173
K means prediction distribution: {0: 69, 1: 450, 2: 298, 3: 184, 4: 598}
EM prediction distribution: {0: 39, 1: 812, 2: 160, 3: 241, 4: 347}
Wine Quality n = 5 clustering
K means ARI = -0.004797817360311893, MI = 0.03280432579878591
EM ARI = 0.025959324504488814, MI = 0.026324496126219288


In [3]:
#Task 2
#PCA and eigenvalues, plot histogram
from sklearn.decomposition import PCA
%matplotlib qt

for j in range(0, 2):
    df = df_set[j]
    X = X_set[j]
    labels_true = labels_true_set[j]
    dataset_name = dataset_names[j]
    
    #get PCA
    pca = PCA()
    pca_compos = pca.fit_transform(X)
    pca_X = pd.DataFrame(data = pca_compos)
    #compute eigenvalues
    eigenvalues = pca.explained_variance_
    print (dataset_name + " PCA Eigenvalues")
    print (eigenvalues)
    #plot (log of) eigenvalues
    eigenvalues_log = np.log10(eigenvalues) 
    plt.figure()
    plt.title("PCA Eigenvalues: " + dataset_name)
    plt.xlabel("Components")
    plt.ylabel("Log of Eigenvalues")
    plt.bar(range(1, len(eigenvalues)+1), eigenvalues_log)

#Result:
#FF may need more target components for running PCA than WQ, because lots of WQ eigenvalues are low

Forest Fire PCA Eigenvalues
[6.35315829e+04 2.13360020e+03 2.74912192e+02 3.01035993e+01
 1.47925583e+01 9.92633332e+00 5.79544765e+00 4.12721416e+00
 2.79215516e+00 1.08229112e+00 8.67634461e-01 8.29958565e-02]
Wine Quality PCA Eigenvalues
[1.13380708e+03 5.79354108e+01 3.10130228e+00 1.81941532e+00
 1.04634036e+00 4.13967294e-02 2.31926578e-02 1.13464685e-02
 1.00779841e-02 1.45499755e-03 5.61482667e-07]


In [4]:
#Task 2: ICA and kurtosis, plot histogram
from sklearn.decomposition import FastICA
from scipy.stats import kurtosis
%matplotlib qt

k = 5  #number of trials
for j in range(0, 2):
    df = df_set[j]
    X = X_set[j]
    labels_true = labels_true_set[j]
    dataset_name = dataset_names[j]
    #ICA
    tot_kurt_mag = 0
    tot_count = 0
    for i in range(0, k):
        ica = FastICA(max_iter = 500)
        ica_compos = ica.fit_transform(X)
        ica_X = pd.DataFrame(data = ica_compos)
        #Compute Kurtosis
        kurt_arr = []
        count = 0
        for col in ica_X:
            kurt = kurtosis(ica_X[col])
            kurt_arr.append(kurt)
            if (kurt > 3):
                count = count+1
        kurt_vector = np.array(kurt_arr)
        kurt_mag = np.sum(kurt_vector*2)
        tot_kurt_mag = tot_kurt_mag + kurt_mag
        tot_count = tot_count + count
    kurt_mag = tot_kurt_mag/k
    count = tot_count/k
    print (dataset_name + ":")
    print ("kurtosis = " + str(kurt_mag))
    print (kurt_arr)
    print ("count = " + str(count) + " out of " + str(len(df.columns)-2))
    
    plt.figure()
    plt.title("ICA Kurtosis: " + dataset_name)
    plt.xlabel("Components")
    plt.ylabel("Kurtosis")
    plt.axhline(y=3, color = "red")
    plt.bar(range(1, len(ica_X.columns)+1), kurt_arr)
    




Forest Fire:
kurtosis = 1189.9755756296656
[3.458816837467751, -0.2576685563859691, 2.808314786706574, 1.1190231492582132, 1.058504361767504, 42.08059881043827, 0.6468491009038964, -1.2031347289122774, 24.420225547202126, 420.67888774953224, 103.5708774640647, -1.1812068710951127]
count = 5.0 out of 12
Wine Quality:
kurtosis = 241.38689681548203
[4.509727895082629, 19.37073115243356, 6.830016151734096, 33.96013299181132, 2.3832310936921512, 45.144666526612745, 2.5098494255747363, 3.4007241682839098, -0.975834769436859, 3.092727537658525, -0.30589303998762585]
count = 7.0 out of 11


In [17]:
#Task 2: Random Projection
from sklearn import random_projection
%matplotlib qt

num_trials = [0,1, 10, 20, 30, 40, 50, 75, 100, 125, 150, 175, 200]
for j in range(0, 2):
    df = df_set[j]
    X = X_set[j]
    labels_true = labels_true_set[j]
    dataset_name = dataset_names[j]
    
    curr_X = X
    var_mag_arr = []
    count = 0
    while count <= num_trials[len(num_trials)-1]:
        #calculate variance
        if count in num_trials:
            if (count == 0):
                rp_X = curr_X
            var_arr = np.array([])
            for col in rp_X:
                col_arr = np.array(rp_X[col].astype(float).values)
                var = np.var(col_arr)
                var_arr = np.append(var_arr, var)
            var_mag = np.sum(var_arr*2)
            var_mag_arr.append(var_mag)
            print (dataset_name, count, var_mag)
        #run RP
        rp = random_projection.GaussianRandomProjection(n_components = len(X.columns))
        rp_compos = rp.fit_transform(curr_X)     
        rp_X = pd.DataFrame(data = rp_compos)
        curr_X = rp_X
        count = count + 1
    #plot
    plt.figure()
    plt.title(dataset_name + " Random Projection Variance Over Time")
    plt.xlabel("Number of Trials")
    plt.ylabel("Variance")
    plt.plot(num_trials, var_mag_arr, marker='', linewidth=2)
plt.show()

Forest Fire 0 131763.97448005716
Forest Fire 1 176893.92250487593
Forest Fire 10 21923.546198413398
Forest Fire 20 5513.346257572894
Forest Fire 30 1462.8177017311052
Forest Fire 40 23.221361862678602
Forest Fire 50 52.89315894265413
Forest Fire 75 2.2543495574599133
Forest Fire 100 0.5293795769844023
Forest Fire 125 0.09390801964406721
Forest Fire 150 9.268080737733995e-05
Forest Fire 175 1.4169013759224883e-06
Forest Fire 200 1.302637319538649e-07
Wine Quality 0 2394.0958446583386
Wine Quality 1 2284.731900709841
Wine Quality 10 4502.03833465898
Wine Quality 20 2373.474862358412
Wine Quality 30 1905.2351857025292
Wine Quality 40 1544.2778823128933
Wine Quality 50 134.54802186547704
Wine Quality 75 1732.9953728641742
Wine Quality 100 653.2018375569518
Wine Quality 125 6.357297377119459
Wine Quality 150 10.261169396945634
Wine Quality 175 17.91402587224002
Wine Quality 200 38.72166865549826


In [27]:
#Task 3
#Clustering experiments on the data after dimensionality reduction
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn import random_projection
from sklearn import metrics
from scipy.stats import kurtosis
%matplotlib qt

PCA_dim_set = [6, 4]  #Target dimensions for PCA results, according to the eigenvalues results in Task 2
RP_runs = [10, 10]

pca_X_set = []
ica_X_set = []
rp_X_set = []

for j in range(0, 2):
    df = df_set[j]
    X = X_set[j]
    labels_true = labels_true_set[j]
    dataset_name = dataset_names[j]
    
    #Generate the new features
    #PCA
    pca = PCA(n_components = PCA_dim_set[j])
    pca_compos = pca.fit_transform(X)
    pca_X = pd.DataFrame(data = pca_compos)
    pca_X_set.append(pca_X)
    
    #ICA   
    ica = FastICA(max_iter = 500)
    ica_compos = ica.fit_transform(X)
    ica_X = pd.DataFrame(data = ica_compos)
    #Delte new features that have abs(kurtosis) <= 3
    to_delete = []
    for col in ica_X.columns:
        if kurtosis(ica_X[col]) <= 3:
            to_delete.append(col)
    ica_X = ica_X.drop(to_delete, axis=1)
    ica_X_set.append(ica_X)
    
    #RP
    count = 0
    curr_X = X
    while count < RP_runs[j]:
        rp = random_projection.GaussianRandomProjection(n_components = len(X.columns))
        rp_compos = rp.fit_transform(curr_X)     
        rp_X = pd.DataFrame(data = rp_compos)
        curr_X = rp_X
        count = count + 1
    rp_X_set.append(rp_X)

In [28]:
#Task 3 continue
#Compare clustering results on the new features and plot
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn import metrics
%matplotlib qt

dim_red_names = ["PCA", "ICA", "RP"]
n_range = range(1,21)

for j in range(0, 2):
    df = df_set[j]
    X = X_set[j]
    labels_true = labels_true_set[j]
    dataset_name = dataset_names[j]
    print ("Dataset " + str(j+1) + ": " + dataset_name)
    
    new_X_set = [pca_X_set[j], ica_X_set[j], rp_X_set[j]]
    ARI_kmeans_new_raw_PCA, ARI_kmeans_new_raw_ICA, ARI_kmeans_new_raw_RP = ([], [], [])
    ARI_kmeans_new_true_PCA, ARI_kmeans_new_true_ICA, ARI_kmeans_new_true_RP = ([], [], [])
    ARI_em_new_raw_PCA, ARI_em_new_raw_ICA, ARI_em_new_raw_RP = ([], [], [])
    ARI_em_new_true_PCA, ARI_em_new_true_ICA, ARI_em_new_true_RP = ([], [], [])
    ARI_kmeans_raw_true_arr, ARI_em_raw_true_arr = ([], [])
    
    for n in n_range:  #n = number of clusters
        #clustering on raw data
        kmeans = KMeans(n_clusters=n, random_state=0).fit(X)
        em =  GaussianMixture(n_components= n, max_iter = 500, random_state=0).fit(X)
        labels_pred_kmeans_raw = kmeans.predict(X)
        labels_pred_em_raw = em.predict(X)
        #ARI: raw vs true
        ARI_kmeans_raw_true = metrics.adjusted_rand_score(labels_pred_kmeans_raw, labels_true)
        ARI_em_raw_true = metrics.adjusted_rand_score(labels_pred_em_raw, labels_true)
        ARI_kmeans_raw_true_arr.append(ARI_kmeans_raw_true)
        ARI_em_raw_true_arr.append(ARI_em_raw_true)
        
        #clustering on new data
        for m in range(0, len(dim_red_names)):
            #define the new features and the name
            new_X = new_X_set[m]
            dim_red_name = dim_red_names[m]
            
            #Run K means and EM
            kmeans = KMeans(n_clusters=n, random_state=0).fit(new_X)
            em =  GaussianMixture(n_components= n, max_iter = 500, random_state=0).fit(new_X)
            labels_pred_kmeans_new = kmeans.predict(new_X)
            labels_pred_em_new = em.predict(new_X)
            
            #Compute ARI values
            #new vs old
            ARI_kmeans_new_raw = metrics.adjusted_rand_score(labels_pred_kmeans_new, labels_pred_kmeans_raw)
            ARI_em_new_raw = metrics.adjusted_rand_score(labels_pred_em_new, labels_pred_em_raw)
            #new vs true
            ARI_kmeans_new_true = metrics.adjusted_rand_score(labels_pred_kmeans_new, labels_true)
            ARI_em_new_true = metrics.adjusted_rand_score(labels_pred_em_new, labels_true)
            if dim_red_name == "PCA":
                ARI_kmeans_new_raw_PCA.append(ARI_kmeans_new_raw)
                ARI_em_new_raw_PCA.append(ARI_em_new_raw)
                ARI_kmeans_new_true_PCA.append(ARI_kmeans_new_true)
                ARI_em_new_true_PCA.append(ARI_em_new_true)
            if dim_red_name == "ICA":
                ARI_kmeans_new_raw_ICA.append(ARI_kmeans_new_raw)
                ARI_em_new_raw_ICA.append(ARI_em_new_raw)
                ARI_kmeans_new_true_ICA.append(ARI_kmeans_new_true)
                ARI_em_new_true_ICA.append(ARI_em_new_true)
            if dim_red_name == "RP":
                ARI_kmeans_new_raw_RP.append(ARI_kmeans_new_raw)
                ARI_em_new_raw_RP.append(ARI_em_new_raw)
                ARI_kmeans_new_true_RP.append(ARI_kmeans_new_true)
                ARI_em_new_true_RP.append(ARI_em_new_true)
        if np.remainder(n, 5) == 0:
            print ("n = " + str(n) + " done")
        
    #plot
    #Compare to the true labeling
    #K means compare to True
    plt.figure()
    plt.title(dataset_name + " Clustering Similarity: Compare to True Labels")
    plt.xlabel("Number of Clusters")
    plt.ylabel("ARI score")
    #k means
    plt.plot(n_range, ARI_kmeans_new_true_PCA, marker='', color = 'b', linewidth=2, label="k means PCA")
    plt.plot(n_range, ARI_kmeans_new_true_ICA, marker='', color = 'm', linewidth=2, label="k means ICA")
    plt.plot(n_range, ARI_kmeans_new_true_RP, marker='', color = 'y', linewidth=2, label="k means RP")
    plt.plot(n_range, ARI_kmeans_raw_true_arr, marker='', color = 'r', linewidth=2, label="k means RAW")
    #EM
    plt.plot(n_range, ARI_em_new_true_PCA, marker='', color = 'b', linestyle = 'dashed', linewidth=2, label="EM PCA")
    plt.plot(n_range, ARI_em_new_true_ICA, marker='', color = 'm', linestyle = 'dashed', linewidth=2, label="EM ICA")
    plt.plot(n_range, ARI_em_new_true_RP, marker='', color = 'y', linestyle = 'dashed', linewidth=2, label="EM RP")
    plt.plot(n_range, ARI_em_raw_true_arr, marker='', color = 'r', linestyle = 'dashed', linewidth=2, label="EM RAW")
    plt.legend()
    
    #K means NEW vs RAW
    plt.figure()
    plt.title(dataset_name + " Clustering Similarity: New Features VS. Raw Features")
    plt.xlabel("Number of Clusters")
    plt.ylabel("ARI score")
    #K means
    plt.plot(n_range, ARI_kmeans_new_raw_PCA, marker='', color = 'b', linewidth=2, label="k means PCA")
    plt.plot(n_range, ARI_kmeans_new_raw_ICA, marker='', color = 'm', linewidth=2, label="k means ICA")
    plt.plot(n_range, ARI_kmeans_new_raw_RP, marker='', color = 'y', linewidth=2, label="k means RP")
    #EM
    plt.plot(n_range, ARI_em_new_raw_PCA, marker='', color = 'b', linestyle = 'dashed', linewidth=2, label="EM PCA")
    plt.plot(n_range, ARI_em_new_raw_ICA, marker='', color = 'm', linestyle = 'dashed', linewidth=2, label="EM ICA")
    plt.plot(n_range, ARI_em_new_raw_RP, marker='', color = 'y', linestyle = 'dashed', linewidth=2, label="EM RP")
    plt.legend()
plt.show()


Dataset 1: Forest Fire
n = 5 done
n = 10 done
n = 15 done
n = 20 done
Dataset 2: Wine Quality
n = 5 done
n = 10 done
n = 15 done
n = 20 done


In [29]:
#Task 4
#Rerun neural network on the newly projected data
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn import random_projection
from scipy.stats import kurtosis

k = 5 #number of trials

#Will focus on the Forest Fire dataset only
df = df_set[0]
X = X_set[0]
labels_true = labels_true_set[0]
dataset_name = dataset_names[0]

#Trials to do for each feature transformation
n_compo_range = range(1, 13)  #target number of components
ICA_kurt_th_range = [-5, -1, 0, 1, 2.5, 3, 100]    #target threshold for kurtosis for each component for ICA
RP_runs_range = [1, 3, 5, 7, 9, 13, 16, 18, 20]     #number of runs for RP

#PCA
pca_X_set = []
for n in n_compo_range:
    pca = PCA(n_components = n)
    pca_compos = pca.fit_transform(X)
    pca_X = pd.DataFrame(data = pca_compos)
    pca_X_set.append(pca_X)
print ("PCA done")

#ICA
ica_X_set = []
n_compo_ICA = []
ica = FastICA(max_iter = 500)
ica_compos = ica.fit_transform(X)
ica_X = pd.DataFrame(data = ica_compos)
for th in ICA_kurt_th_range:
    #Delte new features that have kurtosis <= th
    to_delete = []
    for col in ica_X.columns:
        if kurtosis(ica_X[col]) <= th:
            to_delete.append(col)
    new_X = ica_X.drop(to_delete, axis=1)
    ica_X_set.append(new_X)
    n_compo_ICA.append(len(new_X.columns))
print ("ICA done")

#ICA 2
ica_X_set_2 = []
for n in n_compo_range:
    ica = FastICA(n_components = n, max_iter = 500)
    ica_compos = ica.fit_transform(X)
    ica_X = pd.DataFrame(data = ica_compos)
    ica_X_set_2.append(ica_X)
print ("ICA 2 done")

#RP
rp_X_set = []
count = 0
curr_X = X
while count < RP_runs_range[len(RP_runs_range) - 1]:
    rp = random_projection.GaussianRandomProjection(n_components = len(X.columns))
    rp_compos = rp.fit_transform(curr_X)
    rp_X = pd.DataFrame(data = rp_compos)
    count = count + 1
    if count in RP_runs_range:
        rp_X_set.append(rp_X)
    curr_X = rp_X
print ("RP done")

#RP 2
rp_X_set_2 = []
for n in n_compo_range:
    rp = random_projection.GaussianRandomProjection(n_components = n)
    rp_compos = rp.fit_transform(curr_X)     
    rp_X = pd.DataFrame(data = rp_compos)
    rp_X_set_2.append(rp_X)
print ("RP 2 done")

PCA done




ICA done
ICA 2 done
RP done
RP 2 done


In [30]:
#Task 4 continue
#Run NN
from sklearn.neural_network import MLPClassifier

k = 5  #number of trials

#NN On raw data
tot_accuracy = 0
tot_time = 0
for i in range (0, k):
    start = time.time()
    clf = MLPClassifier(hidden_layer_sizes=(50,)*(10))
    clf.fit(X, labels_true)
    predictions = clf.predict(X)
    delta = abs(predictions - labels_true)
    accuracy = 1 - np.mean(delta)
    original_time = time.time() - start
    tot_accuracy = tot_accuracy + accuracy
    tot_time = tot_time + original_time
original_accuracy = tot_accuracy/k
original_time = tot_time/k
print ("Original accuracy = " + str(original_accuracy) + ", time = " + str(original_time))
print ("")

# NN on PCA results
print ("PCA")
PCA_accu_arr = []
PCA_time_arr = []
for j in range(0, len(pca_X_set)):
    new_X = pca_X_set[j]
    n = n_compo_range[j]
    tot_accuracy = 0
    tot_time = 0
    for i in range (0, k):
        start = time.time()
        clf = MLPClassifier(hidden_layer_sizes=(50,)*(10))
        clf.fit(new_X, labels_true)
        predictions = clf.predict(new_X)
        delta = abs(predictions - labels_true)
        accuracy = 1 - np.mean(delta)
        original_time = time.time() - start
        tot_accuracy = tot_accuracy + accuracy
        tot_time = tot_time + original_time
    avg_accuracy = tot_accuracy/k
    avg_time = tot_time/k
    PCA_accu_arr.append(avg_accuracy)
    PCA_time_arr.append(avg_time)
    print ("n = " + str(n) + " accuracy = " + str(avg_accuracy) + ", time = " + str(avg_time))
print ("")

# NN on ICA results
print ("ICA select kurtosis")
ICA_accu_arr = []
ICA_time_arr = []
for j in range(0, len(ica_X_set)):
    new_X = ica_X_set[j]
    th = ICA_kurt_th_range[j]
    n = len(new_X.columns)
    tot_accuracy = 0
    tot_time = 0
    for i in range (0, k):
        start = time.time()
        clf = MLPClassifier(hidden_layer_sizes=(50,)*(10))
        clf.fit(new_X, labels_true)
        predictions = clf.predict(new_X)
        delta = abs(predictions - labels_true)
        accuracy = 1 - np.mean(delta)
        original_time = time.time() - start
        tot_accuracy = tot_accuracy + accuracy
        tot_time = tot_time + original_time
    avg_accuracy = tot_accuracy/k
    avg_time = tot_time/k
    ICA_accu_arr.append(avg_accuracy)
    ICA_time_arr.append(avg_time)
    print ("th = " + str(th) + ", n = " + str(n) + 
           " accuracy = " + str(avg_accuracy) + ", time = " + str(avg_time))
print ("")

print ("ICA No selection of kurtosis")
ICA_2_accu_arr = []
ICA_2_time_arr = []
for j in range(0, len(ica_X_set_2)):
    new_X = ica_X_set_2[j]
    n = n_compo_range[j]
    tot_accuracy = 0
    tot_time = 0
    for i in range (0, k):
        start = time.time()
        clf = MLPClassifier(hidden_layer_sizes=(50,)*(10))
        clf.fit(new_X, labels_true)
        predictions = clf.predict(new_X)
        delta = abs(predictions - labels_true)
        accuracy = 1 - np.mean(delta)
        original_time = time.time() - start
        tot_accuracy = tot_accuracy + accuracy
        tot_time = tot_time + original_time
    avg_accuracy = tot_accuracy/k
    avg_time = tot_time/k
    ICA_2_accu_arr.append(avg_accuracy)
    ICA_2_time_arr.append(avg_time)
    print ("n = " + str(n) + " accuracy = " + str(avg_accuracy) + ", time = " + str(avg_time))
print ("")


# NN on RP results
RP_accu_arr = []
RP_time_arr = []
print ("RP multiple runs")
for j in range(0, len(rp_X_set)):
    new_X = rp_X_set[j]
    n = RP_runs_range[j]
    tot_accuracy = 0
    tot_time = 0
    for i in range (0, k):
        start = time.time()
        clf = MLPClassifier(hidden_layer_sizes=(50,)*(10))
        clf.fit(new_X, labels_true)
        predictions = clf.predict(new_X)
        delta = abs(predictions - labels_true)
        accuracy = 1 - np.mean(delta)
        original_time = time.time() - start
        tot_accuracy = tot_accuracy + accuracy
        tot_time = tot_time + original_time
    avg_accuracy = tot_accuracy/k
    avg_time = tot_time/k
    RP_accu_arr.append(avg_accuracy)
    RP_time_arr.append(avg_time)
    print ("n = " + str(n) + " accuracy = " + str(avg_accuracy) + ", time = " + str(avg_time))
print ("")

# NN on RP results
print ("RP reduce dimension")
RP_2_accu_arr = []
RP_2_time_arr = []
for j in range(0, len(rp_X_set_2)):
    new_X = rp_X_set_2[j]
    n = n_compo_range[j]
    tot_accuracy = 0
    tot_time = 0
    for i in range (0, k):
        start = time.time()
        clf = MLPClassifier(hidden_layer_sizes=(50,)*(10))
        clf.fit(new_X, labels_true)
        predictions = clf.predict(new_X)
        delta = abs(predictions - labels_true)
        accuracy = 1 - np.mean(delta)
        original_time = time.time() - start
        tot_accuracy = tot_accuracy + accuracy
        tot_time = tot_time + original_time
    avg_accuracy = tot_accuracy/k
    avg_time = tot_time/k
    RP_2_accu_arr.append(avg_accuracy)
    RP_2_time_arr.append(avg_time)
    print ("n = " + str(n) + " accuracy = " + str(avg_accuracy) + ", time = " + str(avg_time))


Original accuracy = 0.7454545454545455, time = 1.5949316024780273

PCA
n = 1 accuracy = 0.7079303675048356, time = 1.0626091003417968
n = 2 accuracy = 0.7206963249516442, time = 1.3513918399810791
n = 3 accuracy = 0.764796905222437, time = 1.9181020736694336
n = 4 accuracy = 0.7783365570599614, time = 1.825807237625122
n = 5 accuracy = 0.755899419729207, time = 1.3001336574554443
n = 6 accuracy = 0.7636363636363637, time = 1.5943079948425294
n = 7 accuracy = 0.790715667311412, time = 1.777907943725586
n = 8 accuracy = 0.7767891682785301, time = 1.7524633407592773
n = 9 accuracy = 0.7895551257253386, time = 1.8776212215423584
n = 10 accuracy = 0.7845261121856867, time = 1.5892150402069092
n = 11 accuracy = 0.7899419729206965, time = 1.6911307334899903
n = 12 accuracy = 0.7733075435203094, time = 1.5095304012298585

ICA select kurtosis
th = -5, n = 12 accuracy = 0.7079303675048356, time = 0.347103214263916
th = -1, n = 10 accuracy = 0.7114119922630561, time = 0.6558905601501465
th = 0, n

In [32]:
#Task 4 continue
#Plot
%matplotlib qt

#Accuracy plot
plt.figure()
plt.title("Accuracy: Neural Network on New Features")
plt.xlabel("Number of Components/Features")
plt.ylabel("Accuracy of Classification")
plt.plot(n_compo_range, PCA_accu_arr, marker='', linewidth=2, label="PCA")
plt.plot(n_compo_range, ICA_2_accu_arr, marker='', linewidth=2, label="ICA")
plt.plot(n_compo_range, RP_2_accu_arr, marker='', linewidth=2, label="RP")
plt.axhline(y=original_accuracy, linestyle='dashed', color = "red")
plt.legend()

#two ICA accuracy plots
#n_compo_ICA was determined by the threshold values
plt.figure()
plt.title("Accuracy: Neural Network on New Features ICA")
plt.xlabel("Number of Components/Features")
plt.ylabel("Accuracy of Classification")
plt.plot(n_compo_ICA, ICA_accu_arr, marker='', linewidth=2, label="ICA select Kurtosis")
plt.plot(n_compo_range, ICA_2_accu_arr, marker='', linewidth=2, label="ICA")
plt.axhline(y=original_accuracy, linestyle='dashed', color = "red")
plt.legend()

#RP multiple runs accuracy plot
plt.figure()
plt.title("Accuracy: Neural Network on New Features RP Multiple Runs")
plt.xlabel("Number of Runs")
plt.ylabel("Accuracy of Classification")
plt.plot(RP_runs_range, RP_accu_arr, marker='', linewidth=2, label="ICA")
plt.axhline(y=original_accuracy, linestyle='dashed', color = "red")
plt.legend()

#Running time plot
plt.figure()
plt.title("Running time: Neural Network on New Features")
plt.xlabel("Number of Components/Features")
plt.ylabel("Accuracy of Classification")
plt.plot(n_compo_range, PCA_time_arr, marker='', linewidth=2, label="PCA")
plt.plot(n_compo_range, ICA_2_time_arr, marker='', linewidth=2, label="ICA")
plt.plot(n_compo_range, RP_2_time_arr, marker='', linewidth=2, label="RP")
plt.axhline(y=original_time, linestyle='dashed', color = "red")
plt.legend()

plt.show()


In [33]:
#Task 5
#Create new data including the cluster results
#Recall in Task 3, ICA performs the best for clustering for both K means and EM
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn import metrics

#Will focus on the Forest Fire dataset only
df = df_set[0]
X = X_set[0]
labels_true = labels_true_set[0]
dataset_name = dataset_names[0]

n_compo = 5  #target number of clusters

pca_X_KM_set, pca_X_EM_set = ([], [])
ica_X_KM_set, ica_X_EM_set = ([], [])
rp_X_KM_set, rp_X_EM_set = ([], [])

#clustering on PCA results, add in new feture
for pca_X in pca_X_set:
    #get k means and EM clusters
    kmeans = KMeans(n_clusters=n_compo, random_state=0).fit(pca_X)
    em =  GaussianMixture(n_components= n_compo, max_iter = 500, random_state=0).fit(pca_X)
    labels_pred_kmeans = kmeans.predict(pca_X)
    labels_pred_em = em.predict(pca_X)
    #add the clustering feature into the data
    pca_X_KM = pca_X.copy()
    pca_X_KM['clustering'] = labels_pred_kmeans
    pca_X_KM_set.append(pca_X_KM)
    pca_X_EM = pca_X.copy()
    pca_X_EM['clustering'] = labels_pred_em
    pca_X_EM_set.append(pca_X_EM)
print ("PCA done")

#clustering on ICA results, add in new feture
for ica_X in ica_X_set:
    #get k means and EM clusters
    kmeans = KMeans(n_clusters=n_compo, random_state=0).fit(ica_X)
    em =  GaussianMixture(n_components= n_compo, max_iter = 500, random_state=0).fit(ica_X)
    labels_pred_kmeans = kmeans.predict(ica_X)
    labels_pred_em = em.predict(ica_X)
    #add the clustering feature into the data
    ica_X_KM = ica_X.copy()
    ica_X_KM['clustering'] = labels_pred_kmeans
    ica_X_KM_set.append(ica_X_KM)
    ica_X_EM = ica_X.copy()
    ica_X_EM['clustering'] = labels_pred_em
    ica_X_EM_set.append(ica_X_EM)
print ("ICA done")

#clustering on RP results, add in new feture
for rp_X in rp_X_set_2:
    #get k means and EM clusters
    kmeans = KMeans(n_clusters=n_compo, random_state=0).fit(rp_X)
    em =  GaussianMixture(n_components= n_compo, max_iter = 500, random_state=0).fit(rp_X)
    labels_pred_kmeans = kmeans.predict(rp_X)
    labels_pred_em = em.predict(rp_X)
    #add the clustering feature into the data
    rp_X_KM = rp_X.copy()
    rp_X_KM['clustering'] = labels_pred_kmeans
    rp_X_KM_set.append(rp_X_KM)
    rp_X_EM = rp_X.copy()
    rp_X_EM['clustering'] = labels_pred_em
    rp_X_EM_set.append(rp_X_EM)
print ("RP done")


PCA done
ICA done
RP done


In [34]:
#Task 5 continue
#Run NN - PCA
from sklearn.neural_network import MLPClassifier

k = 5  #number of trials

#On PCA clustering results
pca_accu_arr, pca_KM_accu_arr, pca_EM_accu_arr = ([], [], [])
pca_time_arr, pca_KM_time_arr, pca_EM_time_arr = ([], [], [])

for j in range(0, len(pca_X_set)):
    new_X = pca_X_set[j]
    new_X_KM = pca_X_KM_set[j]
    new_X_EM = pca_X_EM_set[j]
    df_set = [new_X, new_X_KM, new_X_EM]
    df_names = ["PCA", "PCA_KM", "PCA_EM"]
    for m in range(0, 3):
        start = time.time()
        df = df_set[m]
        df_name = df_names[m]
        tot_accuracy = 0
        for i in range (0, k):
            clf = MLPClassifier(hidden_layer_sizes=(50,)*(10))
            clf.fit(df, labels_true)
            predictions = clf.predict(df)
            delta = abs(predictions - labels_true)
            accuracy = 1 - np.mean(delta)
            tot_accuracy = tot_accuracy + accuracy
        accuracy = tot_accuracy/k
        t = (time.time() - start)/k
        if m == 0:
            pca_accu_arr.append(accuracy)
            pca_time_arr.append(t)
        if m == 1:
            pca_KM_accu_arr.append(accuracy)
            pca_KM_time_arr.append(t)
        if m == 2:
            pca_EM_accu_arr.append(accuracy)
            pca_EM_time_arr.append(t)
    if np.remainder(j+1, 3) == 0:
            print ("PCA num of compos = " + str(j+1) + " done")
print ("PCA")
print (pca_accu_arr)
print (pca_time_arr)
print ("PCA KM")
print (pca_KM_accu_arr)
print (pca_KM_time_arr)
print ("PCA EM")
print (pca_EM_time_arr)

PCA num of compos = 3 done
PCA num of compos = 6 done
PCA num of compos = 9 done
PCA num of compos = 12 done
PCA
[0.7083172147001935, 0.7241779497098646, 0.7524177949709865, 0.7601547388781432, 0.7547388781431333, 0.7767891682785301, 0.7872340425531915, 0.7686653771760155, 0.7849129593810444, 0.7810444874274662, 0.7829787234042553, 0.774468085106383]
[3.6715985774993896, 4.81003360748291, 5.580672025680542, 5.828806400299072, 4.261039733886719, 5.980933666229248, 5.0722503662109375, 4.741889381408692, 5.531773805618286, 4.949737787246704, 4.84015965461731, 5.070322799682617]
PCA KM
[0.7083172147001935, 0.723404255319149, 0.7624758220502902, 0.7644100580270793, 0.7705996131528046, 0.7705996131528046, 0.7798839458413926, 0.7678916827852998, 0.7740812379110252, 0.7880077369439071, 0.7880077369439071, 0.7880077369439072]
[3.841371774673462, 3.568989372253418, 6.188988780975341, 4.661727046966552, 5.601650810241699, 5.289709424972534, 5.46058759689331, 4.690083789825439, 5.471415615081787, 

In [35]:
#Task 5 continue
#Run NN - ICA
from sklearn.neural_network import MLPClassifier

#On ICA clustering results
ica_accu_arr, ica_KM_accu_arr, ica_EM_accu_arr = ([], [], [])
ica_time_arr, ica_KM_time_arr, ica_EM_time_arr = ([], [], [])

for j in range(0, len(ica_X_set)):
    new_X = ica_X_set[j]
    new_X_KM = ica_X_KM_set[j]
    new_X_EM = ica_X_EM_set[j]
    df_set = [new_X, new_X_KM, new_X_EM]
    df_names = ["ICA", "ICA_KM", "ICA_EM"]
    for m in range(0, 3):
        df = df_set[m]
        df_name = df_names[m]
        tot_accuracy = 0
        start = time.time()
        for i in range (0, k):
            clf = MLPClassifier(hidden_layer_sizes=(50,)*(10))
            clf.fit(df, labels_true)
            predictions = clf.predict(df)
            delta = abs(predictions - labels_true)
            accuracy = 1 - np.mean(delta)
            tot_accuracy = tot_accuracy + accuracy
        accuracy = tot_accuracy/k
        t = (time.time() - start)/k
        if m == 0:
            ica_accu_arr.append(accuracy)
            ica_time_arr.append(t)
        if m == 1:
            ica_KM_accu_arr.append(accuracy)
            ica_KM_time_arr.append(t)
        if m == 2:
            ica_EM_accu_arr.append(accuracy)
            ica_EM_time_arr.append(t)
    if np.remainder(j+1, 3) == 0:
            print ("ICA iteration " + str(j+1) + " done")
print ("ICA")
print (ica_accu_arr)
print (ica_time_arr)
print ("ICA KM")
print (ica_KM_accu_arr)
print (ica_KM_time_arr)
print ("ICA EM")
print (ica_EM_accu_arr)
print (ica_EM_time_arr)
print ("")



ICA iteration 3 done
ICA iteration 6 done
ICA
[0.732688588007737, 0.7365570599613153, 0.7079303675048356, 0.7114119922630561, 0.7079303675048356, 0.7195357833655708, 0.7079303675048356]
[3.769902801513672, 4.37024941444397, 1.021385383605957, 1.5894762039184571, 1.079571008682251, 4.230891370773316, 1.129565668106079]
ICA KM
[0.7284332688588007, 0.7292069632495164, 0.7264990328820116, 0.7264990328820117, 0.7245647969052225, 0.7226305609284334, 0.7079303675048356]
[4.842516803741455, 3.6998830318450926, 3.93770318031311, 6.360902929306031, 5.548589563369751, 6.090720558166504, 1.059845781326294]
ICA EM
[0.7597678916827852, 0.7257253384912958, 0.7415860735009672, 0.7261121856866538, 0.7268858800773694, 0.7210831721470019, 0.7083172147001935]
[6.988400411605835, 5.33152437210083, 7.062354803085327, 6.330043172836303, 5.340649557113648, 6.7194578647613525, 3.230103588104248]



In [36]:
#Task 5 continue
#Run NN - RP
from sklearn.neural_network import MLPClassifier

#On RP clustering results
rp_accu_arr, rp_KM_accu_arr, rp_EM_accu_arr = ([], [], [])
rp_time_arr, rp_KM_time_arr, rp_EM_time_arr = ([], [], [])

for j in range(0, len(rp_X_set_2)):
    new_X = rp_X_set_2[j]
    new_X_KM = rp_X_KM_set[j]
    new_X_EM = rp_X_EM_set[j]
    df_set = [new_X, new_X_KM, new_X_EM]
    df_names = ["RP", "RP_KM", "RP_EM"]
    for m in range(0, 3):
        df = df_set[m]
        df_name = df_names[m]
        tot_accuracy = 0
        start = time.time()
        for i in range (0, k):
            clf = MLPClassifier(hidden_layer_sizes=(50,)*(10))
            clf.fit(df, labels_true)
            predictions = clf.predict(df)
            delta = abs(predictions - labels_true)
            accuracy = 1 - np.mean(delta)
            tot_accuracy = tot_accuracy + accuracy
        accuracy = tot_accuracy/k
        t = (time.time() - start)/k
        if m == 0:
            rp_accu_arr.append(accuracy)
            rp_time_arr.append(t)
        if m == 1:
            rp_KM_accu_arr.append(accuracy)
            rp_KM_time_arr.append(t)
        if m == 2:
            rp_EM_accu_arr.append(accuracy)
            rp_EM_time_arr.append(t)
    if np.remainder(j+1, 3) == 0:
            print ("RP n_compo =" + str(j+1) + " done")
print ("RP")
print (rp_accu_arr)
print (rp_time_arr)
print ("RP KM")
print (rp_KM_accu_arr)
print (rp_KM_time_arr)
print ("RP EM")
print (rp_EM_accu_arr)
print (rp_EM_time_arr)
print ("")

RP n_compo =3 done




RP n_compo =6 done
RP n_compo =9 done




RP n_compo =12 done
RP
[0.7079303675048356, 0.7110251450676982, 0.7110251450676983, 0.7106382978723405, 0.7079303675048356, 0.7152804642166345, 0.711798839458414, 0.7079303675048356, 0.7079303675048356, 0.7090909090909092, 0.7098646034816247, 0.7114119922630561]
[4.45073184967041, 4.039693355560303, 4.797753429412841, 4.920052766799927, 3.138338232040405, 6.111123418807983, 4.831844997406006, 3.649802398681641, 3.3276978492736817, 4.359951162338257, 3.6307352066040037, 5.499486351013184]
RP KM
[0.7079303675048356, 0.7102514506769826, 0.7098646034816248, 0.7121856866537717, 0.7079303675048356, 0.7079303675048356, 0.7125725338491297, 0.7079303675048356, 0.7087040618955512, 0.7094777562862669, 0.7083172147001935, 0.7125725338491297]
[4.518898153305054, 4.218515205383301, 4.331560182571411, 5.060601377487183, 2.800454616546631, 2.8182868003845214, 6.038364410400391, 3.4900137901306154, 3.5017589569091796, 2.3028822422027586, 3.0213999271392824, 5.580772399902344]
RP EM
[0.7079303675048356,

In [37]:
#Task 5 continue
#Plot
%matplotlib qt

#Accuracy plot
plt.figure()
plt.title("Accuracy: Neural Network on New Features including Clustering Result")
plt.xlabel("Number of Components/Features")
plt.ylabel("Accuracy of Classification")
#PCA: blue
plt.plot(n_compo_range, pca_accu_arr, marker='', linewidth=2, color = 'b', label="PCA")
plt.plot(n_compo_range, pca_KM_accu_arr, marker='', linewidth=2, color = 'b', linestyle = 'dashed', label="PCA K means")
plt.plot(n_compo_range, pca_EM_accu_arr, marker='', linewidth=2, color = 'b', linestyle = 'dashdot', label="PCA EM")
#ICA: magenta (pink)
plt.plot(n_compo_ICA, ica_accu_arr, marker='', linewidth=2, color = 'm', label="ICA")
plt.plot(n_compo_ICA, ica_KM_accu_arr, marker='', linewidth=2, color = 'm', linestyle = 'dashed', label="ICA K means")
plt.plot(n_compo_ICA, ica_EM_accu_arr, marker='', linewidth=2, color = 'm', linestyle = 'dashdot', label="ICA EM")
#RP: green
plt.plot(n_compo_range, rp_accu_arr, marker='', linewidth=2, color = 'g', label="RP")
plt.plot(n_compo_range, rp_KM_accu_arr, marker='', linewidth=2, color = 'g', linestyle = 'dashed', label="RP K means")
plt.plot(n_compo_range, rp_EM_accu_arr, marker='', linewidth=2, color = 'g', linestyle = 'dashdot', label="RP EM")
#original_accuracy was obtained in Task 4
plt.axhline(y=original_accuracy, marker = "_", linestyle='dotted', color = "red")
plt.legend()

#Running time plot
plt.figure()
plt.title("Running Time: Neural Network on New Features including Clustering Result")
plt.xlabel("Number of Components/Features")
plt.ylabel("Running Time")
#PCA: blue
plt.plot(n_compo_range, pca_time_arr, marker='', linewidth=2, color = 'b', label="PCA")
plt.plot(n_compo_range, pca_KM_time_arr, marker='', linewidth=2, color = 'b', linestyle = 'dashed', label="PCA K means")
plt.plot(n_compo_range, pca_EM_time_arr, marker='', linewidth=2, color = 'b', linestyle = 'dashdot', label="PCA EM")
#ICA: magenta (pink)
plt.plot(n_compo_ICA, ica_time_arr, marker='', linewidth=2, color = 'm', label="ICA")
plt.plot(n_compo_ICA, ica_KM_time_arr, marker='', linewidth=2, color = 'm', linestyle = 'dashed', label="ICA K means")
plt.plot(n_compo_ICA, ica_EM_time_arr, marker='', linewidth=2, color = 'm', linestyle = 'dashdot', label="ICA EM")
#RP: green
plt.plot(n_compo_range, rp_time_arr, marker='', linewidth=2, color = 'g', label="RP")
plt.plot(n_compo_range, rp_KM_time_arr, marker='', linewidth=2, color = 'g', linestyle = 'dashed', label="RP K means")
plt.plot(n_compo_range, rp_EM_time_arr, marker='', linewidth=2, color = 'g', linestyle = 'dashdot', label="RP EM")
#original_time was obtained in Task 4
plt.axhline(y=original_time, marker = "_", linestyle='dotted', color = "red")
plt.legend()


plt.show()
