In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
def get_feature_frame(file, benchmarks):
    '''Load a header-less feature CSV and index its rows by benchmark name.

    file       -- path (or file-like object) handed straight to pd.read_csv
    benchmarks -- one label per CSV row, assigned positionally

    Returns a DataFrame whose index is named 'benchmark' and whose columns
    are the integer positions of the CSV fields.
    '''
    return (
        pd.read_csv(file, header=None)
        .assign(benchmark=benchmarks)
        .set_index('benchmark')
    )

def get_cmap(n, name='hsv'):
    '''Return a colormap resampled to n discrete entries.

    Calling the returned colormap with an integer index in 0, 1, ..., n-1
    yields an RGBA color; the keyword argument name must be a standard
    mpl colormap name.

    Note: cyclic maps such as the default 'hsv' begin and end on the same
    color, so index 0 and index n-1 look alike. Request one more entry than
    you intend to use when every index must be visually distinct.
    '''
    # plt.get_cmap is the supported accessor: matplotlib.cm.get_cmap was
    # deprecated in matplotlib 3.7 and removed in 3.9, while pyplot.get_cmap
    # keeps the same (name, lut) signature.
    return plt.get_cmap(name, n)

In [None]:
# Benchmark labels, listed in the order they are paired with the CSV rows.
npb_benchmarks = ['bt', 'cg', 'ep', 'ft', 'is', 'lu', 'mg', 'sp', 'ua']
spec_benchmarks = [
    'bwaves', 'botsalgn', 'botsspar', 'fma3d', 'ilbdc',
    'md', 'nab', 'smithwa', 'swim',
]

npb = get_feature_frame('NPB/features.csv', npb_benchmarks)
spec = get_feature_frame('SPEC/features.csv', spec_benchmarks)

In [None]:
# Preview the first rows of the NPB feature frame to sanity-check the load.
npb.head()

In [None]:
def visualise_pca(frames, names):
    '''Run PCA over the concatenated feature frames and plot the result.

    Draws a single figure with two panels:
      * left  -- scree plot (explained variance ratio per component);
      * right -- PC1 vs PC2 scatter, one series per entry of `frames`.

    Parameters
    ----------
    frames : list of pandas.DataFrame
        Feature frames. They are concatenated row-wise and standardised
        together, so they presumably share the same columns.
    names : list of str
        Legend label for each entry of `frames`, in the same order.

    Raises
    ------
    ValueError
        If the combined data has fewer than two rows or two columns, in
        which case a second principal component cannot be computed.
    '''
    data = pd.concat(frames)

    # PCA cannot extract more components than min(n_samples, n_features);
    # keep the historical cap of 5 but shrink it for small inputs.
    n_components = min(5, *data.shape)
    if n_components < 2:
        raise ValueError(
            'PCA scatter needs at least 2 samples and 2 features, '
            'got data of shape {}'.format(data.shape)
        )

    # Colors are looked up per series, so size the map by the number of
    # series. One extra entry is requested because the default cyclic 'hsv'
    # map starts and ends on the same color.
    cmap = get_cmap(len(frames) + 1)
    markers = ['o', 'x']

    # Run PCA
    X = StandardScaler().fit_transform(data.values)
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(X)
    component_labels = ['PC{}'.format(k) for k in range(1, n_components + 1)]

    fig, (scree_ax, scatter_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Generate SCREE Plot
    scree_ax.bar(component_labels, pca.explained_variance_ratio_)
    scree_ax.set_title('Explained Variance of Principal Components')
    scree_ax.set_xlabel('Principal Component')
    scree_ax.set_ylabel('Explained Variance Ratio')

    # Generate PC1/PC2 scatter; the title names the suites actually plotted.
    scatter_ax.set_xlabel('PC1')
    scatter_ax.set_ylabel('PC2')
    scatter_ax.set_title(
        '2 Component PCA for {} using Milepost features'.format(' and '.join(names))
    )

    # Rows of principal_components follow the concatenation order, so each
    # frame owns one contiguous slice of rows.
    start_index = 0
    for i, frame in enumerate(frames):
        end_index = start_index + len(frame)

        # `color=` (not `c=`) so a single RGBA tuple is never mistaken for
        # per-point values when a series happens to have 4 points.
        scatter_ax.scatter(principal_components[start_index:end_index, 0],
                           principal_components[start_index:end_index, 1],
                           color=cmap(i),
                           marker=markers[i % len(markers)],
                           label=names[i])
        start_index = end_index

    scatter_ax.legend()
    

In [None]:
# Joint PCA: both suites are scaled and projected together.
visualise_pca([npb, spec], ['NPB', 'SPEC'])

In [None]:
# PCA fitted on the SPEC frame alone.
visualise_pca([spec], ['SPEC'])

In [None]:
# PCA fitted on the NPB frame alone.
visualise_pca([npb], ['NPB'])

In [None]:



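# NOTE: leftover scratch code, kept for reference only. It refers to `ax`,
# `principal_frame` and `cmap`, which are local to visualise_pca(), so it
# cannot run as a standalone cell.
#
# Alternative 1: plot the SPEC rows as a single series.
#ax.scatter(principal_frame.iloc[len(npb):len(principal_frame)]['PC1'], principal_frame.iloc[len(npb):len(principal_frame)]['PC2'], label='SPEC')

# Alternative 2: one color and legend entry per benchmark.
# for i in range(len(principal_frame)):
#     benchmark = principal_frame.index[i]
#     ax.scatter(principal_frame.loc[benchmark, 'PC1'], principal_frame.loc[benchmark, 'PC2'], c=cmap(i), label=benchmark)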

