In [1]:
import sys
sys.path.append("./scripts")
from helpers import *
from proj1_helpers import *
import numpy as np
import csv
from math import sqrt
from matplotlib import pyplot as plt

Prepare the data: impute missing values with the mean, then standardize the features.

In [2]:
def prepare_data(dataPath):
    """Load the data set, impute missing values and standardize the features.

    Entries equal to -999 are treated as missing. Each missing entry is
    replaced by the mean of the observed (non-missing) values of its own
    column, so that one feature's scale does not leak into another feature.

    Parameters
    ----------
    dataPath : str
        Path of the CSV file, passed unchanged to ``load_csv_data``.

    Returns
    -------
    The standardized feature matrix, i.e. the first value returned by
    ``standardize``; the mean and standard deviation it also returns are
    discarded.
    """
    yb, input_data, ids = load_csv_data(dataPath)  # load data

    # Use a float array so the -999 sentinel can be swapped for NaN.
    input_data = np.asarray(input_data, dtype=float)
    input_data[input_data == -999] = np.nan

    # Per-column mean of the observed values. A column that is entirely
    # missing has no observed mean: it stays NaN (numpy emits a warning).
    col_mean = np.nanmean(input_data, axis=0)
    emptyIndex = np.isnan(input_data)
    input_data = np.where(emptyIndex, col_mean, input_data)

    # build_model_data constructs the model matrix; its first column is then
    # dropped (per the original comment it is a column of ones -- TODO confirm
    # against the helper's implementation).
    y, tx = build_model_data(input_data, yb)
    tx = np.delete(tx, 0, axis=1)

    # standardize the data
    standardized_data, mean, std = standardize(tx)
    return standardized_data


Compute the correlation matrix: $\mathrm{Corr}(I, J) = \dfrac{E[IJ] - E[I]\,E[J]}{\sqrt{\mathrm{Var}(I)\,\mathrm{Var}(J)}}$

In [28]:
def compute_correlation(data):
    """Compute the Pearson correlation matrix of the columns of ``data``.

    Corr(I, J) = Cov(I, J) / sqrt(Var(I) * Var(J)), using population
    (divide-by-n) moments. The covariance is computed from mean-centred
    columns, which is algebraically equal to E[IJ] - E[I]E[J] but avoids the
    cancellation error of subtracting two large, nearly equal numbers.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        One observation per row, one feature per column.

    Returns
    -------
    numpy.ndarray of shape (n_features, n_features)
        Symmetric matrix of correlation coefficients. The diagonal is always
        1. An off-diagonal entry involving a zero-variance (constant) column
        is undefined and reported as NaN.
    """
    data = np.asarray(data, dtype=float)
    dimension = data.shape[1]  # number of columns (features)

    # Centre every column once instead of recomputing means for each pair.
    centered = data - data.mean(axis=0)
    covariance = centered.T.dot(centered) / data.shape[0]

    # The diagonal of the covariance matrix holds the column variances.
    std = np.sqrt(np.diag(covariance))
    denominator = np.outer(std, std)

    # Divide only where the denominator is positive; every other entry keeps
    # its NaN initial value instead of triggering a divide-by-zero warning.
    correlation_matrix = np.full((dimension, dimension), np.nan)
    np.divide(covariance, denominator, out=correlation_matrix,
              where=denominator > 0)

    # A feature is perfectly correlated with itself by definition.
    np.fill_diagonal(correlation_matrix, 1.0)
    return correlation_matrix

Format the correlation matrix as CSV lines and save it to a CSV file

In [22]:
def parse_and_save(result, labels, filePath='./analysis/correlation.csv'):
    """Write a labelled matrix to a CSV file.

    The file starts with a header line ``Labels,<label 0>,<label 1>,...``
    followed by one line per matrix row: the row's label, then the row's
    values, all comma-separated. Values and labels are written with ``str``;
    no quoting or escaping is applied.

    Parameters
    ----------
    result : array-like, 2-D
        Matrix to save (e.g. a correlation matrix).
    labels : sequence
        One label per row/column. Any indexable sequence works; entries are
        converted with ``str``.
    filePath : str, optional
        Destination file. Defaults to the location the function always used.
        The parent directory must already exist.

    Raises
    ------
    IndexError
        If ``labels`` has fewer entries than ``result`` has rows.
    """
    result = np.asarray(result)

    header = ",".join(["Labels"] + [str(label) for label in labels])

    # One CSV line per matrix row, prefixed with that row's label.
    rows = [
        ",".join([str(labels[i])] + [str(value) for value in result[i]])
        for i in range(result.shape[0])
    ]

    # write result to csv file
    with open(filePath, 'w') as file:
        for line in [header] + rows:
            file.write(line)
            file.write('\n')
    

Visualize the correlation matrix as a heat map

In [38]:
import matplotlib.pyplot as plt
def plot_correlation(correlation_matrix, labels, filePath):
    """Render a matrix as a labelled heat map and save it to a file.

    Parameters
    ----------
    correlation_matrix : 2-D array-like
        Values to display; passed to ``imshow``.
    labels : sequence
        Tick labels, used for both axes (x labels are drawn vertically).
    filePath : str
        Destination handed to ``savefig``; the output format follows from
        the file extension.

    Only the figure created here is closed afterwards (also when saving
    fails), so figures opened elsewhere in the session are left untouched.
    """
    fig, ax = plt.subplots(figsize=(12, 12))
    try:
        image = ax.imshow(correlation_matrix, cmap='RdYlGn',
                          interpolation='none', aspect='auto')
        fig.colorbar(image, ax=ax)

        ticks = range(len(labels))
        ax.set_xticks(ticks)
        ax.set_xticklabels(labels, rotation='vertical')
        ax.set_yticks(ticks)
        ax.set_yticklabels(labels)

        fig.suptitle('Higgs Boson Correlation Heat Map', fontsize=15,
                     fontweight='bold')
        fig.savefig(filePath)
    finally:
        # Release the figure's memory; run() calls this once per exponent.
        plt.close(fig)

In [41]:
def run(dataPath='../data/train.csv', powers=range(6, 13),
        columns=(14, 15, 17, 18, 20)):
    """Plot one correlation heat map per exponent applied to selected features.

    For every ``power`` in ``powers`` the selected feature columns of the
    prepared data are raised element-wise to that power, the correlation
    matrix of all features is computed, and the heat map is saved to
    ``./analysis/correlation<power>.pdf``.

    Parameters
    ----------
    dataPath : str, optional
        CSV file to analyse. Its first line is read as the header.
    powers : iterable of int, optional
        Exponents to try; defaults to 6 through 12.
    columns : sequence of int, optional
        Indices (into the prepared feature matrix) of the columns that are
        raised to each power.
    """
    with open(dataPath, "r") as csvfile:
        header = next(csv.reader(csvfile))

    # Drop the first two header fields so the remaining names are used as the
    # feature labels (presumably the id and label columns -- TODO confirm
    # against load_csv_data).
    features = np.asarray(header)[2:]

    # Loading and standardizing does not depend on the exponent: do it once
    # and start every iteration from an untouched copy.
    prepared_data = prepare_data(dataPath)
    columns = list(columns)

    for power in powers:
        data = prepared_data.copy()
        data[:, columns] = np.power(data[:, columns], power)
        correlation_matrix = compute_correlation(data)
        fileName = "./analysis/correlation" + str(power) + ".pdf"
        plot_correlation(correlation_matrix, features, fileName)
  
        
   

In [42]:
# Execute the analysis: run() prepares the data and saves one correlation
# heat map per exponent under ./analysis/.
run()