In [1]:
import os
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')
# Input CSV that is read for the dimensionality reduction
dataset = '/content/drive/My Drive/visual/data/preprocessed_data.csv'
# Output CSV written by generateFile
to_save = '/content/drive/My Drive/visual/data/data.csv'
print(dataset)

Mounted at /content/drive
/content/drive/My Drive/visual/data/preprocessed_data.csv


In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [3]:
def generateFile(label, Y, dataFile, outFile=None):
    """Write a copy of a CSV file extended with the first two PCA coordinates.

    Each data row of ``dataFile`` is copied verbatim and followed by the
    values ``Y[i][0]`` and ``Y[i][1]``; the header row is rebuilt from
    ``label`` plus the two new column names ``PCA_1`` and ``PCA_2``.

    Parameters
    ----------
    label : sequence of str
        Column names of the original file (surrounding whitespace is stripped).
    Y : sequence of indexable rows
        Row ``i`` supplies the two values appended to data row ``i`` of
        ``dataFile``; only indices 0 and 1 of each row are used.
    dataFile : str
        Path of the input CSV; its first line is treated as a header and
        discarded.
    outFile : str, optional
        Path of the CSV to write. Defaults to the module-level ``to_save``.

    Notes
    -----
    Fields are joined with a bare ``,`` (no padding spaces) so column names
    and values can be parsed without trimming. Writing stops at the shorter
    of the input file and ``Y``.
    """
    if outFile is None:
        # Fall back to the notebook-level output path
        outFile = to_save

    header = [name.strip() for name in label] + ['PCA_1', 'PCA_2']

    # Context managers guarantee both files are closed, even on error
    with open(dataFile) as fin, open(outFile, 'w') as fout:
        # write header
        print(','.join(header), file=fout)

        # skip the header line of the input file
        fin.readline()

        # write values: every row of Y is used, including the last one
        for line, components in zip(fin, Y):
            print(line.strip(), components[0], components[1],
                  sep=',', file=fout)


# Column names of the preprocessed CSV, in file order
att = [
    'country',
    'year',
    'sex',
    'age',
    'suicides_no',
    'population',
    'suicides_pop',
    'gdp_for_year',
    'gdp_per_capita',
]

Read the data from the CSV file and drop the columns that are not used for dimensionality reduction: 'country', 'year', 'sex', and 'age'.

In [4]:
# Load the CSV and convert the whole table to a NumPy array
raw = pd.read_csv(dataset).to_numpy()

# Remove the first four columns (positions 0-3); the rest feed the PCA
d = np.delete(raw, [0, 1, 2, 3], axis=1)
print(d)

[[21 312900 6.71 2156624900 796]
 [16 308000 5.19 2156624900 796]
 [14 289700 4.83 2156624900 796]
 ...
 [60 2762158 2.17 63067077179 2309]
 [44 2631600 1.67 63067077179 2309]
 [21 1438935 1.46 63067077179 2309]]


Apply standardization and PCA

In [5]:
# Standardize each column with StandardScaler; nan_to_num then replaces
# any NaN left in the scaled array with 0
scaler = preprocessing.StandardScaler()
d_std = np.nan_to_num(scaler.fit_transform(d))

# Fit a PCA with default settings and project the standardized data.
# d_pca is the numpy array of transformed data; pca is the fitted
# estimator, which exposes attributes such as explained_variance_
pca = PCA()
d_pca = pca.fit_transform(d_std)

In [6]:
# Show the fitted PCA estimator and the parameters it was created with
print(pca)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)


In [7]:
# Show the data after projection onto the principal components
print(d_pca)

[[-0.74740127  0.19892351 -0.69579194 -0.06503264  0.21886193]
 [-0.76254567  0.13669734 -0.73607191 -0.04009215  0.22947235]
 [-0.76918768  0.12196452 -0.74416708 -0.0345129   0.23489862]
 ...
 [-0.35073855 -0.06198662 -0.95612282  0.00391314 -0.18297914]
 [-0.38349085 -0.08553667 -0.95788745  0.00334634 -0.1607709 ]
 [-0.57801308 -0.0807268  -0.86554947  0.01041627  0.05670586]]


Make new data file

In [8]:
# Write the output CSV: rows of `dataset` extended with columns PCA_1 and PCA_2
generateFile(att, d_pca, dataset)

Analysis of the results

In [9]:
def _print_covariance_report(data, variance_label, separator):
    """Print per-axis variances and the covariance matrix of ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D numeric array with one observation per row and one variable
        per column (it is transposed before being passed to ``np.cov``).
    variance_label : str
        Text printed before the 1-based axis number on each variance line.
    separator : str
        Line printed after the matrix to close the report.

    Returns
    -------
    numpy.ndarray
        The covariance matrix that was printed.
    """
    cov = np.cov(data.T)

    # The diagonal of the covariance matrix holds the variance of each axis
    for axis, row in enumerate(cov):
        print(variance_label + str(axis + 1), row[axis])

    print('Covariance matrix')
    for row in cov:
        for value in row:
            print('%.2f ' % value, end='\t')
        print()
    print(separator)
    return cov


# Original data: cast to float first so np.cov receives a numeric array
d_cov = _print_covariance_report(
    d.astype(float),
    'Variance original data axis X',
    '-------------------------------------')

# Standardized data
d_cov = _print_covariance_report(
    d_std,
    'Variance original normalized data axis X',
    '-------------------------------------\n')

# Data projected onto the principal components
d_cov = _print_covariance_report(
    d_pca,
    'Variance transformed data axis Y',
    '-------------------------------------\n')

# Fraction of the total variance explained by each component, as reported
# by the fitted PCA; the first two entries are summed
v = pca.explained_variance_ratio_
print('Cumulated variance of the first two PCA components:',
      (v[0]+v[1]))


Variance original data axis X1 813690.4442639098
Variance original data axis X2 15302018400947.727
Variance original data axis X3 359.53889995312596
Variance original data axis X4 2.112981991227135e+24
Variance original data axis X5 356740544.9934135
Covariance matrix
813690.44 	2174197877.08 	5244.22 	563952803745438.12 	1044905.54 	
2174197877.08 	15302018400947.73 	614523.37 	4041168296153163264.00 	6022277083.17 	
5244.22 	614523.37 	359.54 	695671256069.23 	639.32 	
563952803745438.12 	4041168296153163264.00 	695671256069.23 	2112981991227134939496448.00 	8330023033854971.00 	
1044905.54 	6022277083.17 	639.32 	8330023033854971.00 	356740544.99 	
-------------------------------------
Variance original normalized data axis X1 1.0000359466551634
Variance original normalized data axis X2 1.0000359466551643
Variance original normalized data axis X3 1.0000359466551638
Variance original normalized data axis X4 1.0000359466551625
Variance original normalized data axis X5 1.00003594665516