In [2]:
#Import packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import os
from scipy import stats
import scipy as sp


In [None]:
# Locate the dataset: point the session at the folder that holds the data
# file, then echo the resulting working directory so that a wrong path is
# spotted straight away.
folder_directory = 'folder/here'
os.chdir(folder_directory)

print(os.getcwd())

In [None]:
# Read in the dataframe
#
# Replace the placeholder file name below with the name of your own data file,
# including the .csv extension and wrapped in quotes, e.g. 'student1.csv'.
# The assignment needs a value on its right-hand side: leaving it blank is a
# SyntaxError, which stops the cell from being parsed at all.
data = 'student1.csv'

# Load the CSV (looked up relative to the current working directory)
data_frame = pd.read_csv(data)


In [None]:
# Discard every row that has at least one missing value, so that NaNs cannot
# affect the later analysis. dropna returns a new frame; data_frame itself is
# left as loaded.
data_frame_reduced = data_frame.dropna(axis=0, how='any')

# Sanity check: summarise the cleaned frame to confirm nothing unexpected happened
data_frame_reduced.info()


Step 1: Standardisation or scaling the data
This enables analysis of parameters whose values span very different ranges, e.g. aspect ratio lies between 0 and 1 whereas area might be around 100. 
Essentially, you want to standardise the range of the continuous initial variables so that each of them contributes equally to the analysis. 
If there are large differences between the ranges of the initial variables, the variables with larger ranges will dominate over those with smaller ranges.

In [None]:
# Standardise the features ahead of the PCA that reduces the data to 2D

from sklearn.preprocessing import StandardScaler

# Columns of the data set that are included in the analysis
features = [
    'extent', 'major_axis_length', 'minor_axis_length', 'eccentricity',
    'roundness', 'circularity', 'area', 'cell_shape_index', 'perimeter',
]

# Target labels, kept separate from the features
y = data_frame_reduced.loc[:, ['cell_type']].values

# Pull the feature columns out as an array and standardise them in one step
x = StandardScaler().fit_transform(data_frame_reduced.loc[:, features].values)


Step 2: Covariance matrix computation
The aim of this step is to understand how the variables of the input data are varying from the mean with respect to each other. In other words to see if there is a relationship between them. 
Sometimes variables are highly correlated with each other in such a way that they contain redundant information. 
So, in order to identify such correlations, we compute the covariance matrix.

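The covariance matrix is a p × p symmetric matrix (where p is the number of dimensions) whose entries are the covariances associated with all possible pairs of the initial variables. For example, for a 3-dimensional data set with 3 variables x, y, and z, the covariance matrix is a 3×3 matrix of this form:

$$
\begin{bmatrix}
\mathrm{Cov}(x,x) & \mathrm{Cov}(x,y) & \mathrm{Cov}(x,z) \\
\mathrm{Cov}(y,x) & \mathrm{Cov}(y,y) & \mathrm{Cov}(y,z) \\
\mathrm{Cov}(z,x) & \mathrm{Cov}(z,y) & \mathrm{Cov}(z,z)
\end{bmatrix}
$$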

What do the covariances in the matrix tell us about the variables?
If positive: the two variables increase or decrease together (correlated).
If negative: one increases when the other decreases (inversely correlated).

In [None]:
from sklearn.decomposition import PCA

# Number of components to keep: 2 gives PC1 and PC2
pca = PCA(n_components=2)

# Fit the PCA on x and project x onto the two components
principalComponents = pca.fit_transform(x)

# Wrap the projected coordinates in a DataFrame with named columns
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# principalDf was built from a plain array, so its index runs 0..n-1, whereas
# data_frame_reduced still carries the row labels it had before dropna removed
# rows. pd.concat(axis=1) aligns on the index, so the labels are renumbered
# first to make the two frames line up row by row.
# drop=True discards the old labels instead of inserting them as a column.
# That keeps the cell safe to re-run: without it each run adds another column
# ('index', then 'level_0') and a third run raises a ValueError.
data_frame_reduced = data_frame_reduced.reset_index(drop=True)

# Attach the cell type of every row to its principal-component coordinates
finalDf = pd.concat([principalDf, data_frame_reduced[['cell_type']]], axis=1)

# Check the dataframes
print(principalDf)

print(finalDf)

In [None]:
# Plot the results: PC1 against PC2, one colour per cell type

fig, ax = plt.subplots(figsize=(8, 8))  # a single 8x8 panel

ax.tick_params(width=3)  # width of the axis ticks

# Give all four borders of the plot the same line width
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_linewidth(3)

# x and y labels
ax.set_xlabel('PC1', fontsize=15, fontname="Helvetica")
ax.set_ylabel('PC2', fontsize=15, fontname="Helvetica")

targets = ['secondary', 'tertiary']
colors = ['m', 'g', 'b']  # zip stops at the shorter list, so 'b' is unused here

# One scatter call per cell type, each in its own colour
for cell_type, colour in zip(targets, colors):
    in_group = finalDf['cell_type'] == cell_type
    group = finalDf.loc[in_group]
    ax.scatter(
        group['principal component 1'],
        group['principal component 2'],
        c=colour,
        s=60,
    )

# Legend entries follow the order in which the groups were drawn
ax.legend(targets, fontsize=15)

sns.despine()  # seaborn default: strip the top and right borders
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# To save the figure, remove the leading hash and replace 'name' with something meaningful
#plt.savefig('name.png', dpi=500)

In [None]:
# Retrieve the contribution of every parameter to each principal component,
# i.e. the component vectors found by a PCA fitted with all components kept.

pca_out = PCA().fit(x)

# components_ holds one row per principal component and one column per feature
loadings = pca_out.components_

# Count the fitted components instead of reading PCA.n_features_: that
# attribute was deprecated in scikit-learn 1.2 and removed in 1.4, and the
# number of components is what the PC labels have to match in any case.
num_pc = pca_out.n_components_
pc_list = ["PC" + str(i) for i in range(1, num_pc + 1)]

# Transpose so that rows are the variables and columns are the components
loadings_df = pd.DataFrame(
    loadings.T,
    columns=pc_list,
    index=pd.Index(features, name='variable'),
)

# A bare expression in the middle of a cell is not displayed, so print the table
print(loadings_df)

# Display the results as a heatmap
ax = sns.heatmap(loadings_df, annot=True, cmap='Spectral')
plt.show()

In [None]:
# Package used for plotting the correlation circle
from mlxtend.plotting import plot_pca_correlation_graph

# Draw the correlation circle of the features for components 1 and 2
feature, correlatoin_matrix = plot_pca_correlation_graph(
    x,
    features,
    dimensions=(1, 2),
    figure_axis_size=10,
)

# Title (placeholder text - replace with something meaningful) and axis labels
plt.title('title', fontsize=20)
plt.xlabel('PC1')
plt.ylabel('PC2')

# To save the figure, remove the leading hash and replace 'name' with something meaningful
#plt.savefig('name.png', dpi=500)