Use pandas to read a .csv file

In [1]:
import pandas as pd
import numpy as np
import scipy.spatial.distance as dist
import scipy.stats.mstats as mstats

filepath = '/root/projects/monty_python/AICS_Cell-feature-analysis_v1.5.csv'

df = pd.read_csv(filepath)

print(df)

         classes        Cell ID  Nuclear volume (fL)  Cellular volume (fL)  \
0          Tom20    AICS-11_0_2             26.40600              67.08528   
1          Tom20    AICS-11_0_3             19.34280              47.70900   
2          Tom20    AICS-11_0_4             29.33064              73.47996   
3          Tom20    AICS-11_0_5             21.96504              60.71868   
4          Tom20    AICS-11_0_6             27.75060              74.02212   
5          Tom20    AICS-11_1_2             24.55920              64.33668   
6          Tom20    AICS-11_1_3             35.02224              98.23572   
7          Tom20    AICS-11_1_5             31.79628             102.03730   
8          Tom20    AICS-11_2_1             24.27624              55.38672   
9          Tom20    AICS-11_2_2             39.85740             100.41190   
10         Tom20    AICS-11_2_3             22.52988              55.22256   
11         Tom20    AICS-11_2_4             22.71888            

Print off the column names, and store two variables, one that contains the class information, and one that contains the features

In [2]:
column_names = list(df.columns.values)

print(column_names)

features = df[column_names[2:]]
classes = df['classes']

['classes', 'Cell ID', 'Nuclear volume (fL)', 'Cellular volume (fL)', 'Nuclear surface area', 'Cellular surface area', 'Radial proximity (unitless)', 'Apical proximity (unitless)']


Index the class names, and convert the feature array to a numpy array.

In [3]:
uclass, class_inds = np.unique(classes, return_inverse=True)
nclasses = len(uclass)
ndat = len(class_inds)
features = features.as_matrix(columns=features.columns[0:])

features = mstats.zscore(features, 0)

Compute a distance matrix from all pairs of features

In [4]:
dmat = dist.squareform(dist.pdist(features))
np.fill_diagonal(dmat, float('Inf'))

min_inds = np.argmin(dmat,1)
pred_inds = class_inds[min_inds]

In [5]:
confmat = np.zeros((nclasses,nclasses))

for i in range(0, ndat):
    confmat[class_inds[i]-1, pred_inds[i]-1] +=1
    
print(confmat)

[[ 529.  101.   17.    3.    3.   13.   94.  176.    7.  100.]
 [  87.  224.    8.    0.    1.   22.   10.   41.    1.  148.]
 [  27.    3.   75.    4.    0.    5.   28.   12.   56.   19.]
 [   2.    0.    1.  944.   39.    0.    1.    0.    0.    1.]
 [   2.    1.    0.   42.  714.    3.   18.    4.    0.    1.]
 [  15.   24.    8.    2.    1.   40.   13.   16.    6.   32.]
 [ 106.   14.   29.    1.   12.   16.  606.   29.    0.   22.]
 [ 163.   47.   11.    0.    4.   17.   27.  428.    5.   69.]
 [   3.    0.   49.    1.    0.    3.    0.    6.  166.    6.]
 [  90.  137.   18.    1.    2.   29.   21.   65.    4.  126.]]


In [6]:
uclass

array(['Alpha actinin', 'Alpha tubulin', 'Beta actin', 'Desmoplakin',
       'Fibrillarin', 'Lamin B1', 'Myosin IIB', 'Sec61 beta', 'Tom20',
       'ZO1'], dtype=object)