In [3]:
#=======================================================
# Purpose: PCA of Gene Expression Matrix with known GA
# Author:  Minghan Chen
# Date:    Sep 18, 2019
# Rev:     Sep 29, 2019
#=======================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


## Load inputs
# Two comma-separated tables are read from the working directory:
# the gene expression matrix and the table of known GA values.
gene_file = 'gene_known.csv'
df_X = pd.read_csv(gene_file, sep=',')

ga_file = 'ga_known.csv'
df_y = pd.read_csv(ga_file, sep=',')


## Processing
# Column 0 of df_X is excluded from the features
# (presumably it holds the sample ID -- confirm against the file).
expression_features = df_X.iloc[:, 1:]

# Standardize every remaining column before PCA.
X = StandardScaler().fit_transform(expression_features)

# PCA with default settings: fit on the standardized matrix, then project
# the same matrix onto the fitted components.
pca = PCA()
pca.fit(X)
X_pca = pca.transform(X)


# Font sizes shared by the figures drawn below.
# BIGGER_SIZE is defined but not used in the visible code.
SMALL_SIZE = 14
MEDIUM_SIZE = 16
BIGGER_SIZE = 18

# Apply all matplotlib font settings in a single rcParams update.
plt.rcParams.update({
    'font.size': SMALL_SIZE,           # default text size
    'axes.titlesize': SMALL_SIZE,      # axes title
    'axes.labelsize': MEDIUM_SIZE,     # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,     # x tick labels
    'ytick.labelsize': SMALL_SIZE,     # y tick labels
    'legend.fontsize': SMALL_SIZE,     # legend text
    'figure.titlesize': MEDIUM_SIZE,   # figure title
})


# Scree plot: percentage of variance explained by the leading components.
plt.figure(1)

# Explained variance per component, as a percentage rounded to one decimal.
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)

# Column labels 'PC1', 'PC2', ... -- one per fitted component.
# Later cells reuse `labels` and `per_var`.
labels = ['PC' + str(x) for x in range(1, len(per_var) + 1)]

# Plot at most the first 10 components. The x-range is clamped to the number
# of components actually available: a fixed range(1, 11) would not match the
# length of per_var[0:10] when fewer than 10 components exist, and plt.plot
# would raise on the shape mismatch.
n_shown = min(10, len(per_var))
plt.plot(range(1, n_shown + 1), per_var[:n_shown])

# Alternative view (all components as bars):
# plt.bar(x=range(1,len(per_var)+1), height=per_var, tick_label = labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')


# Scatter of the first two principal components, coloured by the second
# column of df_y (the colour bar labels it as gestational age in weeks).
plt.figure(2)
plt.axis('equal')

# Score table: one row per sample, one column per component.
# Rows are labelled with the first column of df_y; this assumes df_y is in
# the same row order as df_X -- TODO confirm, nothing here checks it.
pca_df = pd.DataFrame(X_pca, index=df_y.iloc[:, 0], columns=labels)

colour_values = df_y.iloc[:, 1]
points = plt.scatter(pca_df['PC1'], pca_df['PC2'], c=colour_values)

cbar = plt.colorbar(points)
# plt.title('PCA of Gene Expression Matrix (367 samples with GA values)')
cbar.set_label("Gestational Age (Week)")

# Axis labels carry the explained-variance percentage of each component.
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))

# Save before show().
plt.savefig('PCA_full_gene.eps', format='eps', bbox_inches='tight')

plt.show()

print(pca_df)

print()

print()

#for i in range (len(PCA))


# print top 200 genes (largest absolute loadings on PC1)
#loading_scores = pd.Series(pca.components_[0], index=df_X.columns.values[1:])
#sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
#top_200_genes = sorted_loading_scores[0:200].index

#print(top_200_genes[0])

#print(pd.DataFrame(top_200_genes))

#output_file = open("top_200_genes_in_PCA.txt","w+")



#for i in range(len(top_200_genes)):
  #  output_file.write('\n')
  #  output_file.write(top_200_genes[i][:-3])
  #  output_file.write('\n')
    

#output_file.close()


## print(loading_scores[top_200_genes])


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 2 Axes>

                        PC1         PC2        PC3         PC4        PC5  \
SampleID                                                                    
Tarca_001_P1A01   48.415217  -23.172146 -36.681264   84.067689  46.060434   
Tarca_013_P1B01   72.030665   -7.379045 -33.666351    8.737875  48.492014   
Tarca_025_P1C01   62.666571  -18.327693  -7.635471   -3.471000  43.024159   
Tarca_037_P1D01 -202.552636  -91.410888 -18.471907    6.705084  48.393191   
Tarca_049_P1E01  110.655232  -31.129540 -49.265705   26.342019  23.132400   
Tarca_061_P1F01  -79.117555  -73.173440  24.680049  -16.930545  40.096644   
Tarca_051_P1E03 -242.327327   -6.216876 -73.381941   32.589202  54.270219   
Tarca_063_P1F03   80.481703   23.550617 -65.573535  -23.542636   4.511838   
Tarca_075_P1G03   71.964441   51.565472 -36.168508  -25.378213  -1.962220   
Tarca_087_P1H03  -36.891458   14.018798 -36.565661   -1.411083 -24.998458   
Tarca_004_P1A04  169.297469  -11.880978 -32.995409  -13.135749 -22.915347   

In [4]:
# Keep every row of the score table, but only its first 200 component columns.
PCA_200 = pca_df.iloc[:, :200]

# Sample identifiers: the first column of df_y, collected row by row.
# A later cell uses this list when writing the CSV export.
sample_name = [df_y.iloc[row, 0] for row in range(len(df_y))]

print(PCA_200)

                        PC1         PC2        PC3         PC4        PC5  \
SampleID                                                                    
Tarca_001_P1A01   48.415217  -23.172146 -36.681264   84.067689  46.060434   
Tarca_013_P1B01   72.030665   -7.379045 -33.666351    8.737875  48.492014   
Tarca_025_P1C01   62.666571  -18.327693  -7.635471   -3.471000  43.024159   
Tarca_037_P1D01 -202.552636  -91.410888 -18.471907    6.705084  48.393191   
Tarca_049_P1E01  110.655232  -31.129540 -49.265705   26.342019  23.132400   
Tarca_061_P1F01  -79.117555  -73.173440  24.680049  -16.930545  40.096644   
Tarca_051_P1E03 -242.327327   -6.216876 -73.381941   32.589202  54.270219   
Tarca_063_P1F03   80.481703   23.550617 -65.573535  -23.542636   4.511838   
Tarca_075_P1G03   71.964441   51.565472 -36.168508  -25.378213  -1.962220   
Tarca_087_P1H03  -36.891458   14.018798 -36.565661   -1.411083 -24.998458   
Tarca_004_P1A04  169.297469  -11.880978 -32.995409  -13.135749 -22.915347   

In [12]:
import csv
import pandas as pd

# Export the leading principal-component scores to 'Top_200_PCA.csv',
# one row per sample: the sample ID followed by that sample's scores.
N_EXPORT_COMPONENTS = 200  # number of leading components written out

# Slicing clamps automatically if fewer components exist.
TH_X_pca = X_pca[0:, 0:N_EXPORT_COMPONENTS]

# IDs and score rows are paired by position, so their counts must match.
# Fail loudly instead of writing a truncated or mislabelled file.
if len(sample_name) != len(TH_X_pca):
    raise ValueError(
        'sample_name has {0} entries but X_pca has {1} rows'.format(
            len(sample_name), len(TH_X_pca)))

# Build each data row without mutating the score lists in place.
score_rows = [[name] + scores
              for name, scores in zip(sample_name, TH_X_pca.tolist())]

# Header row. It deliberately has no entry for the ID column, so it holds one
# fewer field than the data rows; pandas.read_csv then uses the first column
# as the row index, which is how this file is read back later in the notebook.
# The layout is kept unchanged so existing readers are unaffected.
output = [labels[0:N_EXPORT_COMPONENTS]] + score_rows

# newline='' lets the csv module control line endings itself.
with open('Top_200_PCA.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(output)

# Print a short summary instead of the whole nested list, which floods the
# cell output with every score value.
print('Top_200_PCA.csv: wrote {0} sample rows x {1} components'.format(
    len(score_rows), TH_X_pca.shape[1]))

AttributeError: 'numpy.ndarray' object has no attribute 'value'

In [5]:
# Round-trip check: load the exported score table and display it.
# The file's header row has one fewer field than its data rows, so pandas
# uses the first column (the sample IDs) as the row index.
Tfile = 'Top_200_PCA.csv'
df_T = pd.read_csv(filepath_or_buffer=Tfile, sep=',')
print(df_T)

                        PC1         PC2        PC3         PC4        PC5  \
Tarca_001_P1A01   48.415217  -23.172146 -36.681264   84.067689  46.060434   
Tarca_013_P1B01   72.030665   -7.379045 -33.666351    8.737875  48.492014   
Tarca_025_P1C01   62.666571  -18.327693  -7.635471   -3.471000  43.024159   
Tarca_037_P1D01 -202.552636  -91.410888 -18.471907    6.705084  48.393191   
Tarca_049_P1E01  110.655232  -31.129540 -49.265705   26.342019  23.132400   
Tarca_061_P1F01  -79.117555  -73.173440  24.680049  -16.930545  40.096644   
Tarca_051_P1E03 -242.327327   -6.216876 -73.381941   32.589202  54.270219   
Tarca_063_P1F03   80.481703   23.550617 -65.573535  -23.542636   4.511838   
Tarca_075_P1G03   71.964441   51.565472 -36.168508  -25.378213  -1.962220   
Tarca_087_P1H03  -36.891458   14.018798 -36.565661   -1.411083 -24.998458   
Tarca_004_P1A04  169.297469  -11.880978 -32.995409  -13.135749 -22.915347   
Tarca_064_P1F04   72.504029   20.090708 -31.995271   11.945937 -19.456906   