In [1]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cbook as cbook

# Delimiter pattern for the space-separated input file.
# NOTE(review): not referenced by any code visible in this notebook.
RE_DELIMITER = r' '
# Relative path to the input table. A forward slash is accepted on Windows as
# well as on POSIX systems, whereas a hard-coded backslash only resolves on Windows.
file = "data/sex_pca5_kinship_ethnic.txt"

In [2]:
# Load the space-delimited table into a DataFrame. The preview output shows a
# leading "Unnamed: 0" column -- presumably a row index that was written
# without a header name (confirm against the source file).
df = pd.read_csv(file, sep = ' ')

In [3]:
# Preview the first ten rows to check that the file parsed as expected.
df.head(10)

Unnamed: 0,f.31.0.0,f.22001.0.0,f.22009.0.1,f.22009.0.2,f.22009.0.3,f.22009.0.4,f.22009.0.5,f.22021.0.0,f.21000.0.0
0,1.0,1.0,-10.9392,5.49112,-0.502779,-3.21233,-3.00912,0.0,1001.0
1,0.0,0.0,-11.868,1.66229,-2.48932,3.5385,-2.19223,0.0,1001.0
2,1.0,1.0,-12.6358,4.03923,-1.08688,7.58518,10.8236,1.0,1001.0
3,0.0,,,,,,,,1001.0
4,0.0,0.0,-14.272,5.29456,-1.1028,5.55645,12.0171,0.0,1001.0
5,0.0,0.0,-11.6204,3.95235,-3.16063,-1.64647,-6.45915,0.0,1001.0
6,1.0,1.0,-13.1754,3.44237,-2.95845,8.96281,13.1731,1.0,1001.0
7,1.0,1.0,-5.9724,1.95475,-0.172913,-8.65122,-5.61762,0.0,1001.0
8,1.0,1.0,23.1869,-13.7141,36.4993,-67.8514,2.66981,0.0,1001.0
9,0.0,0.0,-13.46,5.28783,-0.097231,-0.538677,-5.10314,0.0,1001.0


In [4]:
# Column names used throughout the notebook:
#   f.eid                        participant id
#                                NOTE(review): the loaded frame's header shows
#                                "Unnamed: 0" rather than "f.eid" -- confirm.
#   f.31.0.0                     reported sex (phenotype)
#   f.22001.0.0                  genetic sex
#   f.22009.0.1 .. f.22009.0.5   first five principal components of the genetic data
#   f.22021.0.0                  kinship
#   f.21000.0.0                  ethnic background code
eid = "f.eid"
sex_n = "f.31.0.0"
sex_g = "f.22001.0.0"
kinship = "f.22021.0.0"
ethnic = "f.21000.0.0"

# One column name per principal component, numbered 1 through 5.
pcs = list(map("f.22009.0.{}".format, range(1, 6)))
print(pcs)


['f.22009.0.1', 'f.22009.0.2', 'f.22009.0.3', 'f.22009.0.4', 'f.22009.0.5']


In [5]:
# Maps the integer ethnic-background code to a display label.
# Entry order is significant: reverse (label -> code) lookups return the first match.
#
# NOTE(review): codes 3 and 4 carry the same label, so a label -> code lookup
# can only ever resolve to 3 and rows coded 4 are folded into / hidden behind
# it. UK Biobank data-coding 1001 appears to define 4 as
# "Black or Black British" -- confirm. Changing the label also requires a
# matching entry in `colors`, otherwise the plotting code raises KeyError.
ethnic_labels = {
    # top-level groups
    1: "White",
    2: "Mixed",
    3: "Asian or Asian British",
    4: "Asian or Asian British",
    5: "Chinese",
    6: "Other ethnic group",
    # non-answers
    -1: "Do not know",
    -3: "Prefer not to answer",
    # detailed backgrounds
    1001: "British",
    2001: "White and Black Caribbean",
    3001: "Indian",
    4001: "Caribbean",
    1002: "Irish",
    2002: "White and Black African",
    3002: "Pakistani",
    4002: "African",
    1003: "Any other white background",
    2003: "White and Asian",
    3003: "Bangladeshi",
    4003: "Any other Black background",
    2004: "Any other mixed background",
    3004: "Any other Asian background",
}

In [6]:
#df.describe()

In [7]:
# Drop every row that contains at least one missing value (the dropna
# defaults: axis=0, how='any', all columns considered).
# The redundant keywords are omitted on purpose: recent pandas releases raise
# TypeError when `how` and `thresh` are both passed explicitly. The result is
# rebound to `df` instead of using inplace=True.
df = df.dropna()
print(df.shape)

(487729, 9)


In [None]:
#df.head(10)

In [None]:
#df.describe()

In [8]:
# Keep only rows where reported and genetic sex agree and the kinship value
# lies strictly between -1 and 1, then store the ethnic code as int32.
df_clean = (
    df.loc[
        df[sex_n].eq(df[sex_g])
        & df[kinship].gt(-1)
        & df[kinship].lt(1)
    ]
    .astype({ethnic: 'int32'})
)

In [9]:
# Report how many rows survived the filter, then preview the first ten.
print(df_clean.shape)
df_clean.head(10)

(338883, 9)


Unnamed: 0,f.31.0.0,f.22001.0.0,f.22009.0.1,f.22009.0.2,f.22009.0.3,f.22009.0.4,f.22009.0.5,f.22021.0.0,f.21000.0.0
0,1.0,1.0,-10.9392,5.49112,-0.502779,-3.21233,-3.00912,0.0,1001
1,0.0,0.0,-11.868,1.66229,-2.48932,3.5385,-2.19223,0.0,1001
4,0.0,0.0,-14.272,5.29456,-1.1028,5.55645,12.0171,0.0,1001
5,0.0,0.0,-11.6204,3.95235,-3.16063,-1.64647,-6.45915,0.0,1001
7,1.0,1.0,-5.9724,1.95475,-0.172913,-8.65122,-5.61762,0.0,1001
8,1.0,1.0,23.1869,-13.7141,36.4993,-67.8514,2.66981,0.0,1001
9,0.0,0.0,-13.46,5.28783,-0.097231,-0.538677,-5.10314,0.0,1001
10,0.0,0.0,-12.2885,6.14563,-1.77076,-0.042921,-0.883904,0.0,1001
12,0.0,0.0,-9.48475,5.21706,-1.8854,-0.049029,-1.75883,0.0,1001
13,0.0,0.0,-12.0551,3.70951,-1.28498,2.25678,-6.46714,0.0,1001


In [None]:
# Summary statistics of the filtered frame.
df_clean.describe()

In [None]:
# Label-based slice from the first principal component through the ethnic
# code (both end points included), converted to a plain NumPy array.
pca_df = df_clean.loc[:, pcs[0]: ethnic]
pca_np = pca_df.to_numpy()

# Column 6 of the array holds the ethnic code. A single np.unique pass yields
# both the distinct codes and how many rows carry each of them.
ethnic_keys, key_counts = np.unique(pca_np[:, 6].astype(int), return_counts=True)
ethnic_keys = ethnic_keys.tolist()

# view sample counts of each ethnic background
counts = {}
for ethnic_key, n_samples in zip(ethnic_keys, key_counts.tolist()):
    ethnic_label = ethnic_labels[ethnic_key]
    # Several codes may map to the same label; add their counts together
    # instead of letting the later code overwrite the earlier one.
    counts[ethnic_label] = counts.get(ethnic_label, 0) + n_samples

# Largest group first for the printed table ...
counts = dict(sorted(counts.items(), key=lambda item: item[1], reverse = True))
# ... and smallest group first for the list handed to the plotting functions.
sorted_ethnic_labels = list(counts)[::-1]
for ethnic_label, n_samples in counts.items():
    print("{:>30}: {}".format(ethnic_label, n_samples))

In [None]:
# Matplotlib named colour for each ethnic label. The scatter-plot functions
# look entries up as colors[label], so every label that is plotted needs a
# key here.
colors = {
    "British": "blue",
    "Any other white background": "darkblue",
    "Irish": "lightgreen",
    "Indian": "green",
    "Other ethnic group": "greenyellow",
    "Caribbean": "pink",
    "African": "purple",
    "Any other Asian background": "olive",
    "Pakistani": "orange",
    "Chinese": "red",
    "Prefer not to answer": "black",
    "Any other mixed background": "cyan",
    "White and Asian": "gold",
    "White and Black Caribbean": "brown",
    "White": "chocolate",
    "White and Black African": "wheat",
    "Bangladeshi": "sienna",
    "Do not know": "powderblue",
    "Any other Black background": "mistyrose",
    "Mixed": "papayawhip",
    "Asian or Asian British": "firebrick"
}

In [None]:
def get_key_by_label(ethnic_labels, label):
    """Return the first key of ``ethnic_labels`` whose value equals ``label``.

    Keys are examined in the mapping's iteration order. ``None`` is returned
    when no value matches.
    """
    matching_keys = (key for key, value in ethnic_labels.items() if value == label)
    return next(matching_keys, None)
    
    
def scatter_plot_2D(pca_np, sorted_ethnic_labels, ethnic_label_map, 
                    pc_x = 0, pc_y = 1, ethnic_col = 6):
    """Scatter two principal components against each other, one colour per ethnic label.

    Parameters
    ----------
    pca_np : 2-D array
        One row per sample. Columns ``pc_x`` and ``pc_y`` are plotted and
        column ``ethnic_col`` holds the ethnic code of the sample.
    sorted_ethnic_labels : iterable of str
        Labels to plot, in the order given. Each label must be a key of the
        module-level ``colors`` dict.
    ethnic_label_map : dict
        Code -> label mapping; each label is resolved back to a code with
        ``get_key_by_label`` (first matching code wins).
    pc_x, pc_y : int
        Zero-based column indices for the x and y axes; the axis labels and
        the title show them one-based.
    ethnic_col : int
        Column index of the ethnic code (defaults to 6).

    The input array is not modified.
    """
    fig, ax = plt.subplots(figsize=(6,4))
    # Work on an integer copy of the code column rather than writing the cast
    # back into the caller's array.
    ethnic_codes = pca_np[:, ethnic_col].astype(int)
    scatters, labels = [], []
    for ethnic_label in sorted_ethnic_labels:
        ethnic_key = get_key_by_label(ethnic_label_map, ethnic_label)
        data = pca_np[ethnic_codes == ethnic_key]
        scatters.append(
            ax.scatter(data[:, pc_x], data[:, pc_y],
                       c = colors[ethnic_label],
                       s = 0.2,
                       alpha = 1,
                       label = ethnic_label,
                       edgecolors='none'))
        labels.append(ethnic_label)

    ax.set_xlabel(r'$PC_{}$'.format(pc_x+1), fontsize=10)
    ax.set_ylabel(r'$PC_{}$'.format(pc_y+1), fontsize=10)
    ax.set_title('Principal Components ({}-{})'.format(pc_x+1, pc_y+1))

    # markerscale enlarges the legend markers; the plotted points (s = 0.2)
    # would otherwise be too small to see in the legend.
    ax.legend(scatters,
              labels,
              scatterpoints=1,
              loc='best',
              ncol=2,
              bbox_to_anchor=(1, 1),
              numpoints = 1,
              scatteryoffsets = [0.375, 0.5, 0.3125],
              markerscale = 20,
              frameon = False,
              fancybox = False,
              framealpha = 1.0,
              facecolor = "black",
              title = "self reported ethnic background",
              title_fontsize = 12,
              borderpad = 0.2,
              fontsize=8)
    ax.grid(True)
    plt.show()

In [None]:
# Sanity check of the plotting inputs: the code -> label mapping and the array shape.
print(ethnic_labels)
print(pca_np.shape)

In [None]:
# PC1 vs PC2 (the last two arguments are zero-based column indices).
scatter_plot_2D(pca_np, sorted_ethnic_labels, ethnic_labels, 0, 1)

In [None]:
# PC1 vs PC3 (the last two arguments are zero-based column indices).
scatter_plot_2D(pca_np, sorted_ethnic_labels, ethnic_labels, 0, 2)

In [None]:
# PC2 vs PC3 (the last two arguments are zero-based column indices).
scatter_plot_2D(pca_np, sorted_ethnic_labels, ethnic_labels, 1, 2)

In [None]:
# PC3 vs PC4 (the last two arguments are zero-based column indices).
scatter_plot_2D(pca_np, sorted_ethnic_labels, ethnic_labels, 2, 3)

In [None]:
def scatter_plot_3D(pca_np, sorted_ethnic_labels, ethnic_label_map, 
                    pc_x = 0, pc_y = 1, pc_z = 2, ethnic_col = 6):
    """Scatter three principal components in 3-D, one colour per ethnic label.

    Parameters
    ----------
    pca_np : 2-D array
        One row per sample. Columns ``pc_x``, ``pc_y`` and ``pc_z`` are
        plotted and column ``ethnic_col`` holds the ethnic code of the sample.
    sorted_ethnic_labels : iterable of str
        Labels to plot, in the order given. Each label must be a key of the
        module-level ``colors`` dict.
    ethnic_label_map : dict
        Code -> label mapping; each label is resolved back to a code with
        ``get_key_by_label`` (first matching code wins).
    pc_x, pc_y, pc_z : int
        Zero-based column indices for the three axes; axis labels and title
        show them one-based.
    ethnic_col : int
        Column index of the ethnic code (defaults to 6).

    The input array is not modified.
    """
    # Importing Axes3D registers the '3d' projection on older matplotlib
    # releases. Doing it inside the function keeps it usable on a fresh
    # kernel, independent of which other cells have already been executed.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    fig = plt.figure(1, figsize=(8, 6))
    # Create the axes through the figure so that it is attached to it on every
    # matplotlib version (a directly constructed Axes3D is no longer added to
    # the figure automatically on newer releases).
    # NOTE(review): rect is expressed in figure-fraction units, so
    # [0, 0, 8, 6] is far larger than the figure itself -- confirm intended.
    ax = fig.add_axes([0, 0, 8, 6],
                      projection='3d',
                      elev=48,
                      azim=134,
                      frame_on = False)
    # Work on an integer copy of the code column rather than writing the cast
    # back into the caller's array.
    ethnic_codes = pca_np[:, ethnic_col].astype(int)
    scatters, labels = [], []
    for ethnic_label in sorted_ethnic_labels:
        ethnic_key = get_key_by_label(ethnic_label_map, ethnic_label)
        data = pca_np[ethnic_codes == ethnic_key]
        scatters.append(
            ax.scatter(data[:, pc_x], data[:, pc_y], data[:, pc_z],
                       c = colors[ethnic_label],
                       edgecolor='k'))
        labels.append(ethnic_label)

    ax.legend(scatters,
              labels,
              scatterpoints=1,
              loc='best',
              ncol=1,
              bbox_to_anchor=(0.9, 0.9),
              numpoints = 1,
              scatteryoffsets = [0.375, 0.5, 0.3125],
              markerscale = 3,
              frameon = False,
              fancybox = False,
              framealpha = 1.0,
              facecolor = "black",
              title = "self reported ethnic background",
              title_fontsize = 20,
              borderpad = 0.2,
              fontsize = 20)
    ax.grid(True)

    # Hide the tick labels on all three axes. xaxis/yaxis/zaxis are the same
    # objects as the w_xaxis/w_yaxis/w_zaxis aliases, which newer matplotlib
    # releases no longer provide.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    # One-based component numbers, matching the title below.
    ax.set_xlabel('PC_{}'.format(pc_x+1), fontsize = 20)
    ax.set_ylabel('PC_{}'.format(pc_y+1), fontsize = 20)
    ax.set_zlabel('PC_{}'.format(pc_z+1), fontsize = 20)
    ax.set_title("Principal Components ({}-{}-{})".format(pc_x+1, pc_y+1, pc_z+1),
                 fontsize = 30)
    # NOTE(review): `dist` is deprecated in newer matplotlib releases --
    # confirm it still has the intended effect.
    ax.dist = 12
    plt.show()


In [None]:
# 3-D scatter of the components in array columns 0, 1 and 2.
scatter_plot_3D(pca_np, sorted_ethnic_labels, ethnic_labels, 0, 1, 2)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Though the following import is not directly being used, it is required
# for 3D projection to work
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster import KMeans
from sklearn import datasets

# Seed NumPy's global RNG so that the KMeans initialisation is repeatable.
np.random.seed(5)

# Cluster on the principal-component columns only. pca_np also carries the
# kinship and ethnic-code columns; the ethnic codes are in the thousands and
# would dominate the distances KMeans works with, so the resulting clusters
# would not describe the PC space that is plotted below.
X = pca_np[:, :len(pcs)]

estimators = [('k_means_iris_4', KMeans(n_clusters=4)),
              ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1,
                                               init='random'))]

titles = ['4 clusters', '3 clusters, bad initialization']
# One figure per estimator, numbered from 1.
for fignum, ((name, est), title) in enumerate(zip(estimators, titles), start=1):
    fig = plt.figure(fignum, figsize=(8, 6))
    # Create the axes through the figure so that it is attached to it on every
    # matplotlib version (a directly constructed Axes3D is no longer added to
    # the figure automatically on newer releases).
    # NOTE(review): rect is expressed in figure-fraction units, so
    # [0, 0, 8, 6] is far larger than the figure itself -- confirm intended.
    ax = fig.add_axes([0, 0, 8, 6], projection='3d', elev=48, azim=134)
    est.fit(X)
    labels = est.labels_

    # Colour every sample by the cluster index it was assigned to.
    ax.scatter(X[:, 0], X[:, 1], X[:, 2],
               c=labels.astype(float),
               edgecolor='k')

    ax.set_xlabel('PC1', fontsize = 20)
    ax.set_ylabel('PC2', fontsize = 20)
    ax.set_zlabel('PC3', fontsize = 20)
    ax.set_title(title)
    # NOTE(review): `dist` is deprecated in newer matplotlib releases --
    # confirm it still has the intended effect.
    ax.dist = 12


plt.show()