*A notebook on a naive shrinkage-correction approach: for every sample, we hold that single sample out and run PCA on the remaining samples ...*

# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import collections as mc
import pandas as pd

import pcshrink

# Data prep

In [None]:
%%time
# Load the genotype data through pcshrink.UnpackedAncestryMap. Later cells read
# data.Y (genotype matrix), data.n and data.p from the returned object.
# NOTE(review): the path is a hard-coded absolute cluster path, so this cell only
# runs where that filesystem is mounted — consider a configurable data directory.
data = pcshrink.UnpackedAncestryMap("/project/jnovembre/jhmarcus/ancient-sardinia/data/ref_genotypes/7-11-2018/lazaridis_2014/data_fil")

In [None]:
# Dimensions of the loaded data set; presumably n = number of samples and
# p = number of snps (data.n is used as the sample count below) — confirm.
print(data.n, data.p)

Find SNPs that are too rare or too common

In [None]:
# Allele frequency estimator from Price et al. 2006:
#     f_i = (1 + sum_j g_ij) / (2 + 2 * N_i)
# nansum skips missing genotypes (NaN) in the numerator, so N_i must be the
# number of non-missing genotypes for snp i rather than the total sample count;
# using data.n would bias f toward 0 for snps with missing calls.
# When nothing is missing, N_i == data.n and the result is unchanged.
# (assumes missing genotypes in data.Y are encoded as NaN — TODO confirm)
n_obs = np.sum(~np.isnan(data.Y), axis=1)
f = (1. + np.nansum(data.Y, axis=1)) / (2. + (2. * n_obs))

# keep snps that aren't too rare or common; a snp with no observed genotypes
# gets f = 0.5 from the estimator, so it is excluded explicitly
snp_idx = np.where((f > .02) & (f < .98) & (n_obs > 0))[0]
print(f.shape, snp_idx.shape)

Mean-center and scale each SNP, then impute missing values to 0

In [None]:
# restrict the genotype matrix to the snps that passed the frequency filter
Z = data.Y[snp_idx, :]

# per-snp mean genotype and empirical std deviation, ignoring missing values;
# keepdims leaves a trailing length-1 axis so both broadcast across columns
mu = np.nanmean(Z, axis=1, keepdims=True)
std = np.nanstd(Z, axis=1, keepdims=True)

# heterozygosity scaler sqrt(2 f (1 - f)) as a column vector
het = np.sqrt(2. * f[snp_idx] * (1. - f[snp_idx]))[:, np.newaxis]

# center by the mean, scale by the heterozygosity term, then set the
# remaining missing entries to 0
Z = (Z - mu) / het
Z[np.isnan(Z)] = 0.0

In [None]:
# Compare the heterozygosity-based scaler with the empirical per-snp std deviation
fig, ax = plt.subplots()
ax.scatter(het, std)
ax.set_xlabel("Het Scaler")
ax.set_ylabel("Empirical Std");

# PCA

In [None]:
%%time
# Build a pcshrink.PCShrinker from the normalized genotype matrix Z.
# k is presumably the number of principal components to compute — confirm
# against the pcshrink.PCShrinker signature. Later cells read pc.L from it.
k = 10
pc = pcshrink.PCShrinker(Z, k)

In [None]:
# First two columns of pc.L plotted against each other; the second column is
# negated, which flips the vertical axis
fig, ax = plt.subplots()
ax.scatter(pc.L[:, 0], -pc.L[:, 1])
ax.set_xlabel("PC1")
ax.set_ylabel("PC2");

In [None]:
# Compute shrinkage-corrected coordinates with the fitted PCShrinker.
# NOTE(review): the meaning of k, s and o is not visible in this notebook —
# document them here once confirmed against pcshrink.PCShrinker.shrink_coords.
L_shrunk = pc.shrink_coords(k=2, s=100, o=5)

In [None]:
# Display the result of shrink_coords (the whole object is echoed as cell output)
L_shrunk

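Hold out each sample in turn and run PCA on the remaining samples; the plots further below compare the resulting PC1 vs PC2 coordinates with those from the full-data PCA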

In [None]:
%%time

K = 10
q = data.n
L_proj = np.empty((q, K))

for i in range(q):
    
    print(i)
    
    idx = np.ones(data.n, dtype="bool")
    idx[i] = False
    
    z = Z[:, i]
    Z_train = Z[:, idx]
    
    L_proj[i, :] = holdout_pca(z, Z_train, K)
    
#np.save("data/L_proj", L_proj)

In [None]:
#L_proj = L_proj / np.linalg.norm(L_proj, axis=0, ord=2)

In [None]:
# Full-data PCA coordinates (second column negated). `L` is never assigned in
# this notebook, so the coordinates are read from the fitted `pc` object, the
# same source the first PC1-vs-PC2 plot uses.
fig, ax = plt.subplots()
ax.scatter(pc.L[:, 0], -pc.L[:, 1])
ax.set_xlabel("PC1")
ax.set_ylabel("PC2");

In [None]:
# Held-out coordinates from the hold-one-out loop (second column negated to
# match the orientation of the other PC plots); axes labeled like those plots
fig, ax = plt.subplots()
ax.scatter(L_proj[:, 0], -L_proj[:, 1])
ax.set_xlabel("PC1")
ax.set_ylabel("PC2");

In [None]:
# One arrow per sample, from its full-data PCA position to its held-out
# position (second column negated on both, as in the scatter plots).
# `L` is never assigned in this notebook, so the full-data coordinates are
# read from the fitted `pc` object.
fig, ax = plt.subplots(figsize=(12, 8))
ax.quiver(
    pc.L[:, 0],
    -pc.L[:, 1],
    L_proj[:, 0] - pc.L[:, 0],
    -(L_proj[:, 1] - pc.L[:, 1]),
    # draw the arrows in data units so each one ends exactly at the held-out
    # position; the quiver defaults autoscale arrow length instead
    angles="xy",
    scale_units="xy",
    scale=1,
)

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.tight_layout()

In [None]:
# One line segment per sample, joining its full-data PCA position to its
# held-out position (second column negated on both ends).
# `L` is never assigned in this notebook, so the full-data coordinates are
# read from the fitted `pc` object.
lines = [
    [(x_full, -y_full), (x_held, -y_held)]
    for (x_full, y_full), (x_held, y_held) in zip(pc.L[:, :2], L_proj[:, :2])
]

lc = mc.LineCollection(lines, linewidths=2, colors="black")
fig, ax = plt.subplots(figsize=(12, 8))
ax.add_collection(lc)
# add_collection does not rescale the axes on its own
ax.autoscale()

ax.set_xlabel("PC1")
ax.set_ylabel("PC2");

In [None]:
# Draw 10 distinct row indices of Z at random (uses numpy's global RNG state)
np.random.choice(Z.shape[0], size=10, replace=False)