### Importing modules

In [23]:
import pandas as pd
import numpy as np

### Loading data

In [10]:
def _sample_id_from_path(bam_path):
    """Return the file name of ``bam_path`` up to (not including) its first '.'."""
    file_name = bam_path.split('/')[-1]
    return file_name.split('.')[0]


# One BAM path per line; reduce every path to a bare sample name.
bam = pd.read_table("../data/merged_bams/bams3.txt", header=None)
bam.columns = ['Sample_ID']
bam['Sample_ID'] = bam['Sample_ID'].apply(_sample_id_from_path)

# Genotype table: discard the first two columns and the last one, then
# transpose so that every remaining column becomes one row.
# NOTE(review): presumably the dropped columns are site coordinates and an
# empty trailing field - confirm against the .geno file layout.
geno_raw = pd.read_table("../data/genotypes/geno4.geno", header=None)
last_column = len(geno_raw.columns) - 1
geno_df = geno_raw.drop(columns=[0, 1, last_column]).T.reset_index(drop=True)

# NOTE(review): this join is purely positional (row k of the BAM list is
# paired with genotype row k) - it assumes both files list the samples in the
# same order and have the same length; nothing here verifies that.
geno_df_w_ID = pd.merge(bam, geno_df, left_index=True, right_index=True)

# Attach the per-sample metadata. This is an inner join, so samples without a
# matching Sample_ID in meta.csv are dropped.
meta_df = pd.read_csv('../config/meta.csv')
geno_df_w_ID = geno_df_w_ID.merge(meta_df, on='Sample_ID')
geno_df_w_ID

Unnamed: 0,Sample_ID,0,1,2,3,4,5,6,7,8,...,2956,2957,2958,latitude,longitude,is_replicate,replicate,library,barcode,id
0,ucin001,TT,TT,AA,CC,AA,AA,AA,AA,NN,...,TT,CC,AA,39.143365,-92.695926,False,,14,ACCA,ucin001
1,ucin002,TT,TT,AA,AA,AA,AA,AA,TT,NN,...,TT,NN,AA,37.885845,-90.161531,False,,15,ACCA,ucin002
2,ucin003,TT,TT,AA,AA,AA,AA,AA,TT,GG,...,TT,CC,AA,37.824835,-92.207022,False,,16,ACCA,ucin003
3,ucin004conc,TT,TT,AA,AC,AA,AG,AA,TT,GG,...,TT,CC,AA,38.935162,-91.465437,False,,9,AGTG,ucin004conc
4,ucin005,TG,TT,AA,AA,AA,AA,AA,TT,GG,...,TT,CC,AA,37.885845,-90.161531,False,,10,AGTG,ucin005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,ucin433,TT,TC,AA,AC,AA,AA,AA,TT,GA,...,TT,CC,AT,47.578635,-92.514570,False,,33,AGAC,ucin433
290,ucin434,TT,TT,AA,NN,AA,AA,AA,TT,GG,...,TT,NN,AA,41.002049,-93.274419,False,,33,ACCA,ucin434
291,ucin435,TT,TT,AA,AC,AC,AG,AA,TT,GA,...,TT,CC,AT,41.822100,-90.578000,False,,33,AGTG,ucin435
292,ucin436,TT,TT,AA,AA,AC,AA,AA,TT,GG,...,TT,CC,AA,43.331593,-92.120109,False,,33,CATC,ucin436


### Filtering to show replicates

In [60]:
# Rows involved in replication: flagged via `is_replicate`, or carrying a
# non-null `replicate` entry.
in_replicate_set = geno_df_w_ID['is_replicate'] | geno_df_w_ID['replicate'].notna()

# Keep the sample name plus every genotype column. The genotype column labels
# are taken from geno_df, so the selection follows the number of sites that
# was actually loaded instead of a hard-coded column count (previously 2960).
rep_df = geno_df_w_ID.loc[in_replicate_set, ['Sample_ID', *geno_df.columns]]
rep_df

Unnamed: 0,Sample_ID,0,1,2,3,4,5,6,7,8,...,2949,2950,2951,2952,2953,2954,2955,2956,2957,2958
20,ucin024,TT,TT,AA,AA,AA,AA,AA,AA,NN,...,GG,AA,AA,GG,CC,CC,NN,TT,CC,AA
21,ucin024replicate,TG,TT,AA,AA,AA,AA,AA,AA,NN,...,GG,AA,AA,GG,CC,CC,NN,TT,CC,AA
168,ucin236,TT,TT,AA,AA,AA,AA,AC,AA,GG,...,GG,AA,AA,GG,CC,CC,AA,TT,CC,AA
169,ucin236replicate,TT,TT,AA,AA,AA,AA,AA,AA,GG,...,GG,AA,AA,GG,CC,CC,AA,TT,CC,AA
188,ucin261,TT,TT,AA,AA,AA,AA,AA,TT,NN,...,GG,AA,AA,NN,CC,CT,NN,NN,NN,AA
205,ucin279conc,TG,TT,AA,AA,AA,AA,AA,AA,NN,...,GG,AA,AA,GG,CC,CT,AA,TT,NN,AA
206,ucin279concreplicate,TT,TT,AA,AA,AA,AA,AA,AA,GA,...,GG,AA,AA,GG,CC,CT,AA,TT,CT,AA
273,ucin413,TT,TT,AA,AA,AA,AA,AA,AT,AA,...,GT,AA,AA,GG,CC,CT,AA,TT,CC,AA
274,ucin413replicate,TT,TT,AA,AA,AA,AA,AA,AT,AA,...,GT,AA,AA,GG,CC,CT,AA,TT,CC,AA


### Similarity Matrix

In [70]:
def is_same_base(base1, base2):
    """Score one site: 1 if the two genotype calls are equal, 0 if they differ.

    A call of 'NN' on either side makes the site non-comparable, in which
    case NaN is returned instead of a score.
    """
    missing_call = 'NN'
    if base1 != missing_call and base2 != missing_call:
        return int(base1 == base2)
    return np.nan
    
def percent_matching(row1, row2):
    """Return the fraction of comparable sites at which two rows agree.

    The rows are paired positionally with ``zip`` (so the comparison stops at
    the end of the shorter row) and each pair is scored with
    ``is_same_base``. Sites scored NaN (a 'NN' call on either side) are left
    out of the average.

    Despite the name, the result is a fraction between 0 and 1, not a
    percentage.

    Returns NaN when there is no comparable site at all (empty rows, or every
    site has a 'NN' call). That case is handled explicitly so that NumPy's
    "Mean of empty slice" RuntimeWarning is not triggered.
    """
    site_scores = np.array([is_same_base(x, y) for x, y in zip(row1, row2)], dtype=float)
    comparable = site_scores[~np.isnan(site_scores)]
    if comparable.size == 0:
        return np.nan
    return comparable.mean()

In [71]:
# Pull the sample names and the genotype calls out once, instead of
# re-filtering rep_df with a boolean mask for every pair of samples.
sample_ids = rep_df.Sample_ID.values
genotype_calls = rep_df.iloc[:, 1:].values
n_samples = len(sample_ids)

# Start from the identity (every sample matches itself) and fill one triangle
# only: the score is symmetric, so each pair is computed once and mirrored.
scores = np.eye(n_samples)
for a in range(n_samples):
    for b in range(a + 1, n_samples):
        scores[a, b] = scores[b, a] = percent_matching(genotype_calls[a], genotype_calls[b])

# Building the frame from a float array gives a numeric (float64) matrix
# rather than an object-dtype one.
similarity_matrix = pd.DataFrame(scores, index=sample_ids, columns=sample_ids)
similarity_matrix

Unnamed: 0,ucin024,ucin024replicate,ucin236,ucin236replicate,ucin261,ucin279conc,ucin279concreplicate,ucin413,ucin413replicate
ucin024,1.0,0.958771,0.698057,0.695054,0.702083,0.696305,0.700776,0.701931,0.698001
ucin024replicate,0.958771,1.0,0.708586,0.698261,0.698028,0.696071,0.704722,0.702206,0.700601
ucin236,0.698057,0.708586,1.0,0.955141,0.715104,0.711715,0.717241,0.692446,0.701767
ucin236replicate,0.695054,0.698261,0.955141,1.0,0.720978,0.710757,0.711783,0.686717,0.698276
ucin261,0.702083,0.698028,0.715104,0.720978,1.0,0.708351,0.705835,0.70913,0.721365
ucin279conc,0.696305,0.696071,0.711715,0.710757,0.708351,1.0,0.966469,0.731072,0.736508
ucin279concreplicate,0.700776,0.704722,0.717241,0.711783,0.705835,0.966469,1.0,0.73452,0.729545
ucin413,0.701931,0.702206,0.692446,0.686717,0.70913,0.731072,0.73452,1.0,0.956985
ucin413replicate,0.698001,0.700601,0.701767,0.698276,0.721365,0.736508,0.729545,0.956985,1.0


### Table format

|Sample|Replicate Sample| Percentage of SNPs with equal genotypes|
|------|----------------|---------------------------------------|
|ucin024|ucin024replicate|95.9%|
|ucin236|ucin236replicate|95.5%|
|ucin279conc|ucin279concreplicate|96.6%|
|ucin413|ucin413replicate|95.7%|
|ucin261| Replicate did not pass filtering | NA |
