In [37]:
import numpy as np
import pandas as pd
from scipy import stats

In [208]:
E = pd.DataFrame([0.0739, 0.396032, 0.5299])*500
O = pd.DataFrame([0.17, 0.204, 0.626])*500

E.sum()

0    499.916
dtype: float64

In [209]:
"""
Gets two pandas dataframes - Expected (E) and Obeserved (O), and optional condifence_level=0.05 (p=0.05 <=> 0.05)
Returns the Chi-squared statistics for E and O, the Chi-squared critical value for the given confidence level,
and if the Chi-squared statistics is higher than the critical level (i.e., if the null hypothesis should be rejected).
"""

def pd_chi_sq(E, O, confidence_level=0.05):
    result = ((O-E).apply(lambda x: x**2)/E).sum()
    chi_level = stats.chi2.ppf(1-confidence_level, len(O)-1)
    return result.sum(), chi_level, result.sum() > chi_level

In [210]:
pd_chi_sq(E, O)

(117.7558213156951, 5.991464547107979, True)

In [232]:
"""
Get genotype frequencies for the "A" frequency p and "a" frequency 1-p
"""
def get_genotype_frequencies(p):
    return p**2, 2*p*(1-p), (1-p)**2


In [231]:
"""
P = number of AA homozygotes at G0
H = number of Aa heterozygotes at G0
Q = number of aa homozygotes at G0
num_gen = number of generations (1 by default)
"""
def get_self_pollination(P, H, Q, num_gen=1):
    if num_gen == 0: 
        return P, H, Q
    else:
        P_c, H_c, Q_c = get_self_pollination(P, H, Q, num_gen-1)
        P_c += H_c/4
        Q_c += H_c/4
        H_c = H_c/2
        return P_c, H_c, Q_c

In [238]:
a = np.array(get_self_pollination(*(0.3*np.array(get_genotype_frequencies(0.6))))) + 0.7*np.array(get_genotype_frequencies(0.6))

In [240]:
sum(a)
print(a)

[0.396 0.408 0.196]


In [241]:
get_genotype_frequencies(0.6)

(0.36, 0.48, 0.16000000000000003)

In [242]:
1/(2**8)

0.00390625

In [290]:
data = pd.read_table('gt_data.tsv',header=None)

In [294]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,60,61,62,63,64,65,66,67
0,rs2234968,0/0,0/1,0/0,0/0,0/0,1/1,0/0,1/1,0/0,...,0/0,0/0,0/1,0/0,0/1,0/1,0/1,0/1,0/0,0/1
1,rs2229094,0/0,0/0,0/1,0/1,0/1,0/1,0/0,0/0,0/0,...,0/1,0/0,0/0,0/1,0/1,0/1,0/1,0/0,0/0,0/1
2,rs10829163,0/0,0/1,0/0,0/0,0/1,0/0,0/1,1/1,0/1,...,0/1,0/0,0/1,0/1,0/0,0/0,1/1,1/1,0/1,0/0
3,rs10911825,0/1,0/1,0/0,0/0,0/0,0/1,0/1,0/0,0/0,...,0/0,0/1,0/0,0/1,0/0,0/1,0/1,0/0,0/1,0/0
4,rs11568351,0/1,0/1,0/1,0/0,0/1,0/1,1/1,0/0,1/1,...,0/0,0/0,0/0,0/0,0/0,1/1,0/1,0/1,1/1,0/1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,rs11778026,0/1,0/1,0/0,0/0,0/1,0/1,1/1,0/0,0/1,...,0/0,0/0,0/0,0/0,0/1,0/1,1/1,1/1,0/0,1/1
96,rs9474143,0/0,1/1,0/0,0/1,0/0,0/1,0/0,0/0,0/0,...,0/1,0/0,0/1,0/1,0/0,0/1,0/0,0/1,0/0,0/0
97,rs35935937,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,...,0/0,1/1,0/1,0/0,0/0,0/0,0/0,0/1,0/0,0/0
98,rs73343757,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,...,0/1,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/1,0/0


In [310]:
data.iloc[:, 2].value_counts()

0/0    63
0/1    34
1/1     3
Name: 2, dtype: int64

In [487]:
"""
Input: df: pandas series with rows corresponding to SNPs, values '0/0', '0/1', '1/1', denoting homo- and
heterozygotes (0 for the dominant allele, 1 for the recessive). 
Output: expected Hardy-Weinbgerg distribution for the given number of alleles
"""
def expected_dist(df: pd.DataFrame) -> tuple:
    p = 2*df.value_counts().get('0/0', 0) + df.value_counts().get('0/1', 0)
    q = 2*df.value_counts().get('1/1', 0) + df.value_counts().get('0/1', 0)
    summa = p + q
    return p**2/(2*summa), p*q/summa, q**2/(2*summa)

In [614]:
data.iloc[99, 1:]

1     0/0
2     0/0
3     0/0
4     0/1
5     0/0
     ... 
63    0/0
64    0/0
65    0/1
66    0/0
67    0/1
Name: 99, Length: 67, dtype: object

In [616]:
a = [1-data.iloc[i, 1:].value_counts().get('0/1',0)/expected_dist(data.iloc[i, 1:])[1] for i in range(1, 99)]

In [624]:
a

[-0.19884559884559883,
 0.055128205128205154,
 -0.21818181818181825,
 0.04495614035087725,
 -0.08550855085508546,
 -0.03266055045871563,
 -0.03266055045871563,
 -0.17543859649122795,
 0.045584045584045496,
 0.080091533180778,
 0.3516129032258064,
 -0.14529914529914523,
 -0.10352941176470587,
 -0.15517241379310343,
 -0.005100510051005092,
 0.0027418723070896656,
 0.2298850574712643,
 -0.005100510051005092,
 0.13123183545718753,
 -0.17543859649122795,
 -0.1254940711462451,
 0.02110389610389607,
 -0.1394557823129252,
 0.1015325670498084,
 0.014705882352941124,
 -0.2761904761904761,
 0.2518883415435139,
 -0.04256292906178483,
 0.007407407407407307,
 -0.10016420361247946,
 -0.03266055045871563,
 -0.04256292906178483,
 0.026162790697674465,
 0.0006779661016949046,
 -0.10973084886128359,
 0.0983436853002071,
 -0.04256292906178483,
 0.1015325670498084,
 -0.015896307165566226,
 0.080091533180778,
 0.05411764705882349,
 0.178921568627451,
 0.03476354525524583,
 0.02898550724637683,
 0.0846994535

In [625]:
import numpy as np
np.argmin(a)

89

In [630]:
data.iloc[90]

0     rs3738815
1           0/1
2           0/1
3           0/1
4           0/1
        ...    
63          0/1
64          0/1
65          0/1
66          0/1
67          0/1
Name: 90, Length: 68, dtype: object

In [490]:
def inbreeding_coefficient_F(df):
    return 1 - (df.value_counts().get('0/1', 0)/expected_dist(df)[1])

In [492]:
data.columns

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
            51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
            67],
           dtype='int64')

In [494]:
for df in data:
    print(df)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67


In [505]:
Fs = [inbreeding_coefficient_F(data[df]) for df in range(1, len(data.columns))]


In [539]:
inbreeding_coefficient_F(data[60])

-0.11482720178372352

In [516]:
a = sorted(Fs)

In [517]:
a

[-0.1428571428571428,
 -0.1403142817410652,
 -0.11827956989247301,
 -0.11827956989247301,
 -0.11482720178372352,
 -0.11482720178372352,
 -0.10461338531513964,
 -0.10383776151970237,
 -0.09649122807017552,
 -0.09649122807017552,
 -0.09613390608798689,
 -0.09434587962195318,
 -0.09434587962195318,
 -0.07908935386235028,
 -0.07908935386235028,
 -0.0625,
 -0.055085265611581224,
 -0.054196513042302996,
 -0.054196513042302996,
 -0.03963612735542554,
 -0.03950103950103956,
 -0.03688342467782535,
 -0.03688342467782535,
 -0.029064107912668558,
 -0.028213166144200663,
 -0.02471368294153109,
 -0.01637492941840768,
 -0.01637492941840768,
 -0.0043290043290042934,
 0.007795889440113335,
 0.009324009324009341,
 0.012474012474012364,
 0.025341130604288553,
 0.025341130604288553,
 0.02865468674113647,
 0.04009034443817061,
 0.048933885565270674,
 0.048933885565270674,
 0.05018611218072133,
 0.051490514905149,
 0.053763440860215006,
 0.053763440860215006,
 0.053763440860215006,
 0.05392620624408695,
 0.

In [375]:
sum(data.iloc[:, 1].value_counts().values)

100

In [328]:
data.iloc[1, :].value_counts(), expected_dist(data.iloc[1, :])
Fs = [inbreeding_coefficient_F(data.iloc[i, :]) for i in range(len(data.columns))]
print(Fs)

[-0.15238095238095228, -4.479797979797979, -1.0448717948717947, -2.4, -0.7152777777777779, -2.267326732673267, -2.6926605504587156, -2.6926605504587156, -inf, -1.407407407407407, -1.6304347826086958, -0.048387096774193505, -inf, -2.3911111111111114, -inf, -1.3308580858085808, -2.3378378378378377, -0.4367816091954022, -1.3308580858085808, 0.02292184479425574, -2.8888888888888893, -1.275252525252525, -2.1607142857142856, -2.639455782312925, -1.4540229885057472, -1.2352941176470589, -inf, -0.10634920634920642, -4.814492753623188, -0.36969696969696964, -3.404761904761905, -2.6926605504587156, -4.814492753623188, -0.4599483204134367, -0.2861538461538462, -1.4811594202898553, -0.3659420289855071, -4.814492753623188, -1.4540229885057472, -0.6479885057471264, -1.6304347826086958, -0.8400000000000001, -0.30718954248366015, -1.1399676375404533, -0.6376811594202898, -2.4153005464480874, 0.13884472708002116, -2.3911111111111114, -0.13761467889908263, -0.4367816091954022, -inf, -2.1607142857142856,

  return 1 - (df.value_counts().get('0/1', 0)/expected_dist(df)[1])
  return 1 - (df.value_counts().get('0/1', 0)/expected_dist(df)[1])
  return 1 - (df.value_counts().get('0/1', 0)/expected_dist(df)[1])
  return 1 - (df.value_counts().get('0/1', 0)/expected_dist(df)[1])
  return 1 - (df.value_counts().get('0/1', 0)/expected_dist(df)[1])
  return 1 - (df.value_counts().get('0/1', 0)/expected_dist(df)[1])
