In [2]:
import numpy as np
import pandas as pd
import sys, os
from collections import Counter, defaultdict

In [3]:
# Tab-separated proband/region table. Resolved relative to the current user's
# home directory (same convention as the output path later in this notebook)
# rather than hard-coding one user's absolute path.
proband_regions_file = os.path.expanduser(
    '~/project/anc_finder/data/BALSAC_proband_regions.txt')

In [4]:
# Load the proband/region table. The file is tab-separated and has no header
# row, so the two column names are supplied here.
df = pd.read_csv(
    proband_regions_file,
    sep='\t',
    header=None,
    names=['proband', 'region'],
    skipinitialspace=True,
)

In [5]:
# Tally how many probands fall in each region; rows with a missing region
# are counted under the NaN key.
Counter(df['region'].values)

Counter({nan: 2947,
         'COTE NORD': 12810,
         'SAGUENAY (LAC ST JEAN)': 219559,
         'BAS SAINT LAURENT': 61963,
         'CHARLEVOIX': 12503,
         'COTE DU SUD': 36496,
         'REGION DE QUEBEC': 41299,
         'QUEBEC (AGGLOMERATION)': 94750,
         'ABITIBI': 21826,
         'LANAUDIERE': 36037,
         'OUTAOUAIS': 50992,
         'ILES DE LA MADELEINE': 3039,
         'BOIS FRANCS': 69938,
         'RIVE NORD OUEST (MTL)': 25623,
         'BEAUCE': 33418,
         'COTE DE BEAUPRE': 7859,
         'GASPESIE': 25942,
         'ESTRIE': 88498,
         'TEMISCAMINGUE': 15469,
         'ILE DE MONTREAL': 437729,
         'MAURICIE': 74627,
         'RESTE DU QUEBEC': 1491,
         'RIVE SUD (MTL)': 31182,
         'LAURENTIDES': 30932,
         'RICHELIEU': 67221})

In [6]:
# Keep the per-region proband tally (including the NaN key for missing
# regions) for computing the sampling targets.
region_counts = Counter(df['region'].values)

In [7]:
# Per-region sampling targets, proportional to each region's share of all
# probands. Rows with a missing region still count toward the denominator,
# so the targets sum to slightly less than total_samples.
total_samples = 1000
total_num_probands = df.shape[0]

# Missing regions get a target of 0 so they are never sampled. They are
# detected with pd.isna rather than by assuming the key is the np.nan
# singleton: NaN never compares equal to itself, so a NaN key held as a
# different float object would otherwise keep its non-zero target.
region_target_num_samples = {
    k: 0 if pd.isna(k) else (v / total_num_probands) * total_samples
    for k, v in region_counts.items()
}
# Guarantee an np.nan entry exists even when no region is missing.
region_target_num_samples.setdefault(np.nan, 0)

In [8]:
# Show the (fractional) per-region targets.
region_target_num_samples

{nan: 0,
 'COTE NORD': 8.516437855267094,
 'SAGUENAY (LAC ST JEAN)': 145.9688195991091,
 'BAS SAINT LAURENT': 41.19469467805738,
 'CHARLEVOIX': 8.31233587075757,
 'COTE DU SUD': 24.263537546122397,
 'REGION DE QUEBEC': 27.456703121364228,
 'QUEBEC (AGGLOMERATION)': 62.99238772728784,
 'ABITIBI': 14.510520892198251,
 'LANAUDIERE': 23.958381810324767,
 'OUTAOUAIS': 33.9008742479141,
 'ILES DE LA MADELEINE': 2.0204101984509526,
 'BOIS FRANCS': 46.49669248412725,
 'RIVE NORD OUEST (MTL)': 17.03487019246751,
 'BEAUCE': 22.21719908253831,
 'COTE DE BEAUPRE': 5.224877837981585,
 'GASPESIE': 17.2469501047103,
 'ESTRIE': 58.8358873782535,
 'TEMISCAMINGUE': 10.28421367549779,
 'ILE DE MONTREAL': 291.0141940630921,
 'MAURICIE': 49.61406774590301,
 'RESTE DU QUEBEC': 0.9912575208589568,
 'RIVE SUD (MTL)': 20.73064521490543,
 'LAURENTIDES': 20.564438387128945,
 'RICHELIEU': 44.690356679852414}

In [11]:
# Greedy stratified draw: walk the probands in random order and keep one
# whenever its region is still below its target. Targets are fractional and
# counts are whole numbers, so a region keeps accepting probands until its
# count reaches the target rounded up.
SAMPLING_SEED = 0  # fixed so Restart & Run All selects the same probands

region_sample_count = defaultdict(int)
sampled_inds = []

# Shuffle into a new frame (leaving `df` untouched) so that re-running this
# cell reproduces exactly the same draw.
shuffled_df = df.sample(frac=1, replace=False, random_state=SAMPLING_SEED)

# itertuples avoids constructing a Series per row, which iterrows does.
for i, row in enumerate(shuffled_df.itertuples(index=False)):
    if i % 100000 == 0:
        print(i)

    # Probands with no recorded region are never sampled; checking explicitly
    # avoids depending on NaN being usable as a dictionary key.
    if pd.isna(row.region):
        continue

    if region_sample_count[row.region] >= region_target_num_samples[row.region]:
        continue

    sampled_inds.append(row.proband)
    region_sample_count[row.region] += 1

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000


In [12]:
# Number of probands selected. This can exceed total_samples because each
# region keeps accepting probands until its integer count reaches its
# fractional target, i.e. the target is effectively rounded up per region.
len(sampled_inds)

1010

In [14]:
# Destination for the sampled proband IDs, under the current user's home
# directory.
sample_out_file = os.path.expanduser(
    '~/project/pedigree_msp/data/BALasc_probands1930_1010probands_proportional_regions.txt')

# Write one proband ID per line, in the order they were sampled.
with open(sample_out_file, 'w') as out_handle:
    out_handle.writelines(str(proband_id) + '\n' for proband_id in sampled_inds)