In [None]:
import pandas as pd
from collections import defaultdict

In [None]:
# Input file paths. Both are empty placeholders: set them to real paths before running.
f_kinship = ''      # tab-separated pairwise kinship table (columns ID1, ID2, Kinship)
f_sample_meta = ''  # CSV with per-sample meta-information

# The five beef cattle breeds considered in this notebook
ls_breed_5 = [
    'Angus',
    'Hereford',
    'Charolais',
    'Limousin',
    'Simmental',
]


In [None]:
# Read the pairwise kinship table produced by the KING software, keeping only the
# two sample-ID columns (parsed as strings) and the kinship coefficient (as float).
df_kin = pd.read_csv(
    f_kinship,
    sep='\t',
    usecols=['ID1', 'ID2', 'Kinship'],
    dtype={'ID1': str, 'ID2': str, 'Kinship': float},
)

In [None]:
# Keep only the pairs treated as related: kinship coefficient of at least 0.0442
# (the lower bound KING documents for 3rd-degree relationships).
df_kin_related = df_kin.loc[df_kin['Kinship'] >= 0.0442]

In [None]:
# Load meta-information of samples.
# SampleID is read as a string so that it compares equal to the ID1/ID2 columns of
# the kinship table (which are parsed as str). Without this, purely numeric IDs
# would be inferred as integers and would silently never match in the
# relatedness filters further down.
df_meta = pd.read_csv(f_sample_meta, dtype={'SampleID': str})

In [None]:
# Restrict the metadata to samples whose breed is one of the five beef cattle breeds
df_meta_5_beef = df_meta.loc[df_meta['breed'].isin(ls_breed_5)]

In [None]:
# Set of sample IDs of the five beef breeds, used for membership tests below
st_5_beef = set(df_meta_5_beef['SampleID'])

In [None]:
# Keep the related pairs (kinship >= 0.0442) in which BOTH individuals belong to
# the five beef breeds
df_kin_5_beef_related = df_kin_related.loc[
    (df_kin_related['Kinship'] >= 0.0442)
    & df_kin_related['ID1'].isin(st_5_beef)
    & df_kin_related['ID2'].isin(st_5_beef)
]

In [None]:
# Count, for every beef individual, the number of related pairs it takes part in.
# Both ID columns are tallied in a single vectorised pass instead of scanning the
# whole pair table once per sample. A pair that lists the same ID on both sides is
# tallied once, matching a row-wise "(ID1 == s) | (ID2 == s)" count.
df_pair_ids = df_kin_5_beef_related[['ID1', 'ID2']]
sr_second_member = df_pair_ids.loc[df_pair_ids['ID1'] != df_pair_ids['ID2'], 'ID2']
id2num_pairs = pd.concat([df_pair_ids['ID1'], sr_second_member]).value_counts().to_dict()

# Individuals that appear in no related pair get a count of 0
beef_ind2num_relate = {sample: int(id2num_pairs.get(sample, 0)) for sample in st_5_beef}

In [None]:
# Sort individuals in descending order of their number of related pairs.
# Ties are broken by sample ID: the dict was filled by iterating a set, whose order
# for strings changes between interpreter sessions (hash randomisation), so sorting
# on the count alone would make the greedy removal below irreproducible.
beef_ind2num_relate_order = sorted(
    beef_ind2num_relate.items(),
    key=lambda item: (-item[1], item[0]),
)

In [None]:
# Greedily drop individuals until no related pair is left among those that remain.
# Individuals are visited once, in the precomputed order (most related pairs first;
# the counts are NOT recomputed after each removal). An individual is dropped only
# if it still has a related pair whose other member has not already been dropped.
# Partners are looked up in an adjacency map instead of re-filtering the pair table
# on every iteration.
sample2partners = defaultdict(set)
for id1, id2 in zip(df_kin_5_beef_related['ID1'], df_kin_5_beef_related['ID2']):
    sample2partners[id1].add(id2)
    sample2partners[id2].add(id1)

st_dele_id = set()
for sample, num_related in beef_ind2num_relate_order:
    partners = sample2partners.get(sample, ())
    if any(partner not in st_dele_id for partner in partners):
        st_dele_id.add(sample)

# Related pairs in which neither member was dropped
df_temp = df_kin_5_beef_related.loc[
    ~(
        df_kin_5_beef_related['ID1'].isin(st_dele_id)
        | df_kin_5_beef_related['ID2'].isin(st_dele_id)
    )
]

In [None]:
# Metadata of the five-breed samples that survived the relatedness pruning
df_5_beef_unrelate = df_meta_5_beef.loc[~df_meta_5_beef['SampleID'].isin(st_dele_id)]

## Select individuals

In [None]:
# Maps breed name -> list of selected sample IDs; filled in by the cells below.
# Looking up a breed that has not been assigned yet yields an empty list.
breed2ls_ind = defaultdict(list)

### For Angus, Hereford, and Charolais, the number of unrelated individuals is more than 100

In [None]:
# For each breed, select the 100 unrelated individuals with the highest sequencing
# coverage. The filter uses the loop variable (@breed) so every breed gets its own
# samples, and the IDs are taken from the 'SampleID' column by name rather than by
# column position.
for breed in ['Angus', 'Hereford', 'Charolais']:
    breed2ls_ind[breed] = (
        df_5_beef_unrelate
        .query('breed == @breed')
        .sort_values('Coverage', ascending=False)
        .head(100)['SampleID']
        .tolist()
    )

### For Simmental

In [None]:
# Breed name used as the key into breed2ls_ind by the next cell
breed = 'Simmental'

In [None]:
# Take every unrelated Simmental individual whose sequencing coverage is at least 5x,
# ordered from highest to lowest coverage (81 individuals according to the original note)
breed2ls_ind[breed] = (
    df_5_beef_unrelate
    .loc[(df_5_beef_unrelate['breed'] == "Simmental") & (df_5_beef_unrelate['Coverage'] >= 5)]
    .sort_values('Coverage', ascending=False)
    .loc[:, 'SampleID']
    .tolist()
)

In [None]:
# Add 19 unrelated German Simmental (Fleckvieh) individuals to the Simmental selection.

# Candidate Fleckvieh samples: coverage of at least 7x, highest coverage first
ls_nice_fleckvieh = (
    df_meta
    .query('(breed == "Fleckvieh") & (Coverage >= 7)')
    .sort_values('Coverage', ascending=False)['SampleID']
    .to_list()
)

# Related pairs (kinship >= 0.0442) in which both members are candidates
df_relate_fleckvieh = df_kin.query('(ID1 in @ls_nice_fleckvieh) & (ID2 in @ls_nice_fleckvieh) & (Kinship >= 0.0442)')

# Every candidate that appears in at least one related pair is excluded
st_relate_fleckvieh = set(df_relate_fleckvieh['ID1']) | set(df_relate_fleckvieh['ID2'])
ls_unrelate_fleckvieh = list(set(ls_nice_fleckvieh) - st_relate_fleckvieh)

# The 19 remaining candidates with the highest coverage; IDs are read from the
# 'SampleID' column by name rather than by column position
ls_19_fleckvieh = (
    df_meta
    .query('SampleID in @ls_unrelate_fleckvieh')
    .sort_values(by='Coverage', ascending=False)
    .head(19)['SampleID']
    .to_list()
)

# Append the Fleckvieh samples to the Simmental list. dict.fromkeys drops duplicates
# while keeping order, so re-running this cell does not add the samples twice.
# NOTE(review): the visible cells computed ls_19_fleckvieh but never used it; confirm
# that no later cell also appends it.
breed2ls_ind['Simmental'] = list(dict.fromkeys(breed2ls_ind['Simmental'] + ls_19_fleckvieh))

### For Limousin

In [None]:
# Select the 100 Limousin samples with the highest sequencing coverage. IDs are read
# from the 'SampleID' column by name rather than by column position.
# NOTE(review): unlike the other breeds, this selects from df_meta and therefore
# does not apply the relatedness filter -- confirm this is intended.
breed2ls_ind['Limousin'] = (
    df_meta
    .query('breed == "Limousin"')
    .sort_values('Coverage', ascending=False)
    .head(100)['SampleID']
    .to_list()
)