In [106]:
# Root folders of the two data sources read by this cell.
# Edit these two constants to run the notebook against another copy of the data.
FIW_VIDEO_DIR = Path("/Users/zkhan/Dropbox/FIW_Video")
FIWDB_ROOT = Path("/Users/zkhan/master-version/fiwdb")

# Per-member table; later cells read its fid, surname, mid, video, video2,
# video3 and ethnicity columns.
family_members = pd.read_csv(FIW_VIDEO_DIR / "family_members.csv")
families_processed = pd.read_csv(FIW_VIDEO_DIR / "families_processed.csv")
# Folder that is later globbed for per-family "F*/mid.csv" relationship matrices.
fiwdb = FIWDB_ROOT / "FIDs"

# How many distinct individuals per family?

In [107]:
# Members per family, restricted to members that have a primary video.
# The count is the number of non-null `surname` entries in each `fid` group.
members_per_family = (
    family_members
    .dropna(subset=["video"], how='any')
    .groupby('fid')['surname']
    .count()
    .rename('num_members')
    .to_frame()
)

members_per_family.to_csv('./members_per_family.csv')

members_per_family.head()

Unnamed: 0_level_0,num_members
fid,Unnamed: 1_level_1
F0008,2
F0009,7
F0011,4
F0012,3
F0013,2


# How many unique videos per family?

In [108]:
# Count the distinct video URLs of each family, pooling the three video columns.
# Rows are collected first and the frame is built once at the end; enlarging an
# empty DataFrame one cell at a time is slow and leaves the count column as
# object dtype.
video_columns = ['video', 'video2', 'video3']
members_with_video = family_members.dropna(subset=['video'], how='any')

family_video_rows = {}
for (fid, surname), family in members_with_video.groupby(['fid', 'surname']):
    videos_for_family = set(
        itertools.chain.from_iterable(
            family[column].dropna().values for column in video_columns
        )
    )
    # Keyed by fid: a fid that appears under several surnames keeps a single
    # row, holding the values of the last (fid, surname) group visited.
    family_video_rows[fid] = (surname, len(videos_for_family))

vids_per_family = pd.DataFrame(
    list(family_video_rows.values()),
    index=pd.Index(list(family_video_rows), name='fid'),
    columns=['surname', 'num_videos'],
)

vids_per_family.to_csv('./videos_per_family.csv')

In [109]:
# Preview the first rows of the per-family video counts.
vids_per_family.head()

Unnamed: 0_level_0,surname,num_videos
fid,Unnamed: 1_level_1,Unnamed: 2_level_1
F0008,affleck.ben,4
F0009,gronkowski.rob,7
F0011,aguilera.christina,2
F0012,aikman.troy,3
F0013,al-assad.bashar,4


# How many pairs are possible?

In [110]:
# Enumerate every unordered pair of members inside each family, considering
# only members that have a primary video. Each record is (surname, p1, p2),
# with members written as '<fid>/MID<mid>'.
members_with_video = family_members.dropna(subset=['video'], how='any')
pair_records = [
    (surname, f'{fid}/MID{first_mid}', f'{fid}/MID{second_mid}')
    for (fid, surname), family in members_with_video.groupby(['fid', 'surname'])
    for first_mid, second_mid in itertools.combinations(family['mid'].values, r=2)
]

kin_pairs = pd.DataFrame.from_records(pair_records, columns=['surname', 'p1', 'p2'])

In [111]:
# Show the candidate pairs: one row per within-family pair of members.
kin_pairs

Unnamed: 0,surname,p1,p2
0,affleck.ben,F0008/MID1,F0008/MID4
1,gronkowski.rob,F0009/MID1,F0009/MID2
2,gronkowski.rob,F0009/MID1,F0009/MID3
3,gronkowski.rob,F0009/MID1,F0009/MID4
4,gronkowski.rob,F0009/MID1,F0009/MID5
...,...,...,...
706,sweden.royal.family,F0986/MID5,F0986/MID7
707,sweden.royal.family,F0986/MID5,F0986/MID8
708,sweden.royal.family,F0986/MID6,F0986/MID7
709,sweden.royal.family,F0986/MID6,F0986/MID8


# What are the counts of each relationship type?

In [112]:
# Lookup from RID code to relationship label; rows with any missing value are
# dropped before the mapping is built.
rid_labels = (
    pd.read_csv("/Users/zkhan/master-version/fiwdb/FIW_RIDs.csv")
    .set_index("RID")
    .dropna()
)
rid = rid_labels["Label"].to_dict()

In [113]:
# One relationship matrix per family folder matching "F*", keyed by the folder
# name stem and indexed by the MID column of that folder's mid.csv.
relmats = {
    family_dir.stem: pd.read_csv(family_dir / "mid.csv").set_index("MID")
    for family_dir in fiwdb.glob("F*")
}

In [114]:
# Role names for each directed relationship label, stored as
# ((p1 role if male, p1 role otherwise), (p2 role if male, p2 role otherwise)).
_DIRECTED_ROLES = {
    "Child": (("son", "daughter"), ("father", "mother")),
    "Parent": (("father", "mother"), ("son", "daughter")),
    "Grandparent": (("grandfather", "grandmother"), ("grandson", "granddaughter")),
    "Grandchild": (("grandson", "granddaughter"), ("grandfather", "grandmother")),
    "Great Grandparent": (
        ("greatgrandfather", "greatgrandmother"),
        ("greatgrandson", "greatgranddaughter"),
    ),
    # Mirror image of "Great Grandparent", so both directions of the same pair
    # produce the same label.
    "Great Grandchild": (
        ("greatgrandson", "greatgranddaughter"),
        ("greatgrandfather", "greatgrandmother"),
    ),
}


def _split_person(person):
    """Split a '<fid>/MID<n>[/...]' string into (fid, n) with n as an int."""
    fid, mid_token, *_ = person.split("/")
    return fid, int(mid_token.split("MID")[-1])


def proper_relationship(p1, p2, relmats, rid):
    """
    Return the gendered relationship type of a pair, e.g. 'daughter-father'.

    p1, p2:
        Strings like 'F0008/MID1'; anything after a second '/' is ignored.
    relmats:
        Mapping from family id to that family's relationship matrix: a frame
        indexed by MID, with one column per MID (as a string) holding
        relationship codes, and a 'Gender' column.
    rid:
        Mapping from relationship code to label ('Child', 'Parent', ...).

    Returns 'NOT_RELATED' when the two people belong to different families or
    when their matrix entry is not a key of `rid`. Otherwise returns the two
    role names sorted alphabetically and joined with '-'. A label without a
    known role mapping is printed and yields '-unknown'.

    Raises ValueError if a member id is not numeric, and KeyError if the
    family or a member is absent from `relmats`.
    """
    fid1, mid1 = _split_person(p1)
    fid2, mid2 = _split_person(p2)
    if fid1 != fid2:
        return "NOT_RELATED"

    matrix = relmats[fid1]
    rel_idx = matrix.loc[mid1][str(mid2)]
    if rel_idx not in rid:
        return "NOT_RELATED"
    rel = rid[rel_idx]

    # Gender is only needed for related pairs, so it is read after the checks
    # above. Only the first character is compared; anything but "m" is treated
    # as not male.
    p1_male = matrix.loc[mid1, "Gender"][0] == "m"
    p2_male = matrix.loc[mid2, "Gender"][0] == "m"

    if rel == "Sibling":
        if p1_male and p2_male:
            p1_role, p2_role = "brother", "brother"
        elif (not p1_male) and (not p2_male):
            p1_role, p2_role = "sister", "sister"
        else:
            p1_role, p2_role = "sibling", "sibling"
    elif rel == "Spouse":
        p1_role, p2_role = "spouse", "spouse"
    elif rel in _DIRECTED_ROLES:
        p1_names, p2_names = _DIRECTED_ROLES[rel]
        p1_role = p1_names[0] if p1_male else p1_names[1]
        p2_role = p2_names[0] if p2_male else p2_names[1]
    else:
        # Surface labels that have no role mapping yet.
        print(rel)
        p1_role, p2_role = "unknown", ""
    return "-".join(sorted([p1_role, p2_role]))

In [115]:
# Label every candidate pair with its gendered relationship type.
proper_roles = []
for row in kin_pairs.itertuples():
    try:
        proper_roles.append(proper_relationship(row.p1, row.p2, relmats, rid))
    except Exception:
        # Report the offending pair, then re-raise. Stopping quietly would
        # leave proper_roles shorter than kin_pairs, so every remaining pair
        # would end up without a label and be missing from later counts.
        print(row)
        raise

In [116]:
# Attach the labels as a new column. pandas matches the Series to kin_pairs by
# index label, so this relies on kin_pairs having the default 0..n-1 index;
# rows without a matching label become NaN.
kin_pairs["ptype"] = pd.Series(proper_roles)

In [117]:
# Number of pairs per relationship type, most frequent first.
# The Series is named explicitly rather than renaming a 'ptype' column after
# to_frame(): newer pandas names the value_counts() result 'count', so a
# column-based rename would silently do nothing there.
pair_type_counts = (
    kin_pairs["ptype"]
    .value_counts()
    .rename('num_pairs')
    .rename_axis('ptype')
    .to_frame()
)
pair_type_counts.to_csv('./pair_type_counts.csv')
pair_type_counts

Unnamed: 0_level_0,num_pairs
ptype,Unnamed: 1_level_1
NOT_RELATED,126
daughter-father,94
father-son,90
brother-brother,80
mother-son,73
spouse-spouse,67
daughter-mother,57
sibling-sibling,45
sister-sister,43
grandfather-grandson,9


In [118]:
# Save the pair table as it stands at this point, without the row index.
kin_pairs.to_csv('./kin_pairs.csv', index=False)

# Number of videos per relationship type

In [119]:
def get_clips_for_person(person, family_members):
    """
    Return the non-missing video entries of one individual.

    person:
        A string like 'F0008/MID1'.
    family_members:
        Frame with fid, mid, video, video2 and video3 columns.

    The first row matching both the family id and the member id is used; the
    result lists its video, video2 and video3 values in that order, skipping
    missing ones. Raises IndexError when no row matches.
    """
    segments = person.split('/')
    family_id = segments[0]
    member_id = int(segments[-1].rpartition('MID')[2])

    is_person = (family_members['fid'] == family_id) & (family_members['mid'] == member_id)
    record = family_members[is_person].iloc[0]

    return [
        clip
        for clip in (record['video'], record['video2'], record['video3'])
        if not pd.isna(clip)
    ]

In [120]:
def count_clips_for_pair(p1, p2, family_members) -> int:
    """
    Count the clips available for a pair of individuals.

    p1, p2:
        Strings like 'F0008/MID1'.

    The result is the number of clips both people share, plus the product of
    the number of clips only p1 has and the number of clips only p2 has.
    """
    first = set(get_clips_for_person(p1, family_members))
    second = set(get_clips_for_person(p2, family_members))

    shared = first.intersection(second)
    only_first = first.difference(shared)
    only_second = second.difference(shared)

    return len(shared) + len(only_first) * len(only_second)

In [121]:
# Clip count of every candidate pair, in kin_pairs row order.
clips_for_kin_pair = [
    count_clips_for_pair(row.p1, row.p2, family_members)
    for row in kin_pairs.itertuples()
]
kin_pairs['clips_for_pair'] = pd.Series(clips_for_kin_pair)

In [122]:
# Total clip count per relationship type.
clips_for_pair = kin_pairs.groupby('ptype')[['clips_for_pair']].sum()

In [123]:
# Save the per-relationship-type clip totals, indexed by ptype.
clips_for_pair.to_csv('./clips_for_pair_types.csv')

In [124]:
# Show the clip totals per relationship type.
clips_for_pair

Unnamed: 0_level_0,clips_for_pair
ptype,Unnamed: 1_level_1
NOT_RELATED,156
brother-brother,120
daughter-father,153
daughter-mother,145
father-son,129
granddaughter-grandfather,14
granddaughter-grandmother,20
grandfather-grandson,12
grandmother-grandson,7
greatgranddaughter-greatgrandfather,2


# How many subjects share a video?

In [73]:
# Pool the URLs of the three video columns (missing entries skipped) and count
# how many times each URL is listed.
video_columns = ('video', 'video2', 'video3')
all_urls = [
    url
    for column in video_columns
    for url in family_members[column].dropna().values
]
url_counts = pd.Series(all_urls).value_counts()

# Histogram of those counts: how many URLs are listed exactly k times.
urls_shared_counts = pd.Series(url_counts.values).value_counts()

In [74]:
# Table of "number of members listing a video" -> "number of such videos",
# ordered by the number of members.
# The Series is named before to_frame() rather than renaming column 0
# afterwards: the label of the value_counts() column differs between pandas
# versions, so a rename keyed on 0 can silently do nothing.
url_shared_counts = (
    urls_shared_counts
    .rename('num_videos')
    .rename_axis('num_members')
    .sort_index()
    .to_frame()
)
url_shared_counts

Unnamed: 0_level_0,num_videos
num_members,Unnamed: 1_level_1
1,568
2,19
3,5
4,1
7,1


In [75]:
# Save the shared-video histogram, indexed by num_members.
url_shared_counts.to_csv('num_videos_with_k_members.csv')

# What are the ethnicities of the subjects?

In [76]:
# Members that have both an ethnicity label and a primary video.
family_members_eth = family_members.dropna(subset=['ethnicity', 'video'])

In [77]:
# Number of members per ethnicity label, among members that have a video.
family_members_eth.ethnicity.value_counts()

white                345
african               57
jewish                19
middle-east           14
asian                 11
white-middle-east     10
white-jewish           8
latino                 7
white-african          4
african-jewish         3
asian-white            3
white-latino           3
indian                 2
Name: ethnicity, dtype: int64

In [82]:
# Ethnicity counts as a two-column table: ethnicity, count.
# The column names are set explicitly instead of renaming 'index' and
# 'ethnicity' after reset_index(): newer pandas already names the
# value_counts() result 'count' and its index 'ethnicity', so that rename
# would produce two columns called 'count'.
eth_counts = (
    family_members_eth['ethnicity']
    .value_counts()
    .rename_axis('ethnicity')
    .reset_index(name='count')
)

In [83]:
# Show the ethnicity count table.
eth_counts

Unnamed: 0,ethnicity,count
0,white,345
1,african,57
2,jewish,19
3,middle-east,14
4,asian,11
5,white-middle-east,10
6,white-jewish,8
7,latino,7
8,white-african,4
9,african-jewish,3


In [84]:
# Save the ethnicity count table without the row index.
eth_counts.to_csv("./ethnicity_counts.csv", index=False)