In [8]:
import pandas as pd


def combine_reduntant_columns(df, column):
    """Collapse every column labelled ``column`` into a single column.

    For each row the value kept is the first non-missing one, scanning the
    same-named columns from left to right. The merged column is appended as
    the last column of the result; the input frame is not modified.

    Args:
        df: DataFrame holding one or more columns labelled ``column``.
        column: Label of the column(s) to merge.

    Returns:
        A new DataFrame containing exactly one ``column`` column.

    Raises:
        KeyError: If no column is labelled ``column``.
    """
    # A boolean mask always yields a DataFrame, even when the label occurs only
    # once. Plain df[column] returns a Series in that case, and both
    # bfill(axis=1) and .iloc[:, 0] fail on a Series.
    same_named = df.loc[:, df.columns == column]
    if same_named.shape[1] == 0:
        raise KeyError(column)

    # Back-filling along each row pulls the first non-null value into position 0.
    combined_columns = same_named.bfill(axis=1).iloc[:, 0]

    new_df = df.drop(column, axis=1)
    new_df[column] = combined_columns
    return new_df

# read in phenotypes from CSV file ('-' and 'rather not say' are parsed as missing values)
df = pd.read_csv('../../datasets/OpenSNP/phenotypes.csv',
                 delimiter=';', na_values=['-', 'rather not say'], low_memory=False)

# clean column names: lowercase, with spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# combine duplicate columns into one
df = combine_reduntant_columns(df, 'hair_color')

# lowercase hair colors
df['hair_color'] = df['hair_color'].str.lower()

# drop rows with missing hair colors (reassign instead of mutating in place)
df = df.dropna(subset=['hair_color'])

# drop exome-vcf files; a vectorised substring test replaces the row-wise apply.
# Rows whose filename is missing count as "not exome-vcf" and are kept.
is_exome_vcf = df['genotype_filename'].str.contains('exome-vcf', regex=False, na=False)
# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger a chained-assignment (SettingWithCopy) warning.
df = df[~is_exome_vcf].copy()

# convert DOB years to nullable integers, so missing years stay missing
# instead of forcing the whole column to float
df['date_of_birth'] = df['date_of_birth'].astype('Int64')

In [9]:
# figure out the reference builds being used by each file

# Accumulator for one reference-build string per row of df, appended in
# df.iterrows() order and later stored as the 'reference_build' column.
reference_builds = []

def get_file_type_and_name(row):
    """Derive the genotype file type and a file name from one phenotype row.

    Args:
        row: A row (Series) providing 'genotype_filename', 'date_of_birth'
            and 'chrom_sex'. 'genotype_filename' must consist of exactly
            three dot-separated parts: user id, file type and file id.

    Returns:
        A ``(file_type, file_name)`` tuple.

    Raises:
        ValueError: If 'genotype_filename' does not split into three parts.
    """
    # Missing fields appear literally as 'unknown' in the generated name.
    filled = row.fillna('unknown')

    user_id, file_type, file_id = filled['genotype_filename'].split('.')

    stem = 'user{}_file{}_yearofbirth_{}_sex_{}'.format(
        user_id, file_id, filled['date_of_birth'], filled['chrom_sex'])
    return file_type, f'{stem}.{file_type}.txt'


def read_23andme(file_name):
    """Placeholder for reading the reference build of a 23andme file.

    Parsing is not implemented yet: ``file_name`` is ignored and the
    unknown-build marker '?' is always returned.
    """
    # TODO: read the file!
    return '?'

def read_ftdna_illumina(file_name):
    """Placeholder for reading the reference build of an ftdna-illumina file.

    Parsing is not implemented yet: ``file_name`` is ignored and the
    unknown-build marker '?' is always returned.
    """
    # TODO: read the file!
    return '?'

def read_ancestry(file_name, data_dir='../../datasets/OpenSNP/', build_line=15, build_token=7):
    """Read the reference build recorded in the header of an ancestry genotype file.

    The build is taken to be one whitespace-separated token on one line of
    the file (by default the 8th token of the 16th line).
    NOTE(review): these positions are the indices the notebook originally
    hard-coded; the header layout itself is not visible here, so confirm
    them against sample files.

    Args:
        file_name: Name of the genotype file inside ``data_dir``.
        data_dir: Directory prefix, prepended to ``file_name`` verbatim, so
            it must end with a path separator.
        build_line: Zero-based index of the line that holds the build.
        build_token: Zero-based index of the build token on that line.

    Returns:
        The build token as a string, or '?' when the file cannot be decoded
        or does not contain the requested line or token.

    Raises:
        OSError: If the file cannot be opened (for example it does not exist).
    """
    try:
        with open(data_dir + file_name) as input_file:
            # Stop at the line of interest instead of always reading a fixed
            # number of lines, so short files no longer raise StopIteration.
            for line_number, line in enumerate(input_file):
                if line_number == build_line:
                    tokens = line.split()
                    return tokens[build_token] if build_token < len(tokens) else '?'
    except UnicodeDecodeError:
        return '?'
    # The file ended before reaching build_line.
    return '?'


# Reader to use for each known genotype file type; any other type gets '?'.
build_readers = {
    '23andme': read_23andme,
    'ancestry': read_ancestry,
    'ftdna-illumina': read_ftdna_illumina,
}

for _, phenotype_row in df.iterrows():
    file_type, file_name = get_file_type_and_name(phenotype_row)
    reader = build_readers.get(file_type)
    reference_builds.append('?' if reader is None else reader(file_name))

# one entry was appended per row, so the list lines up with df's rows
df['reference_build'] = reference_builds

In [10]:
# Preview the key columns. A list selects several labels; pandas reserves
# tuples for a single MultiIndex key, so a tuple here is easy to misread.
df.loc[:, ['user_id', 'genotype_filename', 'date_of_birth', 'chrom_sex', 'hair_color', 'reference_build']]

Unnamed: 0,user_id,genotype_filename,date_of_birth,chrom_sex,hair_color,reference_build
0,1,1.23andme.9,1985,XY,blonde,?
1,1,1.23andme.2995,1985,XY,blonde,?
3,1,1.23andme.5299,1985,XY,blonde,?
5,8,8.23andme.2,,XX,brown,?
6,10,10.23andme.3,1982,XY,brown,?
...,...,...,...,...,...,...
6865,12123,12123.ancestry.9981,,,light brown,37.1
6902,12200,12200.ftdna-illumina.10050,,XY,"blonde as a child, to brown as an adult",?
6915,12223,12223.23andme.10068,,,black,?
6918,12226,12226.23andme.10072,1980,XY,black,?


In [11]:
# reconcile hair colors into one of three categories: blonde, brown, or black

blonde = {'blonde', 'blond', 'dirt-blonde'}
brown = {'brown', 'medium brown', 'dark brown', 'auburn', 'dirt-brown', 'light brown',
         'blond born, today dark brown', 'light to medium brown', 'auburn (reddish-brown)'}
black = {'black', 'brown-black'}

# Categories are checked in this order; the first set containing the answer wins.
hair_color_categories = (('blonde', blonde), ('brown', brown), ('black', black))

# Count every raw answer once, rather than re-filtering df for each unique value.
hair_color_counts = df['hair_color'].value_counts()

# Report, for every distinct raw answer, its category (blank when it matches
# none of the three sets) and how many samples gave that answer.
for i, hair_color in enumerate(df['hair_color'].unique()):
    category = next((name for name, members in hair_color_categories
                     if hair_color in members), '')
    # pad to six characters so the printed columns line up
    label = category.ljust(6)

    num_samples = int(hair_color_counts.get(hair_color, 0))
    print(i, '\t', label, '\t', num_samples, '\t', hair_color)

0 	 blonde 	 59 	 blonde
1 	 brown  	 181 	 brown
2 	 black  	 72 	 brown-black
3 	 brown  	 44 	 light to medium brown
4 	        	 2 	 strawberry brown
5 	 brown  	 26 	 auburn (reddish-brown)
6 	 brown  	 6 	 dirt-brown
7 	 black  	 71 	 black
8 	 brown  	 156 	 dark brown
9 	        	 4 	 brown going to white in early 40s
10 	 blonde 	 26 	 dirt-blonde
11 	 brown  	 37 	 light brown
12 	        	 4 	 cc
13 	 brown  	 28 	 blond born, today dark brown
14 	 brown  	 9 	 auburn
15 	 brown  	 35 	 medium brown
16 	        	 24 	 dark brown; red highlights
17 	        	 12 	 dirty blonde, light brown, something?
18 	        	 24 	 red
19 	        	 8 	 medium golden brown
20 	        	 15 	 strawberry blonde
21 	        	 38 	 dark blonde (light brown)
22 	        	 10 	 light ashy brown
23 	        	 3 	 blackish brown
24 	        	 11 	 blonde as a child, to brown as an adult
25 	 blonde 	 7 	 blond
26 	        	 1 	 dark brown; blonde highlights
27 	        	 2 	 red (gone blond-grey

In [12]:
# number of blonde samples (count of rows whose hair color is in the blonde set)
int(df['hair_color'].str.lower().isin(blonde).sum())

92

In [13]:
# number of brown samples (count of rows whose hair color is in the brown set)
int(df['hair_color'].str.lower().isin(brown).sum())

522

In [14]:
# number of black samples (count of rows whose hair color is in the black set)
int(df['hair_color'].str.lower().isin(black).sum())

143