In [6]:
import numpy as np
import pandas as pd

def _impute_by_row_position(table):
    """Return a copy of ``table`` with NaNs filled by a per-column linear fit.

    For each column, a degree-1 polynomial of value against row position is
    fitted once on the observed (non-NaN) entries and evaluated at the rows
    that are missing. Columns with fewer than two observed values cannot be
    fitted and are left untouched.
    """
    filled = table.copy()
    rows = np.arange(filled.shape[0])
    for col in range(filled.shape[1]):
        observed = ~np.isnan(filled[:, col])
        if observed.all():
            continue
        if observed.sum() < 2:
            # A line needs at least two points; do not invent values here.
            print("column {} has fewer than 2 observed values, "
                  "leaving its missing values unfilled".format(col))
            continue
        slope, intercept = np.polyfit(rows[observed], filled[observed, col], 1)
        filled[~observed, col] = slope * rows[~observed] + intercept
    return filled


def fill_missing_clinic_traits(name_tags, trait):
    """Build the per-mouse table of one clinic trait and impute its gaps.

    Reads the clinic sheet and the five cleaned RNA tables from disk, takes
    the union of the RNA row labels, looks up the ``name_tags`` columns of
    the clinic sheet for each of those mice, fills missing values and writes
    the result to ``./cleaned/<trait>.csv`` (columns wk4, wk6, wk8, wk10,
    indexed by the original RNA row labels).

    Mice that appear in the RNA data but not in the clinic sheet are reported
    and treated as missing (NaN) so that they are imputed like any other gap,
    instead of being written out as zeros.

    NOTE: the regressor used for imputation is the row position of the mouse
    in the RNA label union, not a biological covariate.

    Parameters
    ----------
    name_tags : sequence of str
        The four clinic column labels holding this trait, one per week label.
    trait : str
        Trait name; used in log messages and as the output file stem.

    Raises
    ------
    ValueError
        If ``name_tags`` does not have one entry per week label, or if the
        clinic sheet holds more than one row for a requested mouse.
    """
    week_labels = ['wk4', 'wk6', 'wk8', 'wk10']
    id_prefix = 'Mouse'

    print("Filling missing values in {}".format(trait))
    print("The name tags are {}".format(name_tags))

    # A list (not a tuple) is required so .loc treats it as a set of labels.
    columns = list(name_tags)
    if len(columns) != len(week_labels):
        raise ValueError("expected {} name tags (one per week), got {}".format(
            len(week_labels), len(columns)))

    # load in the clinic phenotypes; IDs are compared as strings below
    clinic = pd.read_csv('raw_data/(F2)B6_BTBR OB mice_clinic_traits.csv', index_col=0)
    clinic.index = clinic.index.astype(str)

    # load in rna expression data and take the union of all row labels
    tissues = ['islet', 'liver', 'adipose', 'kidney', 'gastroc']
    rna_index = None
    for tissue in tissues:
        tissue_index = pd.read_csv('cleaned/{}_rna.csv'.format(tissue), index_col=0).index
        rna_index = tissue_index if rna_index is None else rna_index.union(tissue_index)

    # convert the labels to strings and drop the leading 'Mouse' when present
    mouse_id = [str(label) for label in rna_index]
    mouse_id = [m[len(id_prefix):] if m.startswith(id_prefix) else m for m in mouse_id]

    known_ids = set(clinic.index)
    for m in mouse_id:
        if m not in known_ids:
            print("mouse id {} not in clinic data".format(m))

    # Pull only the requested mice/columns, convert the text cells to floats,
    # then align to the RNA order; mice absent from the clinic sheet become NaN rows.
    selected = clinic.loc[clinic.index.isin(mouse_id), columns]
    if selected.index.has_duplicates:
        dupes = sorted(set(selected.index[selected.index.duplicated()]))
        raise ValueError("clinic data has several rows for mouse id(s) {}".format(dupes))
    table = selected.apply(pd.to_numeric).reindex(mouse_id).to_numpy(dtype=float)

    # check if there is missing value
    print("There are {} missing values in {}".format(np.isnan(table).sum(), trait))

    # fill each column's missing values using linear regression
    table = _impute_by_row_position(table)

    print("After regression, there are {} missing values in {}".format(np.isnan(table).sum(), trait))

    # save the table to csv file
    table = pd.DataFrame(table, index=rna_index, columns=week_labels)
    table.to_csv("./cleaned/{}.csv".format(trait))

In [9]:
# Load the raw clinic sheet with its first column as the row index, then show it.
clinic_csv = 'raw_data/(F2)B6_BTBR OB mice_clinic_traits.csv'
clinic = pd.read_csv(clinic_csv, index_col=0)
clinic

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,4 wk Orbital Eye Bleed,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,6 wk Orbital Eye Bleed,...,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,10 wk Orbital Eye Bleed,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
Mouse ID,SEX,BIRTH DATE,SAC DATE,DATE,WEIGHT (g),BODY LENGTH (cm),GLUCOSE (mg/dl),INSULIN (ng/ml),TRIGLYCERIDE (mg/dl),DATE,...,BODY LENGTH (cm),GLUCOSE (mg/dl),INSULIN (ng/ml),TRIGLYCERIDE (mg/dl),DATE,WEIGHT (g),BODY LENGTH (cm),GLUCOSE (mg/dl),INSULIN (ng/ml),TRIGLYCERIDE (mg/dl)
3001,F,6/2/2005,8/15/2005,6/30/2005,23.1,75,637.351,8.325,132.093,7/14/2005,...,91,617.04975,11.736915,175.073,8/15/2005,58.5,9.5,610.992,4.361187985,249.2594
3002,F,6/2/2005,8/15/2005,,22.8,80,261.842,26.706,112.316,,...,103,256.49315,50.57973997,97.6136,8/15/2005,64.1,10,157.816,28.43120124,114.6497
3003,M,6/3/2005,8/15/2005,,24.1,80,124.065,3.459,54.124,,...,99,274.91575,52.48665679,160.4792,8/15/2005,60.5,10,299.6424,53.65342377,188.3167
3004,M,6/3/2005,8/15/2005,,21,78,254.393,4.513,49.56,,...,92,615.0759,9.000121276,238.687,8/15/2005,53.9,9.3,567.0592,8.061911337,299.4869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3678,M,4/16/2007,6/28/2007,,27.8,80,302.54676,19.5149424,43.37998,,...,91,539.3234,15.54502356,204.26178,,,,,,
3679,F,5/10/2007,,,23,77,110.2602,6.391928419,94.42487,,...,,,,,,,,,,
3680,F,,,,22.9,76,124.935,4.229446967,82.8039,,...,,,,,,,,,,
3681,M,,,,19.1,74,448.4794,2.434944016,109.41967,,...,,,,,,,,,,


In [12]:
# Clinic-sheet column labels holding each trait, four columns per trait.
glucose = ['Unnamed: 7', 'Unnamed: 13', 'Unnamed: 19', 'Unnamed: 25']
weight = ['Unnamed: 5', 'Unnamed: 11', 'Unnamed: 17', 'Unnamed: 23']
insulin = ['Unnamed: 8', 'Unnamed: 14', 'Unnamed: 20', 'Unnamed: 26']
triglyceride = ['Unnamed: 9', 'Unnamed: 15', 'Unnamed: 21', 'Unnamed: 27']

name_tags = [glucose, weight, insulin, triglyceride]
# Every entry needs its own comma: adjacent string literals are silently
# concatenated ('weight' 'insulin' -> 'weightinsulin'), which shifts every
# later trait onto the wrong columns.
traits = ['glucose', 'weight', 'insulin', 'triglyceride']
assert len(name_tags) == len(traits), "each trait needs exactly one list of column tags"

# Pair each trait name with its columns explicitly instead of by position.
for trait_columns, trait_name in zip(name_tags, traits):
    fill_missing_clinic_traits(trait_columns, trait_name)

Filling missing values in insulin
The name tags are ['Unnamed: 8', 'Unnamed: 14', 'Unnamed: 20', 'Unnamed: 26']
There are 13 missing values in insulin
After regression, there are 0 missing values in insulin
Filling missing values in triglyceride
The name tags are ['Unnamed: 9', 'Unnamed: 15', 'Unnamed: 21', 'Unnamed: 27']
There are 12 missing values in triglyceride
After regression, there are 0 missing values in triglyceride
