# <center> Data Preparation </center>

In [1]:
# Constants: input CSV locations, export location, and allele-pair vocabulary
DATA_PATH_CHILD = '../data/raw/G2000_data.csv'
DATA_PATH_PARENT = '../data/raw/EthA_data.csv'
EXPORT_PATH = '../data/processed/processed_merged_df.pkl'

# Every two-letter combination over A, C, G, T, in lexicographic order (16 entries)
_BASES = 'ACGT'
FULL_SEQ = [first + second for first in _BASES for second in _BASES]

In [2]:
import pandas as pd 
import numpy as np
import logging 
import pickle

In [3]:
# Load the raw child CSV from the path configured in DATA_PATH_CHILD
raw_df_child = pd.read_csv(filepath_or_buffer=DATA_PATH_CHILD)

In [4]:
# Work on an independent (deep) copy so the raw child frame stays untouched
df_child = raw_df_child.copy(deep=True)

In [5]:
# Load the raw parent CSV from the path configured in DATA_PATH_PARENT
raw_df_parent = pd.read_csv(filepath_or_buffer=DATA_PATH_PARENT)

In [6]:
# Work on an independent (deep) copy so the raw parent frame stays untouched
df_parent = raw_df_parent.copy(deep=True)

<center> - Creating a new column in each DataFrame by concatenating the values of two existing columns (<code>Allele1</code> and <code>Allele2</code>) </center>

<center><b>Full sequence of alleles</b></center>

In [7]:
# Children: full allele sequence, built element-wise as Allele1 + Allele2
df_child['Child_full_DNA_Seq'] = df_child['Allele1'].add(df_child['Allele2'])

In [8]:
# Parents: full allele sequence, built element-wise as Allele1 + Allele2
df_parent['Parent_full_DNA_Seq'] = df_parent['Allele1'].add(df_parent['Allele2'])

In [9]:
# Make a real copy of the child (G2000) columns ['ParentM', 'ParentF', 'Child_full_DNA_Seq'].
# The explicit .copy() detaches the subset from df_child, so later assignments on
# df_child_data_copy do not trigger SettingWithCopyWarning.
df_child_data_copy = df_child[['ParentM', 'ParentF', 'Child_full_DNA_Seq']].copy()

In [10]:
# Display 10 randomly chosen rows of df_child_data_copy
# (G2000 columns ['ParentM', 'ParentF', 'Child_full_DNA_Seq']).
# A fixed random_state keeps the displayed rows identical across re-runs.
df_child_data_copy.sample(10, random_state=42)

Unnamed: 0,ParentM,ParentF,Child_full_DNA_Seq
1075,A9425,A6307,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
11019,A3649,A9479,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
11848,A13804,A34,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
4810,A7775,A11977,CTCCGTCGACGCTTTAGGGACGTAGATGGGAGCTCTGATTCCCGTG...
11809,A6385,A6732,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTTCCGTG...
8547,A12554,A9844,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
10343,A13503,A7604,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
10242,A13802,A10489,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
2992,A12455,A9489,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
844,A8476,A4553,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...


In [11]:
# Make a real copy of the parent (EthA) columns ['Name', 'Parent_full_DNA_Seq'].
# The explicit .copy() detaches the subset from df_parent, so later assignments on
# df_parent_data_copy do not trigger SettingWithCopyWarning.
df_parent_data_copy = df_parent[['Name', 'Parent_full_DNA_Seq']].copy()

In [14]:
# Display 10 randomly chosen rows of df_parent_data_copy.
# A fixed random_state keeps the displayed rows identical across re-runs.
df_parent_data_copy.sample(10, random_state=42)

Unnamed: 0,Name,Parent_full_DNA_Seq
12513,A12513,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
2292,A2292,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
5465,A5465,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGACTCCCGTG...
5659,A5659,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATCCCCGTG...
14400,A14400,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
2756,A2756,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
10754,A10754,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
12458,A12458,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
3984,A3984,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
2453,A2453,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...


## **Merging Operation**

**Merging :**

Merge the parent and child DataFrames. On the parent side the join key is the `Name` column of `df_parent`; on the child side it is the `ParentF` (father) or `ParentM` (mother) column of `df_child`.



**Merge keys:**

- Left DataFrame key (`left_on`): `Name` (from `df_parent`)
- Right DataFrame key (`right_on`): `ParentF` (father) or `ParentM` (mother) (from `df_child`)



In [15]:
# Merge the parent frame with the child frame twice: once matching parents to the
# children's 'ParentF' column (father) and once to 'ParentM' (mother).
# - how='inner' is the pandas default, spelled out here: rows without a match on
#   either side are dropped.
# - validate='one_to_many' makes pandas raise MergeError if 'Name' is duplicated in
#   the parent frame, instead of silently duplicating child rows.
parent_child_merge_father = pd.merge(
    df_parent_data_copy,
    df_child_data_copy,
    how='inner',
    left_on='Name',
    right_on='ParentF',
    validate='one_to_many',
)
parent_child_merge_mother = pd.merge(
    df_parent_data_copy,
    df_child_data_copy,
    how='inner',
    left_on='Name',
    right_on='ParentM',
    validate='one_to_many',
)

In [16]:
# Inspect 10 randomly chosen rows of the father merge.
# A fixed random_state keeps the displayed rows identical across re-runs, so any
# prose that refers to a specific displayed row stays valid.
parent_child_merge_father.sample(10, random_state=42)

Unnamed: 0,Name,Parent_full_DNA_Seq,ParentM,ParentF,Child_full_DNA_Seq
8562,A5643,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A5036,A5643,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
9201,A6079,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A9347,A6079,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
13777,A9398,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTTCCGTG...,A11105,A9398,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
11242,A7527,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A2061,A7527,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
5579,A3697,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A11826,A3697,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
13744,A9378,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTTATTCCCGTG...,A14299,A9378,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
7447,A4917,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A12398,A4917,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
17405,A11976,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A13395,A11976,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
58,A29,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A14481,A29,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
14159,A9647,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A6825,A9647,CTCCGTCGGCGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...


- In the first example row, `Name = 'A5643'`, `ParentF = 'A5643'`, and `ParentM = 'A5036'`.

- Since `Name` and `ParentF` hold the same value, they refer to the same person.

- According to the data source, `Name` is a unique identifier for a person.

- We therefore conclude that, in this merged DataFrame, `Name` identifies the father of the child.

In [17]:
# Inspect 10 randomly chosen rows of the mother merge.
# A fixed random_state keeps the displayed rows identical across re-runs.
parent_child_merge_mother.sample(10, random_state=42)

Unnamed: 0,Name,Parent_full_DNA_Seq,ParentM,ParentF,Child_full_DNA_Seq
8375,A5739,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A5739,A6087,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
19693,A13497,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A13497,A7654,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
15817,A10764,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A10764,A329,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
18754,A12811,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A12811,A4669,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
16804,A11434,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGACTCCCGTG...,A11434,A5071,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGACTCCCGTG...
20535,A14110,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCTGTG...,A14110,A13922,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
5085,A3306,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A3306,A10288,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
6087,A4015,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A4015,A623,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
18225,A12454,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTTATTCCCGTG...,A12454,A10410,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTTATTCCCGTG...
15621,A10597,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,A10597,A10637,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...
