In [67]:
import pandas as pd
import sql_functions as sf

schema = 'organic_africa' 
engine = sf.get_engine()

## Collecting Date of Birth

In order to merge with 'wc_info__WCOrg_1_df'

In [68]:
all_birthdates = {}

all_birthdates['wc_insp_2013__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2013__WCOrg"')
all_birthdates['wc_insp_2014__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2014__WCOrg"')
all_birthdates['wc_insp_2015__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2015__WCOrg"')
all_birthdates['wc_insp_2016__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2016__WCOrg"')
all_birthdates['wc_insp_2017__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2017__WCOrg"')

In [69]:
# Set all column names to lowercase and convert date_of_birth in all dfs to datetime

for df in all_birthdates.values():
    df.columns = map(str.lower, df.columns)
    df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])

In [70]:
# Keep only 2 columns

columns_to_keep = ['wc_nr','date_of_birth']
all_birthdates['wc_insp_2013__WCOrg_df'] = all_birthdates['wc_insp_2013__WCOrg_df'][columns_to_keep]
all_birthdates['wc_insp_2014__WCOrg_df'] = all_birthdates['wc_insp_2014__WCOrg_df'][columns_to_keep]
all_birthdates['wc_insp_2015__WCOrg_df'] = all_birthdates['wc_insp_2015__WCOrg_df'][columns_to_keep]
all_birthdates['wc_insp_2016__WCOrg_df'] = all_birthdates['wc_insp_2016__WCOrg_df'][columns_to_keep]
all_birthdates['wc_insp_2017__WCOrg_df'] = all_birthdates['wc_insp_2017__WCOrg_df'][columns_to_keep]

In [71]:
all_birthdates['wc_insp_2013__WCOrg_df']

Unnamed: 0,wc_nr,date_of_birth
0,1775.0,1955-10-05
1,2901.0,NaT
2,2902.0,1995-09-16
3,2903.0,1933-01-01
4,2904.0,1932-09-03
...,...,...
276,2472.0,NaT
277,1369.0,1971-04-24
278,1371.0,1973-03-23
279,1403.0,1952-07-06


#### Concating all birthdate dfs from dict

In [72]:
all_birthdates_df = pd.concat(all_birthdates.values())
all_birthdates_df 

Unnamed: 0,wc_nr,date_of_birth
0,1775.0,1955-10-05
1,2901.0,NaT
2,2902.0,1995-09-16
3,2903.0,1933-01-01
4,2904.0,1932-09-03
...,...,...
5,3965.0,1974-08-05
6,3699.0,1957-06-01
7,4143.0,1963-02-02
8,4144.0,1972-09-01


In [73]:
all_birthdates_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5473 entries, 0 to 9
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   wc_nr          5472 non-null   float64       
 1   date_of_birth  341 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1)
memory usage: 128.3 KB


#### Droping duplicates

In [74]:
# Counting duplicated Wc_Nr to drop them

all_birthdates_df["wc_nr"].duplicated().value_counts()

False    3745
True     1728
Name: wc_nr, dtype: int64

In [75]:
all_birthdates_df = all_birthdates_df.drop_duplicates(subset="wc_nr")

In [76]:
all_birthdates_df.isnull().sum()

wc_nr               1
date_of_birth    3454
dtype: int64

In [77]:
all_birthdates_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3745 entries, 0 to 9
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   wc_nr          3744 non-null   float64       
 1   date_of_birth  291 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1)
memory usage: 87.8 KB


In [78]:
all_birthdates_df["date_of_birth"].isnull().value_counts()

True     3454
False     291
Name: date_of_birth, dtype: int64

In [81]:
table_name = 'all_birthdates'

if engine!=None:
    try:
        all_birthdates_df.to_sql(name=table_name, # Name of SQL table
                                con=engine, # Engine or connection
                                if_exists='replace', # Drop the table before inserting new values 
                                schema=schema, # Use schmea that was defined earlier
                                index=False, # Write DataFrame index as a column
                                chunksize=5000, # Specify the number of rows in each batch to be written at a time
                                method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
     #Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The all_birthdates table was imported successfully.
