In [74]:
import pandas as pd

import sql_functions as sf

In [75]:
schema = 'organic_africa' # UPDATE 'TABLE_SCHEMA' based on schema used in class 
engine = sf.get_engine()

**Importing tables of farmer data as Dataframes**

5 Dfs stored in one Dictionary 

In [140]:
all_farmers = {}

all_farmers['OGInfo__OGMB_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OGInfo__OGMB_1"')
all_farmers['OG_Info__OGOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OG_Info__OGOrg_1"')
all_farmers['wc_info_bayoba_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_info_bayoba_1"')
all_farmers['wc_info__WCOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_info__WCOrg_1"')
all_farmers['OG_Info_reserve__OGOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OG_Info_reserve__OGOrg_1"')


## Preparing imported farmer dataframes

**Preparing farmer dfs to concat them to one big df**

1. Reduce all farmer df to the necessary ~20 columns 
2. Set all column names to lowercase
3. Renaming equal columns with equal column names
4. Checking and converting data types

In [141]:
# Reduce all farmer df to the necessary ~20 columns
all_farmers['OGInfo__OGMB_1_df'] = all_farmers['OGInfo__OGMB_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                   'Species', 'Surname', 'First_names', 'Sex', 'ID_Number',
                                                                   'Date_of_Birth', 'date_contracted', 'Date_dropped_out', 'Reason_dropped_out',
                                                                   'Address', 'OG_Organic_Status', 'OG_Organic_Status_from_Date', 'Fairtrade']]

all_farmers['OG_Info__OGOrg_1_df'] = all_farmers['OG_Info__OGOrg_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number', 'Date_of_Birth',
                                                                       'Date_contracted', 'Date_dropped_out', 'Reason_dropped_out', 
                                                                       'OG_Organic_Status', 'Address', 'OG_Organic_Status_from_Date',
                                                                       'Fairtrade','ward_nr']]

all_farmers['wc_info_bayoba_1_df'] = all_farmers['wc_info_bayoba_1_df'][['ID', 'WC_Nr', 'WC_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number', 
                                                                       'Date_contracted', 'Date_dropped_out',
                                                                       'Reason_dropped_out', 'Address', 'Organic_Status', 
                                                                       'Organic_Status_from_Date', 'Fairtrade', 'Date_of_Birth', 'ward_nr']]

all_farmers['wc_info__WCOrg_1_df'] = all_farmers['wc_info__WCOrg_1_df'][['ID', 'WC_Nr', 'WC_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number',
                                                                       'Date_contracted', 'Reason_dropped_out', 'Address', 'Organic_Status', 
                                                                       'Organic_Status_from_Date', 'Fairtrade', 'Date_dropped_out', 'ward_nr']]

all_farmers['OG_Info_reserve__OGOrg_1_df'] = all_farmers['OG_Info_reserve__OGOrg_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 
                                                                                       'Ward_Nr/Name', 'Species', 'Surname', 'First_names',
                                                                                       'Sex', 'ID_Number', 'Date_of_Birth', 'Date_contracted',
                                                                                       'Date_dropped_out', 'Reason_dropped_out', 'Address',
                                                                                       'OG_Organic_Status', 'OG_Organic_Status_from_Date', 
                                                                                       'Fairtrade', 'ward_nr']]

In [142]:
# Set all column names to lowercase

for df in all_farmers.values():
    #print(df)
    df.columns = map(str.lower, df.columns)

In [143]:
all_farmers['OGInfo__OGMB_1_df'].columns

Index(['id', 'og_nr', 'og_code', 'production_unit', 'area', 'ward_nr/name',
       'species', 'surname', 'first_names', 'sex', 'id_number',
       'date_of_birth', 'date_contracted', 'date_dropped_out',
       'reason_dropped_out', 'address', 'og_organic_status',
       'og_organic_status_from_date', 'fairtrade'],
      dtype='object')

In [144]:
# Renaming equal columns with equal column names
all_farmers['OGInfo__OGMB_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)
all_farmers['OG_Info__OGOrg_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)
all_farmers['OG_Info_reserve__OGOrg_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)

**4. Checking and converting data types**

.info() .dtype() type()

pd.to_datetime()

#### OGInfo__OGMB_1

In [145]:
all_farmers['OGInfo__OGMB_1_df'].info()

# looks good!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8166 entries, 0 to 8165
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        8166 non-null   int64         
 1   og_nr                     8166 non-null   int64         
 2   og_code                   8166 non-null   object        
 3   production_unit           8166 non-null   object        
 4   area                      8166 non-null   object        
 5   ward_nr/name              8166 non-null   int64         
 6   species                   8166 non-null   object        
 7   surname                   8166 non-null   object        
 8   first_names               8166 non-null   object        
 9   sex                       8166 non-null   object        
 10  id_number                 7073 non-null   object        
 11  date_of_birth             657 non-null    datetime64[ns]
 12  date_contracted     

In [146]:
type(all_farmers['OGInfo__OGMB_1_df']['date_contracted'][2])

pandas._libs.tslibs.timestamps.Timestamp

In [147]:
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'][0]

'12/07/2022'

#### OG_Info__OGOrg_1_df

In [148]:
all_farmers['OG_Info__OGOrg_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        1403 non-null   int64 
 1   og_nr                     1403 non-null   int64 
 2   og_code                   1403 non-null   object
 3   production_unit           1403 non-null   object
 4   area                      1403 non-null   object
 5   ward_nr/name              1403 non-null   object
 6   species                   1403 non-null   object
 7   surname                   1403 non-null   object
 8   first_names               1403 non-null   object
 9   sex                       1403 non-null   object
 10  id_number                 1310 non-null   object
 11  date_of_birth             930 non-null    object
 12  date_contracted           1403 non-null   object
 13  date_dropped_out          405 non-null    object
 14  reason_dropped_out      

In [149]:
all_farmers['OG_Info__OGOrg_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_of_birth'])

In [135]:
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

# Out of bounds nanosecond timestamp: 218-08-19 00:00:00 present at position 59

In [134]:
#all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'], errors = 'coerce')

  all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'], errors = 'coerce')


In [130]:
all_farmers['OG_Info__OGOrg_1_df'].loc[all_farmers['OG_Info__OGOrg_1_df']['date_contracted']=='218-08-19']

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,organic_status,address,organic_status_from_date,fairtrade,ward_nr


In [None]:
# Problem with date '19/08/19218' at 741
#all_farmers['OG_Info__OGOrg_1_df'].loc[all_farmers['OG_Info__OGOrg_1_df']['date_contracted']=='19/08/19218']

In [87]:
# Function to pad/fill to short dates like 5/3/18 to 05/03/2018

def str_date_pad(val):
    
    if type(val) is str and len(val) < 10: 

        date_splitted = val.split('/')
        
        if len(date_splitted[1]) < 2 : 
            date_splitted[1] = '0' + date_splitted[1]

        if (len(date_splitted[2]) < 4) and (int(date_splitted[2]) < 24) : 
            date_splitted[2] = '20' + date_splitted[2]
        if len(date_splitted[2]) < 4 and int(date_splitted[2]) > 23 : 
            date_splitted[2] = '19' + date_splitted[2]
    
        val = '/'.join(date_splitted)
        #print(val)
        return val
    
    else:
        return val


In [152]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[741]['date_contracted']

'19/08/19218'

In [151]:
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = all_farmers['OG_Info__OGOrg_1_df']['date_contracted'].apply(str_date_pad)

In [90]:
# replace '19/08/19218' by '19/08/2018' 
#all_farmers['OG_Info__OGOrg_1_df'].iloc[741]['date_contracted'].replace('19/08/19218', '19/08/2018')

#all_farmers['OG_Info__OGOrg_1_df'].iloc[742]['date_contracted']

In [91]:
#all_farmers['OG_Info__OGOrg_1_df'].iloc[744]['date_contracted']

In [92]:
# Ready to convert str to datetime
#all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

In [138]:
all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'])

  all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'])


In [139]:
all_farmers['OG_Info__OGOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        1403 non-null   int64         
 1   og_nr                     1403 non-null   int64         
 2   og_code                   1403 non-null   object        
 3   production_unit           1403 non-null   object        
 4   area                      1403 non-null   object        
 5   ward_nr/name              1403 non-null   object        
 6   species                   1403 non-null   object        
 7   surname                   1403 non-null   object        
 8   first_names               1403 non-null   object        
 9   sex                       1403 non-null   object        
 10  id_number                 1310 non-null   object        
 11  date_of_birth             930 non-null    datetime64[ns]
 12  date_contracted     

#### wc_info_bayoba_1_df

In [95]:
all_farmers['wc_info_bayoba_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10952 entries, 0 to 10951
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        10952 non-null  int64  
 1   wc_nr                     10952 non-null  int64  
 2   wc_code                   10952 non-null  object 
 3   production_unit           10952 non-null  object 
 4   area                      10952 non-null  object 
 5   ward_nr/name              10535 non-null  object 
 6   species                   10952 non-null  object 
 7   surname                   10951 non-null  object 
 8   first_names               10949 non-null  object 
 9   sex                       10952 non-null  object 
 10  id_number                 10248 non-null  object 
 11  date_contracted           10951 non-null  object 
 12  date_dropped_out          1924 non-null   object 
 13  reason_dropped_out        1054 non-null   object 
 14  addres

In [96]:
all_farmers['wc_info_bayoba_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_of_birth'])

In [97]:
all_farmers['wc_info_bayoba_1_df']['date_contracted'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_contracted'])

In [98]:
all_farmers['wc_info_bayoba_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_dropped_out'])

In [99]:
all_farmers['wc_info_bayoba_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10952 entries, 0 to 10951
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        10952 non-null  int64         
 1   wc_nr                     10952 non-null  int64         
 2   wc_code                   10952 non-null  object        
 3   production_unit           10952 non-null  object        
 4   area                      10952 non-null  object        
 5   ward_nr/name              10535 non-null  object        
 6   species                   10952 non-null  object        
 7   surname                   10951 non-null  object        
 8   first_names               10949 non-null  object        
 9   sex                       10952 non-null  object        
 10  id_number                 10248 non-null  object        
 11  date_contracted           10951 non-null  datetime64[ns]
 12  date_dropped_out  

#### wc_info__WCOrg_1_df

In [100]:
all_farmers['wc_info__WCOrg_1_df'].info()

# has no date_of_birth column -> 19 columns
# need to convert date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        565 non-null    int64  
 1   wc_nr                     565 non-null    int64  
 2   wc_code                   565 non-null    object 
 3   production_unit           565 non-null    object 
 4   area                      565 non-null    object 
 5   ward_nr/name              563 non-null    object 
 6   species                   565 non-null    object 
 7   surname                   564 non-null    object 
 8   first_names               564 non-null    object 
 9   sex                       565 non-null    object 
 10  id_number                 472 non-null    object 
 11  date_contracted           558 non-null    object 
 12  reason_dropped_out        142 non-null    object 
 13  address                   530 non-null    object 
 14  organic_st

In [101]:
all_farmers['wc_info__WCOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_contracted'])

In [102]:
all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'])

  all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'])


In [103]:
all_farmers['wc_info__WCOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        565 non-null    int64         
 1   wc_nr                     565 non-null    int64         
 2   wc_code                   565 non-null    object        
 3   production_unit           565 non-null    object        
 4   area                      565 non-null    object        
 5   ward_nr/name              563 non-null    object        
 6   species                   565 non-null    object        
 7   surname                   564 non-null    object        
 8   first_names               564 non-null    object        
 9   sex                       565 non-null    object        
 10  id_number                 472 non-null    object        
 11  date_contracted           558 non-null    datetime64[ns]
 12  reason_dropped_out    

#### OG_Info_reserve__OGOrg_1_df

In [104]:
all_farmers['OG_Info_reserve__OGOrg_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        795 non-null    int64  
 1   og_nr                     795 non-null    int64  
 2   og_code                   795 non-null    object 
 3   production_unit           795 non-null    object 
 4   area                      795 non-null    object 
 5   ward_nr/name              794 non-null    object 
 6   species                   795 non-null    object 
 7   surname                   795 non-null    object 
 8   first_names               795 non-null    object 
 9   sex                       795 non-null    object 
 10  id_number                 734 non-null    object 
 11  date_of_birth             267 non-null    object 
 12  date_contracted           795 non-null    object 
 13  date_dropped_out          255 non-null    object 
 14  reason_dro

In [105]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'])

In [106]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

  all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])


OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 218-08-19 00:00:00 present at position 59

In [None]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'])

In [None]:
all_farmers['OG_Info_reserve__OGOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        795 non-null    int64         
 1   og_nr                     795 non-null    int64         
 2   og_code                   795 non-null    object        
 3   production_unit           795 non-null    object        
 4   area                      795 non-null    object        
 5   ward_nr/name              794 non-null    object        
 6   species                   795 non-null    object        
 7   surname                   795 non-null    object        
 8   first_names               795 non-null    object        
 9   sex                       795 non-null    object        
 10  id_number                 734 non-null    object        
 11  date_of_birth             267 non-null    datetime64[ns]
 12  date_contracted       

## Collecting Date of Birth

In order to merge with 'wc_info__WCOrg_1_df'

In [None]:
all_birthdates = {}

all_birthdates['wc_insp_2013__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2013__WCOrg"')
all_birthdates['wc_insp_2014__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2014__WCOrg"')
all_birthdates['wc_insp_2015__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2015__WCOrg"')
all_birthdates['wc_insp_2016__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2016__WCOrg"')
all_birthdates['wc_insp_2017__WCOrg_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_insp_2017__WCOrg"')

In [None]:
# Reduce df to the necessary ~2 columns
columns_to_keep = ['WC_Nr', 'Date_of_birth']

In [None]:
# Keep only 2 columns, set all column names to lowercase and convert date_of_birth in all dfs to datetime

def clean_df(df):
    df = df.loc[:,columns_to_keep]
    
    df.columns = map(str.lower, df.columns)

    df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
    print(df.info())

    return df


In [None]:
all_birthdates['wc_insp_2013__WCOrg_df']

Unnamed: 0,ID,Area,Full_Name,Date_of_Inspection,WC_Nr,Internal_Inspector,Date_of_birth,Marital_status,Sex,DataInspectionID,...,Compliance_this_year_inspector,Recurrent_sanctions,"If_yes,_which",Conditions_or_explanation,Signature_farmer,Signature_Internal_Inspector,Compliance_this_year_approval_manager,Additional_conditions_or_sanctions,Signature_approval_manager,Certification_proof_2013
0,1,Hwange,Joshua Ngwenya,2013-07-14 00:00:00.000,1775.0,Ngoni,1955-10-05 00:00:00.000,married,male,92.0,...,approve without conditions,no,,,yes,yes,approve without conditions,,yes,
1,2,Hwange,Elis Mate,2013-07-15 00:00:00.000,2901.0,Ngoni,,married,female,190.0,...,approve without conditions,no,,,yes,yes,approve without conditions,,yes,
2,3,Hwange,Marvelous Moyo,2013-07-14 00:00:00.000,2902.0,Shelter,1995-09-16 00:00:00.000,single,male,191.0,...,approve without conditions,no,,,yes,yes,approve without conditions,,yes,
3,4,Hwange,Jessy Moyo,2013-07-15 00:00:00.000,2903.0,Norman,1933-01-01 00:00:00.000,widowed,female,192.0,...,approve without conditions,no,,,yes,yes,approve without conditions,,yes,
4,5,Hwange,Violet Dube,2013-07-15 00:00:00.000,2904.0,Admire,1932-09-03 00:00:00.000,married,female,193.0,...,approve with conditions,no,,sticking to the KAITE standards,yes,yes,approve with conditions,,yes,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,279,Chimanimani,Merita Sithole,2013-08-26 00:00:00.000,2472.0,Norman,,married,female,675.0,...,approve with conditions,no,,To ensure proper storage,yes,yes,approve with conditions,,yes,
277,280,Chipinge,Herbert Sithole,2013-08-29 00:00:00.000,1369.0,Shelter,1971-04-24 00:00:00.000,married,male,683.0,...,approve with conditions,no,,To learn more on organic standards,yes,yes,approve with conditions,,yes,
278,281,Chipinge,Esnath Mashava,2013-08-29 00:00:00.000,1371.0,Shelter,1973-03-23 00:00:00.000,widowed,female,685.0,...,approve without conditions,no,,,yes,yes,approve without conditions,,yes,
279,282,Chipinge,Shongaiyi Nduma,2013-08-29 00:00:00.000,1403.0,Shelter,1952-07-06 00:00:00.000,married,female,687.0,...,approve without conditions,no,,,yes,yes,approve without conditions,,yes,


#### Concating all birthdate dfs from dict

In [None]:
all_birthdates_df = pd.concat(all_birthdates.values())

In [None]:
all_birthdates_df

Unnamed: 0,ID,Area,Full_Name,Date_of_Inspection,WC_Nr,Internal_Inspector,Date_of_birth,Marital_status,Sex,DataInspectionID,...,Overall_comment,HH_under_5_years_female,HH_under_5_years_male,HH_5-17_years_female,HH_5-17_years_male,HH_18-59_years_female,HH_18-59_years_male,HH_over_60_years_female,HH_over_60_years_male,Months_food_lasts
0,1,Hwange,Joshua Ngwenya,2013-07-14 00:00:00.000,1775.0,Ngoni,1955-10-05 00:00:00.000,married,male,92.0,...,,,,,,,,,,
1,2,Hwange,Elis Mate,2013-07-15 00:00:00.000,2901.0,Ngoni,,married,female,190.0,...,,,,,,,,,,
2,3,Hwange,Marvelous Moyo,2013-07-14 00:00:00.000,2902.0,Shelter,1995-09-16 00:00:00.000,single,male,191.0,...,,,,,,,,,,
3,4,Hwange,Jessy Moyo,2013-07-15 00:00:00.000,2903.0,Norman,1933-01-01 00:00:00.000,widowed,female,192.0,...,,,,,,,,,,
4,5,Hwange,Violet Dube,2013-07-15 00:00:00.000,2904.0,Admire,1932-09-03 00:00:00.000,married,female,193.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,7,,,,3965.0,Norman,1974-08-05 00:00:00.000,,,,...,,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,3.0
6,8,,,,3699.0,Norman,1957-06-01 00:00:00.000,,,,...,,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
7,9,,,,4143.0,Tendai,1963-02-02 00:00:00.000,,,,...,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.0
8,10,,,,4144.0,Norman,1972-09-01 00:00:00.000,,,,...,,0.0,1.0,1.0,3.0,1.0,1.0,0.0,0.0,12.0


In [None]:
all_birthdates_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5473 entries, 0 to 9
Columns: 165 entries, ID to Months_food_lasts
dtypes: float64(22), int64(1), object(142)
memory usage: 6.9+ MB


#### Droping duplicates

In [None]:
# Counting duplicated Wc_Nr to drop them

all_birthdates_df["wc_nr"].duplicated().value_counts()

KeyError: 'wc_nr'

In [None]:
all_birthdates_df = all_birthdates_df.drop_duplicates(subset="wc_nr")

In [None]:
all_birthdates_df.isnull().sum()

wc_nr               1
date_of_birth    3454
dtype: int64

In [None]:
all_birthdates_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3745 entries, 0 to 9
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   wc_nr          3744 non-null   float64       
 1   date_of_birth  291 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1)
memory usage: 87.8 KB


In [None]:
all_birthdates_df["date_of_birth"].isnull().value_counts()

True     3454
False     291
Name: date_of_birth, dtype: int64

In [None]:
len(list(set(all_birthdates_df["wc_nr"]) & set(all_farmers['wc_info__WCOrg_1_df'][['wc_nr']].squeeze())))

453

#### Merge all_birthdates_df to wc_info__WCOrg_1_df

In [None]:
all_farmers['wc_info__WCOrg_1_df'] = pd.merge(all_farmers['wc_info__WCOrg_1_df'], all_birthdates_df, how="left", on="wc_nr")

In [None]:
all_farmers['wc_info__WCOrg_1_df']

Unnamed: 0,id,wc_nr,wc_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_contracted,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,date_dropped_out,ward_nr,date_of_birth
0,4942,3962,HWWC,Matabeleland North,Hwange,6 Gondwa,Devils Claw,Moyo,Pios,male,,2014-07-07,,Secretariat Sch Vic Falls,Org,,False,NaT,6.0,NaT
1,5608,3692,HWWC,Matabeleland North,Hwange,6 Gondwa,Devils Claw,Ncube,Mwayani,male,79-124391X79,2015-03-14,,"Mironga School, Vic Falls",Org,,False,NaT,6.0,NaT
2,10630,1773,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Ncube,Engel,female,08-161126N39,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-11-11,False,2017-11-21,3.0,NaT
3,1763,1772,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Dube,Samson,Male,79-031011X79,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-06-19,False,NaT,3.0,1957-10-02
4,1766,1775,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Ngwenya,Joshua,Male,79-011290L79,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-06-19,False,NaT,3.0,1955-10-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,11146,1057,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Zaranyika,Vaina,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT
561,11147,21040,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Chibawana,Violet,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT
562,11148,2555,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Hofisi,Christine,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,,False,2021-01-29,21.0,NaT
563,11149,2066,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu Kola,Mudada,Marjorie,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT


In [None]:
all_farmers['wc_info__WCOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 565 entries, 0 to 564
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        565 non-null    int64         
 1   wc_nr                     565 non-null    int64         
 2   wc_code                   565 non-null    object        
 3   production_unit           565 non-null    object        
 4   area                      565 non-null    object        
 5   ward_nr/name              563 non-null    object        
 6   species                   565 non-null    object        
 7   surname                   564 non-null    object        
 8   first_names               564 non-null    object        
 9   sex                       565 non-null    object        
 10  id_number                 472 non-null    object        
 11  date_contracted           558 non-null    datetime64[ns]
 12  reason_dropped_out    

## Final merge of farmer dfs

Verify shapes of dfs

In [None]:
for name, df in all_farmers.items():
    print(name)
    print(df.shape)

OGInfo__OGMB_1_df
(8166, 19)
OG_Info__OGOrg_1_df
(1403, 20)
wc_info_bayoba_1_df
(10952, 20)
wc_info__WCOrg_1_df
(565, 20)
OG_Info_reserve__OGOrg_1_df
(795, 20)


In [None]:
# ['ward_nr'] column still missing:

all_farmers['OGInfo__OGMB_1_df'][['ward_nr']] = all_farmers['OGInfo__OGMB_1_df'][['ward_nr/name']]
all_farmers['OGInfo__OGMB_1_df'].shape

(8166, 20)

In [None]:
concated_farmers = pd.concat(all_farmers)

In [None]:
concated_farmers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 21881 entries, ('OGInfo__OGMB_1_df', 0) to ('OG_Info_reserve__OGOrg_1_df', 794)
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        21881 non-null  int64         
 1   og_nr                     10364 non-null  float64       
 2   og_code                   10364 non-null  object        
 3   production_unit           21881 non-null  object        
 4   area                      21881 non-null  object        
 5   ward_nr/name              21461 non-null  object        
 6   species                   21881 non-null  object        
 7   surname                   21879 non-null  object        
 8   first_names               21877 non-null  object        
 9   sex                       21881 non-null  object        
 10  id_number                 19837 non-null  object        
 11  date_of_birth             

In [None]:
concated_farmers

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,0,1545,13137.0,MBOGR,Mashonaland Central,Mbire,15,Rosella,Nyamayaro,Phillip,male,...,NaT,2020-02-05,Own choice,"Mahuwe sch, Box 92, Mbire D",Dropout,2017-12-02 00:00:00.000,False,15.0,,
OGInfo__OGMB_1_df,1,12026,12116.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Funda,Jennie,female,...,2022-04-02,NaT,,"Kasuwo Primary School 346, Mbire",Mabagrown,2022-04-02 00:00:00.000,False,6.0,,
OGInfo__OGMB_1_df,2,9914,23949.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Chingwena,Gladys,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,3,9915,23950.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Marumbe,Tariro,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,4,9916,23951.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Dambudzo,Ketai,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OG_Info_reserve__OGOrg_1_df,790,3546,20023.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Myundia,Thandiwe,female,...,2018-08-19,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,791,3549,20026.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Nhete,Stella,female,...,2018-08-19,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,792,3550,20027.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Musiiwa,Admire,male,...,2018-08-19,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,793,3551,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Masodzi,Norah,female,...,2018-08-19,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,


# Clean and analyze concated_farmers

1. Drop duplicates
2. Count number of records by category
3. Find unique column

In [None]:
# Looking for Duplicates
concated_farmers.duplicated().value_counts()

# 73 duplicates

False    21807
True        74
dtype: int64

In [None]:
df_dup = concated_farmers[concated_farmers.duplicated()]
df_dup

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OG_Info_reserve__OGOrg_1_df,0,110,3463.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Munenge,Vaina,female,...,2018-08-19,NaT,,Nsenga Primary School Box 46 Binga,Org,2016-10-25,True,5.0,,
OG_Info_reserve__OGOrg_1_df,1,151,4084.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Muleya,Belitha,female,...,2018-08-19,NaT,,Nsenga Primary School Box 46 Binga,Org,2014-02-17,True,5.0,,
OG_Info_reserve__OGOrg_1_df,2,152,4085.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mutale,Mary,female,...,2018-08-19,NaT,,Nsenga Primary School Box 46 Binga,Org,2016-10-25,True,5.0,,
OG_Info_reserve__OGOrg_1_df,3,153,4088.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mudende,Nolia,female,...,2018-08-19,NaT,,Nsenga Primary School Box 46 Binga,Org,2014-02-17,True,5.0,,
OG_Info_reserve__OGOrg_1_df,7,2143,19485.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mutale,Opponate,female,...,2018-08-19,2018-11-15,Insufficient documentation,Dongamuse Primary School Box 83 Binga,Dropout,2018-08-10,False,5.0,,
OG_Info_reserve__OGOrg_1_df,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OG_Info_reserve__OGOrg_1_df,685,321,3685.0,BNOG,Matabeleland North,Binga,15 Kariangwe,Rosella,Ngwenya,Oliver,male,...,2018-08-19,NaT,,Mseme Sch Binga,Org,2015-02-26,True,15.0,,
OG_Info_reserve__OGOrg_1_df,697,101,3056.0,BNOG,Matabeleland North,Binga,11 Lubu,Rosella,Munkuli,Saliya,female,...,2018-08-19,NaT,,Lubu Primary School Box 5729 Binga,Org,2013-03-19,True,11.0,,
OG_Info_reserve__OGOrg_1_df,698,102,3059.0,BNOG,Matabeleland North,Binga,11 Lubu,Rosella,Munenge,Rosina 06-013380G06,female,...,2018-08-19,NaT,,Lubu Primary School Box 5729 Binga,Org,2013-03-19,True,11.0,,
OG_Info_reserve__OGOrg_1_df,714,1893,14034.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mudenda,Sethukile,female,...,2018-08-19,2018-11-14,Insufficient documentation,Dongamuse Pri school,Dropout,2018-08-31,False,5.0,,


In [None]:
ls_dup_og_nr= list(df_dup['og_nr'])

In [None]:
for ognr in ls_dup_og_nr:
    print(concated_farmers[concated_farmers['og_nr'] == ognr])

# duplicates trough OG_Info__OGOrg_1_df und OG_Info_reserve__OGOrg_1_df -> DROP duplicates in allfarmers

                                  id   og_nr og_code     production_unit  \
OG_Info__OGOrg_1_df         162  110  3463.0    BNOG  Matabeleland North   
OG_Info_reserve__OGOrg_1_df 0    110  3463.0    BNOG  Matabeleland North   

                                  area ward_nr/name  species  surname  \
OG_Info__OGOrg_1_df         162  Binga   5 Sinakoma  Rosella  Munenge   
OG_Info_reserve__OGOrg_1_df 0    Binga   5 Sinakoma  Rosella  Munenge   

                                first_names     sex  ... date_contracted  \
OG_Info__OGOrg_1_df         162       Vaina  female  ...      2018-08-19   
OG_Info_reserve__OGOrg_1_df 0         Vaina  female  ...      2018-08-19   

                                date_dropped_out reason_dropped_out  \
OG_Info__OGOrg_1_df         162              NaT               None   
OG_Info_reserve__OGOrg_1_df 0                NaT               None   

                                                            address  \
OG_Info__OGOrg_1_df         162  Nsen

In [None]:
concated_farmers.drop_duplicates(subset=None, keep='first', inplace=True)

In [None]:
concated_farmers.duplicated().value_counts()

False    21807
dtype: int64

In [None]:
concated_farmers[concated_farmers['og_nr'] == 3463.0]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OG_Info__OGOrg_1_df,162,110,3463.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Munenge,Vaina,female,...,2018-08-19,NaT,,Nsenga Primary School Box 46 Binga,Org,2016-10-25,True,5.0,,


In [None]:
concated_farmers[concated_farmers['wc_nr'].duplicated() & ~concated_farmers['wc_nr'].isna()]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code


In [None]:
df = concated_farmers[~concated_farmers['wc_nr'].isna()][['wc_nr']]
df.squeeze().is_unique

True

In [None]:
double_ognr = concated_farmers[concated_farmers['og_nr'].duplicated() & ~concated_farmers['og_nr'].isna()]
double_ognr

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OG_Info__OGOrg_1_df,0,3794,20094.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Muleya,Thandiwe,female,...,2018-08-19,NaT,,"Chininga Primary School 65 Binga, Sinakoma",New,2022-07-12,True,5.0,,
OG_Info__OGOrg_1_df,1,3795,20095.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mutale,Nomai,female,...,2018-08-19,NaT,,"Chininga Primary School 65 Binga, Sinakoma",New,2022-07-12,True,5.0,,
OG_Info__OGOrg_1_df,2,3796,20096.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mumpande,Janet,female,...,2018-08-19,NaT,,"Chininga Primary School 65 Binga, Sinakoma",New,2022-07-12,True,5.0,,
OG_Info__OGOrg_1_df,3,3797,20097.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Muyela,Violet,female,...,2018-08-19,NaT,,"Chininga Primary School 65 Binga, Sinakoma",New,2022-07-12,True,5.0,,
OG_Info__OGOrg_1_df,4,3798,20098.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Muleya,Elina,female,...,2018-08-19,NaT,,"Chininga Primary School 65 Binga, Sinakoma",New,2022-07-12,True,5.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OG_Info_reserve__OGOrg_1_df,790,3546,20023.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Myundia,Thandiwe,female,...,2018-08-19,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,791,3549,20026.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Nhete,Stella,female,...,2018-08-19,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,792,3550,20027.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Musiiwa,Admire,male,...,2018-08-19,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,793,3551,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Masodzi,Norah,female,...,2018-08-19,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,


In [None]:
ls_dou_og_nr= list(double_ognr ['og_nr'])
ls_dou_og_nr

df_dou_og_nr = pd.DataFrame(columns = concated_farmers.columns)
print(df_dou_og_nr)



Empty DataFrame
Columns: [id, og_nr, og_code, production_unit, area, ward_nr/name, species, surname, first_names, sex, id_number, date_of_birth, date_contracted, date_dropped_out, reason_dropped_out, address, organic_status, organic_status_from_date, fairtrade, ward_nr, wc_nr, wc_code]
Index: []

[0 rows x 22 columns]


In [None]:
for ognr in ls_dou_og_nr:
    df_row_set = concated_farmers[concated_farmers['og_nr'] == ognr]
    df_row_set.shape[0]
    #print(type(df_row_set))
    
    #for row in df_row_set:
      #  print(row)
        #dc_row = row.to_dic()
        #df_dou_og_nr = df_dou_og_nr.append(dc_row, ignore_index = True)

In [None]:
concated_farmers['organic_status'].value_counts()

Org             9909
Dropout         7678
New             2724
Mabagrown       1113
Reinstated       190
Uncertified      149
dropout           33
Under review      10
Name: organic_status, dtype: int64

In [None]:
concated_farmers[concated_farmers['id'].isna()]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code


In [None]:
df = concated_farmers[~concated_farmers['id_number'].isna()][['id_number']]
df.squeeze().is_unique

False

In [None]:
concated_farmers['id_number'].value_counts()

71-124108M71    3
79-067141T06    3
06-009146E06    3
06-033782H06    3
06-020422L06    3
               ..
71-092138Y71    1
71-083547B71    1
71-171964E71    1
71-022729Z71    1
79-066231E06    1
Name: id_number, Length: 18626, dtype: int64

In [None]:
concated_farmers['area'] = concated_farmers['area'].str.title()

In [None]:
concated_farmers['area'] = concated_farmers['area'].replace('Mt Darwin','Mt Darwin')

In [None]:
concated_farmers.shape

(21807, 22)

In [None]:
concated_farmers['area'].value_counts()

Mbire                   8128
Binga                   2608
Rushinga                1750
Mudzi                   1590
Chimanimani             1369
Buhera                  1152
Chipinge                 977
Mt Darwin                869
Makoni                   535
Beitbridge               529
Kwekwe                   270
Hwange                   260
Uzumbamarambapfungwe     257
Chimanimani Rusitu       196
Mwenezi                  173
Karoi                    163
Triangle                 163
Mutoko                   126
Mberengwa                115
Mt Darwin                 99
Chimanimani Tilbury       95
Chivi                     94
Mushumbi                  89
Beitbridge                84
Matobo                    44
Makoni Mt Shalom          43
Hwange                    29
Name: area, dtype: int64

In [None]:
concated_farmers['area'].unique()
# Correct 'chipinge' first upper

array(['Mbire', 'Triangle', 'Karoi', 'Mushumbi', 'Binga', 'Makoni',
       'Chimanimani', 'Chipinge', 'Chivi', 'Buhera', 'Mt Darwin',
       'Beitbridge', 'Mt Darwin ', 'Mudzi', 'Beitbridge ', 'Rushinga',
       'Kwekwe', 'Mwenezi', 'Uzumbamarambapfungwe', 'Mutoko', 'Mberengwa',
       'Matobo', 'Hwange', 'Hwange ', 'Chimanimani Rusitu',
       'Chimanimani Tilbury', 'Makoni Mt Shalom'], dtype=object)

In [None]:
concated_farmers['production_unit'] = concated_farmers['production_unit'].replace('Mashonaland Centra','Mashonaland Central')

In [None]:
concated_farmers['production_unit'].value_counts()

Mashonaland Central        8216
Northern Eastern region    4691
South Eastern region       3498
Matabeleland North         2898
Southern region             995
Manicaland                  869
Midlands                    270
Masvingo                    163
Mashonaland West            163
Matabeleland South           44
Name: production_unit, dtype: int64

In [None]:
concated_farmers['ward_nr'].unique()

# correct to int

array([15.,  6., 13.,  5.,  7.,  2.,  9., 17., 16.,  3.,  4., 27., 12.,
       11., 10.,  8., 14., 19., 18., 23., 21.,  1., 25., 20., 29., 33.,
       24., 28., 30., 31., 34., 32., nan, 22.])

In [None]:
# 'id_number has not unique and has 3254 duplicates'
#concated_farmers['id_number'].is_unique
concated_farmers['id_number'].duplicated().value_counts()

False    18627
True      3180
Name: id_number, dtype: int64

In [None]:
concated_farmers['species'] = concated_farmers['species'].str.lower()
concated_farmers['sex'] = concated_farmers['sex'].str.lower()

concated_farmers['reason_dropped_out'] = concated_farmers['reason_dropped_out'].str.lower()
concated_farmers['organic_status'] = concated_farmers['organic_status'].str.lower()
concated_farmers['organic_status'] = concated_farmers['organic_status'].str.lower()

In [None]:
concated_farmers['organic_status'].unique()

array(['dropout', 'mabagrown', 'new', 'org', 'reinstated', 'uncertified',
       None, 'under review'], dtype=object)

In [None]:
concated_farmers['species'].unique()

# correct to types rosella, chili, paprika,  

# '5 sinakoma', 'robert' correct to None

array(['rosella', 'roseela', 'rosell', 'rosella, chilli, paprika',
       'ronald', '5 sinakoma', 'paprika', 'rosella, strophantus',
       'chilli, paprika', 'paprika, chilli', 'chilli,paprika', 'chillie',
       'chilli', 'chili', 'chillie/paprica ', 'baobab', 'marula',
       'baobab + ximenia', 'trichillia', 'baobab+kms+ximenia',
       'kalahari melon seed', 'ximenia caffra', 'kms + ximenia',
       'ximenia americana', 'devils claw', 'wild gotu kola'], dtype=object)

In [None]:
concated_farmers['species'] = concated_farmers['species'].replace({'5 sinakoma': 'None', 'ronald':  'None','roseela': 'rosella', 'rosell': 'rosella', 
                                                                   'paprika, chilli': 'chilli, paprika', 'chili': 'chilli',
                                                                    'chilli,paprika':'chilli, paprika', 'chillie': 'chilli', 
                                                                    'chillie/paprica ': 'chilli, paprika',
                                                                    'baobab + ximenia': 'baobab, ximenia', 'baobab+kms+ximenia': 'baobab, kms, ximenia',
                                                                    'kalahari melon seed': 'kalahari, melon, seed', 'kms + ximenia': 'kms, ximenia'
                                                                    })

In [None]:
concated_farmers['wc_nr']

OGInfo__OGMB_1_df            0     NaN
                             1     NaN
                             2     NaN
                             3     NaN
                             4     NaN
                                    ..
OG_Info_reserve__OGOrg_1_df  790   NaN
                             791   NaN
                             792   NaN
                             793   NaN
                             794   NaN
Name: wc_nr, Length: 21807, dtype: float64

In [None]:
# concated_farmers['wc_nr'].astype('int32').dtypes

In [None]:
concated_farmers['wc_nr'].duplicated().value_counts()

False    11518
True     10289
Name: wc_nr, dtype: int64

In [None]:
concated_farmers['ward_nr'].duplicated().value_counts()

True     21773
False       34
Name: ward_nr, dtype: int64

In [None]:
concated_farmers['ward_nr'].is_unique

False

In [None]:
concated_farmers['ward_nr'].unique()

array([15.,  6., 13.,  5.,  7.,  2.,  9., 17., 16.,  3.,  4., 27., 12.,
       11., 10.,  8., 14., 19., 18., 23., 21.,  1., 25., 20., 29., 33.,
       24., 28., 30., 31., 34., 32., nan, 22.])

In [None]:
# concated_farmers['ward_nr'] = concated_farmers['ward_nr'].astype(int)

In [None]:
concated_farmers.iloc[0]

id                                                 1545
og_nr                                           13137.0
og_code                                           MBOGR
production_unit                     Mashonaland Central
area                                              Mbire
ward_nr/name                                         15
species                                         rosella
surname                                       Nyamayaro
first_names                                     Phillip
sex                                                male
id_number                                  47-158929E47
date_of_birth                                       NaT
date_contracted                                     NaT
date_dropped_out                    2020-02-05 00:00:00
reason_dropped_out                           own choice
address                     Mahuwe sch, Box 92, Mbire D
organic_status                                  dropout
organic_status_from_date        2017-12-02 00:00

In [None]:
df = concated_farmers[~concated_farmers['og_nr'].isna()][['og_nr']]
df.squeeze().is_unique

False

In [None]:
concated_farmers[concated_farmers['og_nr'] == 20028.0]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,5506,5614,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"rosella, chilli, paprika",Masodzi,Norah,female,...,2019-02-01,2020-05-07,area closed,"ARDA Mushumbi, Box 210, Mbire",dropout,2019-02-01 00:00:00.000,False,3.0,,
OG_Info__OGOrg_1_df,981,3551,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"rosella, chilli, paprika",Masodzi,Norah,female,...,2018-08-19,2020-01-01,,"ARDA Mushumbi, Box 210, Mbire",dropout,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,793,3551,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"rosella, chilli, paprika",Masodzi,Norah,female,...,2018-08-19,NaT,,"ARDA Mushumbi, Box 210, Mbire",new,2019-02-01,False,3.0,,


In [None]:
concated_farmers = concated_farmers.sort_values('date_contracted')

In [None]:
concated_farmers.drop_duplicates('og_nr',keep='last', inplace = 'True')

In [None]:
concated_farmers[concated_farmers['og_nr'] == 20028.0]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,5506,5614,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"rosella, chilli, paprika",Masodzi,Norah,female,...,2019-02-01,2020-05-07,area closed,"ARDA Mushumbi, Box 210, Mbire",dropout,2019-02-01 00:00:00.000,False,3.0,,


In [None]:
concated_farmers = concated_farmers.sort_index()

In [None]:
concated_farmers[['og_nr']].squeeze().is_unique

True

In [None]:
# concated_farmers has 9094 rows × 22 columns

#### Upload the prepared farmer data

In [None]:
#table_name = 'all_farmers'

#if engine!=None:
 #   try:
  #      concated_farmers.to_sql(name=table_name, # Name of SQL table
   #                             con=engine, # Engine or connection
    #                            if_exists='replace', # Drop the table before inserting new values 
     #                           schema=schema, # Use schmea that was defined earlier
      #                          index=False, # Write DataFrame index as a column
       #                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
        #                        method='multi') # Pass multiple values in a single INSERT clause
#        print(f"The {table_name} table was imported successfully.")
    # Error handling
 #   except (Exception, psycopg2.DatabaseError) as error:
  #      print(error)
   #     engine = None

TRYING
The all_farmers table was imported successfully.
