In [898]:
import pandas as pd
import sql_functions as sf

In [899]:
schema = 'organic_africa' # UPDATE 'TABLE_SCHEMA' based on schema used in class 
engine = sf.get_engine()

**Importing tables of farmer data as Dataframes**

5 Dfs stored in one Dictionary 

In [900]:
all_farmers = {}

all_farmers['OGInfo__OGMB_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OGInfo__OGMB_1"')
all_farmers['OG_Info__OGOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OG_Info__OGOrg_1"')
all_farmers['wc_info_bayoba_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_info_bayoba_1"')
all_farmers['wc_info__WCOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_info__WCOrg_1"')
all_farmers['OG_Info_reserve__OGOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OG_Info_reserve__OGOrg_1"')


## Preparing imported farmer dataframes

**Preparing farmer dfs to concat them to one big df**

1. Reduce all farmer df to the necessary ~20 columns 
2. Set all column names to lowercase
3. Renaming equal columns with equal column names
4. Checking and converting data types

In [901]:
# Reduce all farmer df to the necessary ~20 columns
all_farmers['OGInfo__OGMB_1_df'] = all_farmers['OGInfo__OGMB_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                   'Species', 'Surname', 'First_names', 'Sex', 'ID_Number',
                                                                   'Date_of_Birth', 'date_contracted', 'Date_dropped_out', 'Reason_dropped_out',
                                                                   'Address', 'OG_Organic_Status', 'OG_Organic_Status_from_Date', 'Fairtrade']]

all_farmers['OG_Info__OGOrg_1_df'] = all_farmers['OG_Info__OGOrg_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number', 'Date_of_Birth',
                                                                       'Date_contracted', 'Date_dropped_out', 'Reason_dropped_out', 
                                                                       'OG_Organic_Status', 'Address', 'OG_Organic_Status_from_Date',
                                                                       'Fairtrade','ward_nr']]

all_farmers['wc_info_bayoba_1_df'] = all_farmers['wc_info_bayoba_1_df'][['ID', 'WC_Nr', 'WC_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number', 
                                                                       'Date_contracted', 'Date_dropped_out',
                                                                       'Reason_dropped_out', 'Address', 'Organic_Status', 
                                                                       'Organic_Status_from_Date', 'Fairtrade', 'Date_of_Birth', 'ward_nr']]

all_farmers['wc_info__WCOrg_1_df'] = all_farmers['wc_info__WCOrg_1_df'][['ID', 'WC_Nr', 'WC_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number',
                                                                       'Date_contracted', 'Reason_dropped_out', 'Address', 'Organic_Status', 
                                                                       'Organic_Status_from_Date', 'Fairtrade', 'Date_dropped_out', 'ward_nr']]

all_farmers['OG_Info_reserve__OGOrg_1_df'] = all_farmers['OG_Info_reserve__OGOrg_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 
                                                                                       'Ward_Nr/Name', 'Species', 'Surname', 'First_names',
                                                                                       'Sex', 'ID_Number', 'Date_of_Birth', 'Date_contracted',
                                                                                       'Date_dropped_out', 'Reason_dropped_out', 'Address',
                                                                                       'OG_Organic_Status', 'OG_Organic_Status_from_Date', 
                                                                                       'Fairtrade', 'ward_nr']]

In [902]:
# Set all column names to lowercase

for df in all_farmers.values():
    #print(df)
    df.columns = map(str.lower, df.columns)

In [903]:
all_farmers['OGInfo__OGMB_1_df'].columns

Index(['id', 'og_nr', 'og_code', 'production_unit', 'area', 'ward_nr/name',
       'species', 'surname', 'first_names', 'sex', 'id_number',
       'date_of_birth', 'date_contracted', 'date_dropped_out',
       'reason_dropped_out', 'address', 'og_organic_status',
       'og_organic_status_from_date', 'fairtrade'],
      dtype='object')

In [904]:
# Renaming equal columns with equal column names
all_farmers['OGInfo__OGMB_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)
all_farmers['OG_Info__OGOrg_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)
all_farmers['OG_Info_reserve__OGOrg_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)

**4. Checking and converting data types**

.info() .dtype() type()

pd.to_datetime()

#### OGInfo__OGMB_1

In [905]:
all_farmers['OGInfo__OGMB_1_df'].info()

# looks good!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8166 entries, 0 to 8165
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        8166 non-null   int64         
 1   og_nr                     8166 non-null   int64         
 2   og_code                   8166 non-null   object        
 3   production_unit           8166 non-null   object        
 4   area                      8166 non-null   object        
 5   ward_nr/name              8166 non-null   int64         
 6   species                   8166 non-null   object        
 7   surname                   8166 non-null   object        
 8   first_names               8166 non-null   object        
 9   sex                       8166 non-null   object        
 10  id_number                 7073 non-null   object        
 11  date_of_birth             657 non-null    datetime64[ns]
 12  date_contracted     

In [906]:
type(all_farmers['OGInfo__OGMB_1_df']['date_contracted'][2])

pandas._libs.tslibs.timestamps.Timestamp

In [907]:
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'][0]

'12/07/2022'

#### OG_Info__OGOrg_1_df

In [908]:
all_farmers['OG_Info__OGOrg_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        1403 non-null   int64 
 1   og_nr                     1403 non-null   int64 
 2   og_code                   1403 non-null   object
 3   production_unit           1403 non-null   object
 4   area                      1403 non-null   object
 5   ward_nr/name              1403 non-null   object
 6   species                   1403 non-null   object
 7   surname                   1403 non-null   object
 8   first_names               1403 non-null   object
 9   sex                       1403 non-null   object
 10  id_number                 1310 non-null   object
 11  date_of_birth             930 non-null    object
 12  date_contracted           1403 non-null   object
 13  date_dropped_out          405 non-null    object
 14  reason_dropped_out      

In [909]:
# converted date_of_birth
all_farmers['OG_Info__OGOrg_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_of_birth'])

Converting 'date_contracted' to datetime

In [910]:
# all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

# Out of bounds nanosecond timestamp: 218-08-19 00:00:00 present at position 59

# all_farmers['OG_Info__OGOrg_1_df'].iloc[59,]
# all_farmers['OG_Info__OGOrg_1_df'].loc[all_farmers['OG_Info__OGOrg_1_df']['date_contracted']=='218-08-19']

# Did not found 218-08-19 at index 59 but the seems to get corrected in the next steps

In [911]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[59]['date_contracted']

'31/8/18'

In [912]:
all_farmers['OG_Info__OGOrg_1_df'].replace('218-08-19', '2018-08-19', inplace=True)


In [913]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[50:60,12]

50       31/8/18
51       31/8/18
52    03/12/2012
53    03/12/2012
54       31/8/18
55       31/8/18
56    06/12/2012
57      24/02/23
58    06/12/2012
59       31/8/18
Name: date_contracted, dtype: object

Solving date_contracted errors with padding strings

In [914]:
# Function to pad/fill to short dates like 5/3/18 to 05/03/2018

def str_date_pad(val):
    
    if type(val) is str and len(val) < 10: 

        date_splitted = val.split('/')
        
        if len(date_splitted[1]) < 2 : 
            date_splitted[1] = '0' + date_splitted[1]

        if (len(date_splitted[2]) < 4) and (int(date_splitted[2]) < 24) : 
            date_splitted[2] = '20' + date_splitted[2]
        if len(date_splitted[2]) < 4 and int(date_splitted[2]) > 23 : 
            date_splitted[2] = '19' + date_splitted[2]
    
        val = '/'.join(date_splitted)
        #print(val)
        return val
    
    else:
        return val


In [915]:
# Apply function to df
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = all_farmers['OG_Info__OGOrg_1_df']['date_contracted'].apply(str_date_pad)

In [916]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[50:60,12]

50    31/08/2018
51    31/08/2018
52    03/12/2012
53    03/12/2012
54    31/08/2018
55    31/08/2018
56    06/12/2012
57    24/02/2023
58    06/12/2012
59    31/08/2018
Name: date_contracted, dtype: object

In [917]:
# Now after padding convert str to datetime gives still problem !
# all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

In [918]:
# Problem with date '19/08/19218' at 54 BUT its in position/row_index 741
all_farmers['OG_Info__OGOrg_1_df'].loc[all_farmers['OG_Info__OGOrg_1_df']['date_contracted']=='19/08/19218']

# cause year '218' padded to '19218'

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,organic_status,address,organic_status_from_date,fairtrade,ward_nr
741,2140,19482,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mpande,Josephine,female,06-010710E06,1968-10-01,19/08/19218,,,Org,Dongamuse Primary School Box 83 Binga,2018-08-19,True,5


In [919]:
# Solve through replacing '19/08/19218' by '19/08/2018' and dropping '01/00/2022'
all_farmers['OG_Info__OGOrg_1_df'].replace('19/08/19218', '19/08/2018', inplace=True)
all_farmers['OG_Info__OGOrg_1_df'].replace('01/00/2022', None, inplace=True)

In [920]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[741]['date_contracted']

'19/08/2018'

In [921]:
# Ready to convert str to datetime
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

  all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])


In [922]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[65]['date_contracted']

Timestamp('2012-06-12 00:00:00')

In [923]:
all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'])

  all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'])


In [924]:
all_farmers['OG_Info__OGOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        1403 non-null   int64         
 1   og_nr                     1403 non-null   int64         
 2   og_code                   1403 non-null   object        
 3   production_unit           1403 non-null   object        
 4   area                      1403 non-null   object        
 5   ward_nr/name              1403 non-null   object        
 6   species                   1403 non-null   object        
 7   surname                   1403 non-null   object        
 8   first_names               1403 non-null   object        
 9   sex                       1403 non-null   object        
 10  id_number                 1310 non-null   object        
 11  date_of_birth             930 non-null    datetime64[ns]
 12  date_contracted     

#### wc_info_bayoba_1_df

In [925]:
all_farmers['wc_info_bayoba_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10952 entries, 0 to 10951
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        10952 non-null  int64  
 1   wc_nr                     10952 non-null  int64  
 2   wc_code                   10952 non-null  object 
 3   production_unit           10952 non-null  object 
 4   area                      10952 non-null  object 
 5   ward_nr/name              10535 non-null  object 
 6   species                   10952 non-null  object 
 7   surname                   10951 non-null  object 
 8   first_names               10949 non-null  object 
 9   sex                       10952 non-null  object 
 10  id_number                 10248 non-null  object 
 11  date_contracted           10951 non-null  object 
 12  date_dropped_out          1924 non-null   object 
 13  reason_dropped_out        1054 non-null   object 
 14  addres

In [926]:
all_farmers['wc_info_bayoba_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_of_birth'])

In [927]:
all_farmers['wc_info_bayoba_1_df']['date_contracted'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_contracted'])

In [928]:
all_farmers['wc_info_bayoba_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_dropped_out'])

In [929]:
all_farmers['wc_info_bayoba_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10952 entries, 0 to 10951
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        10952 non-null  int64         
 1   wc_nr                     10952 non-null  int64         
 2   wc_code                   10952 non-null  object        
 3   production_unit           10952 non-null  object        
 4   area                      10952 non-null  object        
 5   ward_nr/name              10535 non-null  object        
 6   species                   10952 non-null  object        
 7   surname                   10951 non-null  object        
 8   first_names               10949 non-null  object        
 9   sex                       10952 non-null  object        
 10  id_number                 10248 non-null  object        
 11  date_contracted           10951 non-null  datetime64[ns]
 12  date_dropped_out  

#### wc_info__WCOrg_1_df

In [930]:
all_farmers['wc_info__WCOrg_1_df'].info()

# has no date_of_birth column -> 19 columns
# need to convert date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        565 non-null    int64  
 1   wc_nr                     565 non-null    int64  
 2   wc_code                   565 non-null    object 
 3   production_unit           565 non-null    object 
 4   area                      565 non-null    object 
 5   ward_nr/name              563 non-null    object 
 6   species                   565 non-null    object 
 7   surname                   564 non-null    object 
 8   first_names               564 non-null    object 
 9   sex                       565 non-null    object 
 10  id_number                 472 non-null    object 
 11  date_contracted           558 non-null    object 
 12  reason_dropped_out        142 non-null    object 
 13  address                   530 non-null    object 
 14  organic_st

In [931]:
all_farmers['wc_info__WCOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_contracted'])

In [932]:
all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'])

  all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'])


In [933]:
all_farmers['wc_info__WCOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        565 non-null    int64         
 1   wc_nr                     565 non-null    int64         
 2   wc_code                   565 non-null    object        
 3   production_unit           565 non-null    object        
 4   area                      565 non-null    object        
 5   ward_nr/name              563 non-null    object        
 6   species                   565 non-null    object        
 7   surname                   564 non-null    object        
 8   first_names               564 non-null    object        
 9   sex                       565 non-null    object        
 10  id_number                 472 non-null    object        
 11  date_contracted           558 non-null    datetime64[ns]
 12  reason_dropped_out    

#### OG_Info_reserve__OGOrg_1_df

In [934]:
all_farmers['OG_Info_reserve__OGOrg_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        795 non-null    int64  
 1   og_nr                     795 non-null    int64  
 2   og_code                   795 non-null    object 
 3   production_unit           795 non-null    object 
 4   area                      795 non-null    object 
 5   ward_nr/name              794 non-null    object 
 6   species                   795 non-null    object 
 7   surname                   795 non-null    object 
 8   first_names               795 non-null    object 
 9   sex                       795 non-null    object 
 10  id_number                 734 non-null    object 
 11  date_of_birth             267 non-null    object 
 12  date_contracted           795 non-null    object 
 13  date_dropped_out          255 non-null    object 
 14  reason_dro

In [935]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'])

In [936]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

In [937]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'])

In [938]:
all_farmers['OG_Info_reserve__OGOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        795 non-null    int64         
 1   og_nr                     795 non-null    int64         
 2   og_code                   795 non-null    object        
 3   production_unit           795 non-null    object        
 4   area                      795 non-null    object        
 5   ward_nr/name              794 non-null    object        
 6   species                   795 non-null    object        
 7   surname                   795 non-null    object        
 8   first_names               795 non-null    object        
 9   sex                       795 non-null    object        
 10  id_number                 734 non-null    object        
 11  date_of_birth             267 non-null    datetime64[ns]
 12  date_contracted       

### Adding Birthdates to wc_info_WCOrg_1_df

In [939]:
all_birthdates_df = sf.get_dataframe(f'SELECT * FROM {schema}."all_birthdates"')

In [940]:
len(list(set(all_birthdates_df["wc_nr"]) & set(all_farmers['wc_info__WCOrg_1_df'][['wc_nr']].squeeze())))

453

#### Merge all_birthdates_df to wc_info__WCOrg_1_df

In [941]:
all_farmers['wc_info__WCOrg_1_df'] = pd.merge(all_farmers['wc_info__WCOrg_1_df'], all_birthdates_df, how="left", on="wc_nr")

In [942]:
all_farmers['wc_info__WCOrg_1_df']

Unnamed: 0,id,wc_nr,wc_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_contracted,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,date_dropped_out,ward_nr,date_of_birth
0,4942,3962,HWWC,Matabeleland North,Hwange,6 Gondwa,Devils Claw,Moyo,Pios,male,,2014-07-07,,Secretariat Sch Vic Falls,Org,,False,NaT,6.0,NaT
1,5608,3692,HWWC,Matabeleland North,Hwange,6 Gondwa,Devils Claw,Ncube,Mwayani,male,79-124391X79,2015-03-14,,"Mironga School, Vic Falls",Org,,False,NaT,6.0,NaT
2,10630,1773,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Ncube,Engel,female,08-161126N39,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-11-11,False,2017-11-21,3.0,NaT
3,1763,1772,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Dube,Samson,Male,79-031011X79,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-06-19,False,NaT,3.0,1957-10-02
4,1766,1775,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Ngwenya,Joshua,Male,79-011290L79,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-06-19,False,NaT,3.0,1955-10-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,11146,1057,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Zaranyika,Vaina,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT
561,11147,21040,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Chibawana,Violet,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT
562,11148,2555,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Hofisi,Christine,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,,False,2021-01-29,21.0,NaT
563,11149,2066,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu Kola,Mudada,Marjorie,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT


In [943]:
all_farmers['wc_info__WCOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 565 entries, 0 to 564
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        565 non-null    int64         
 1   wc_nr                     565 non-null    int64         
 2   wc_code                   565 non-null    object        
 3   production_unit           565 non-null    object        
 4   area                      565 non-null    object        
 5   ward_nr/name              563 non-null    object        
 6   species                   565 non-null    object        
 7   surname                   564 non-null    object        
 8   first_names               564 non-null    object        
 9   sex                       565 non-null    object        
 10  id_number                 472 non-null    object        
 11  date_contracted           558 non-null    datetime64[ns]
 12  reason_dropped_out    

## Final merge of farmer dfs to concated_farmers

Verify shapes of dfs

In [944]:
for name, df in all_farmers.items():
    print(name)
    print(df.shape)

OGInfo__OGMB_1_df
(8166, 19)
OG_Info__OGOrg_1_df
(1403, 20)
wc_info_bayoba_1_df
(10952, 20)
wc_info__WCOrg_1_df
(565, 20)
OG_Info_reserve__OGOrg_1_df
(795, 20)


In [945]:
# ['ward_nr'] column still missing:

all_farmers['OGInfo__OGMB_1_df'][['ward_nr']] = all_farmers['OGInfo__OGMB_1_df'][['ward_nr/name']]
all_farmers['OGInfo__OGMB_1_df'].shape

(8166, 20)

In [946]:
concated_farmers = pd.concat(all_farmers)

In [947]:
concated_farmers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 21881 entries, ('OGInfo__OGMB_1_df', 0) to ('OG_Info_reserve__OGOrg_1_df', 794)
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        21881 non-null  int64         
 1   og_nr                     10364 non-null  float64       
 2   og_code                   10364 non-null  object        
 3   production_unit           21881 non-null  object        
 4   area                      21881 non-null  object        
 5   ward_nr/name              21461 non-null  object        
 6   species                   21881 non-null  object        
 7   surname                   21879 non-null  object        
 8   first_names               21877 non-null  object        
 9   sex                       21881 non-null  object        
 10  id_number                 19837 non-null  object        
 11  date_of_birth             

In [948]:
concated_farmers

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,0,1545,13137.0,MBOGR,Mashonaland Central,Mbire,15,Rosella,Nyamayaro,Phillip,male,...,NaT,2020-02-05,Own choice,"Mahuwe sch, Box 92, Mbire D",Dropout,2017-12-02 00:00:00.000,False,15.0,,
OGInfo__OGMB_1_df,1,12026,12116.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Funda,Jennie,female,...,2022-04-02,NaT,,"Kasuwo Primary School 346, Mbire",Mabagrown,2022-04-02 00:00:00.000,False,6.0,,
OGInfo__OGMB_1_df,2,9914,23949.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Chingwena,Gladys,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,3,9915,23950.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Marumbe,Tariro,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,4,9916,23951.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Dambudzo,Ketai,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OG_Info_reserve__OGOrg_1_df,790,3546,20023.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Myundia,Thandiwe,female,...,2018-08-18,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,791,3549,20026.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Nhete,Stella,female,...,2018-08-18,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,792,3550,20027.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Musiiwa,Admire,male,...,2019-07-31,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,793,3551,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Masodzi,Norah,female,...,2019-07-31,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,


In [949]:
# 11517 wc_nr
concated_farmers[~concated_farmers['wc_nr'].isna()]['wc_nr']

wc_info_bayoba_1_df  0      21146.0
                     1      21833.0
                     2      21147.0
                     3      22986.0
                     4      21148.0
                             ...   
wc_info__WCOrg_1_df  560     1057.0
                     561    21040.0
                     562     2555.0
                     563     2066.0
                     564     1069.0
Name: wc_nr, Length: 11517, dtype: float64

# Clean and analyze concated_farmers

1. Drop duplicates (+ wc_nr and og_nr)
2. Count number of records by category
3. Clean column by column

In [950]:
# Looking for Duplicates
concated_farmers.duplicated().value_counts()

# 73 duplicates

False    21868
True        13
dtype: int64

In [951]:
# All duplicates in OG_Info_reserve__OGOrg_1_df
df_dup = concated_farmers[concated_farmers.duplicated()].sort_values('id')
df_dup

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OG_Info_reserve__OGOrg_1_df,43,47,2869.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Muchimba,Magret,female,...,2012-06-12,NaT,,Nsenga Primary School Box 46 Binga,Org,2016-10-25,True,5.0,,
OG_Info_reserve__OGOrg_1_df,47,51,2875.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Muleya,Maria 06-020301E06,female,...,2012-06-12,NaT,,Nsenga Primary School Box 46 Binga,Org,2016-10-25,True,5.0,,
OG_Info_reserve__OGOrg_1_df,189,133,3788.0,BNOG,Matabeleland North,Binga,15 Kariangwe,Rosella,Mumpande,Julia,female,...,2015-08-17,NaT,,Manseme School Binga,Org,2016-10-25,True,15.0,,
OG_Info_reserve__OGOrg_1_df,192,136,3793.0,BNOG,Matabeleland North,Binga,15 Kariangwe,Rosella,Muleya,Agnes,female,...,2015-08-17,NaT,,Manseme School Binga,Org,2015-08-17,True,15.0,,
OG_Info_reserve__OGOrg_1_df,455,1063,16138.0,MAOG,Manicaland,Makoni,21,Chillie,Magondwa,Patience,Female,...,2017-08-31,2017-10-19,High risk,"Chirimutsitu Primary P Bag 8147, Rusape",Dropout,2017-10-19,False,21.0,,
OG_Info_reserve__OGOrg_1_df,487,1097,16172.0,MAOG,Manicaland,Makoni,18,Chillie,Maponde,Kelvin,male,...,2017-08-31,2018-10-18,Not inspected,"Chirimutsitu Primary P Bag 8147, Rusape",Dropout,2017-08-31,False,18.0,,
OG_Info_reserve__OGOrg_1_df,545,1157,16232.0,MAOG,Manicaland,Makoni,21,Chillie,Nyamuva,Anna,Female,...,2017-08-31,2017-11-11,Not inspected,"Chirimutsitu Primary P Bag 8147, Rusape",Dropout,2017-11-11,False,21.0,,
OG_Info_reserve__OGOrg_1_df,569,1181,16256.0,MAOG,Manicaland,Makoni,19,Chillie,Tekere,Marry,Female,...,2017-08-31,2017-11-11,Not inspected,"Chirimutsitu Primary P Bag 8147, Rusape",Dropout,2017-11-11,False,19.0,,
OG_Info_reserve__OGOrg_1_df,348,1972,14109.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Siankali,Nolia,female,...,2018-08-31,2018-11-14,Insufficient documentation,Nsenga Primary School,Dropout,2018-08-31,False,5.0,,
OG_Info_reserve__OGOrg_1_df,360,1983,14120.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mwembe,Jairos,female,...,2018-08-31,2018-11-14,Insufficient documentation,Nsenga Primary School,Dropout,2018-08-31,False,5.0,,


In [952]:
ls_dup_og_nr= list(df_dup['og_nr'])

In [953]:
for ognr in ls_dup_og_nr:
    print(concated_farmers[concated_farmers['og_nr'] == ognr])

# duplicates trough OG_Info__OGOrg_1_df und OG_Info_reserve__OGOrg_1_df -> DROP duplicates in allfarmers

                                id   og_nr og_code     production_unit   area  \
OG_Info__OGOrg_1_df         70  47  2869.0    BNOG  Matabeleland North  Binga   
OG_Info_reserve__OGOrg_1_df 43  47  2869.0    BNOG  Matabeleland North  Binga   

                               ward_nr/name  species   surname first_names  \
OG_Info__OGOrg_1_df         70   5 Sinakoma  Rosella  Muchimba      Magret   
OG_Info_reserve__OGOrg_1_df 43   5 Sinakoma  Rosella  Muchimba      Magret   

                                   sex  ... date_contracted date_dropped_out  \
OG_Info__OGOrg_1_df         70  female  ...      2012-06-12              NaT   
OG_Info_reserve__OGOrg_1_df 43  female  ...      2012-06-12              NaT   

                               reason_dropped_out  \
OG_Info__OGOrg_1_df         70               None   
OG_Info_reserve__OGOrg_1_df 43               None   

                                                           address  \
OG_Info__OGOrg_1_df         70  Nsenga Primary Sch

In [954]:
# Droping duplicates
concated_farmers.drop_duplicates(subset=None, keep='first', inplace=True)

In [955]:
concated_farmers.duplicated().value_counts()

False    21868
dtype: int64

### Dropping duplicates of wc_nr

In [956]:
# 11517 wc_nr
wc_nr_df = concated_farmers[~concated_farmers['wc_nr'].isna()][['wc_nr']]
wc_nr_df

Unnamed: 0,Unnamed: 1,wc_nr
wc_info_bayoba_1_df,0,21146.0
wc_info_bayoba_1_df,1,21833.0
wc_info_bayoba_1_df,2,21147.0
wc_info_bayoba_1_df,3,22986.0
wc_info_bayoba_1_df,4,21148.0
...,...,...
wc_info__WCOrg_1_df,560,1057.0
wc_info__WCOrg_1_df,561,21040.0
wc_info__WCOrg_1_df,562,2555.0
wc_info__WCOrg_1_df,563,2066.0


#### wc_nr is unique!

In [957]:
wc_nr_df.squeeze().is_unique

True

In [958]:
# 21868 rows have og_nr or wc_nr!
concated_farmers[~concated_farmers['og_nr'].isna() | ~concated_farmers['wc_nr'].isna()].shape

(21868, 22)

### Dropping duplicates of og_nr

In [959]:
# 10351 
og_nr_df = concated_farmers[~concated_farmers['og_nr'].isna()][['og_nr', 'date_contracted']]
og_nr_df

Unnamed: 0,Unnamed: 1,og_nr,date_contracted
OGInfo__OGMB_1_df,0,13137.0,NaT
OGInfo__OGMB_1_df,1,12116.0,2022-04-02
OGInfo__OGMB_1_df,2,23949.0,2020-03-23
OGInfo__OGMB_1_df,3,23950.0,2020-03-23
OGInfo__OGMB_1_df,4,23951.0,2020-03-23
...,...,...,...
OG_Info_reserve__OGOrg_1_df,790,20023.0,2018-08-18
OG_Info_reserve__OGOrg_1_df,791,20026.0,2018-08-18
OG_Info_reserve__OGOrg_1_df,792,20027.0,2019-07-31
OG_Info_reserve__OGOrg_1_df,793,20028.0,2019-07-31


In [960]:
# og_nr NOT unique
og_nr_df['og_nr'].squeeze().is_unique

False

In [961]:
concated_farmers.sort_values('date_contracted', ascending=False, inplace=True)

In [962]:
concated_farmers[concated_farmers['og_nr'] == 20028.0][['og_nr', 'date_contracted']]

Unnamed: 0,Unnamed: 1,og_nr,date_contracted
OG_Info_reserve__OGOrg_1_df,793,20028.0,2019-07-31
OGInfo__OGMB_1_df,5506,20028.0,2019-02-01
OG_Info__OGOrg_1_df,981,20028.0,2019-01-02


In [963]:
# 1258 duplicates in og_nr
concated_farmers['og_nr'].duplicated().value_counts()

True     12774
False     9094
Name: og_nr, dtype: int64

In [974]:
#concated_farmers[concated_farmers['og_nr'].isna()] 

In [973]:
concated_farmers = concated_farmers[((concated_farmers.drop_duplicates('og_nr', keep='first')) | concated_farmers['og_nr'].isna())]

ValueError: cannot join with no overlapping index names

In [None]:
concated_farmers[concated_farmers['og_nr'] == 20028.0][['og_nr', 'date_contracted']]

Unnamed: 0,Unnamed: 1,og_nr,date_contracted
OG_Info_reserve__OGOrg_1_df,793,20028.0,2019-07-31


In [None]:
concated_farmers['og_nr'].duplicated().value_counts()

False    9094
Name: og_nr, dtype: int64

In [None]:
concated_farmers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9094 entries, ('OGInfo__OGMB_1_df', 145) to ('OGInfo__OGMB_1_df', 6528)
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        9094 non-null   int64         
 1   og_nr                     9093 non-null   float64       
 2   og_code                   9093 non-null   object        
 3   production_unit           9094 non-null   object        
 4   area                      9094 non-null   object        
 5   ward_nr/name              9092 non-null   object        
 6   species                   9094 non-null   object        
 7   surname                   9094 non-null   object        
 8   first_names               9094 non-null   object        
 9   sex                       9094 non-null   object        
 10  id_number                 7938 non-null   object        
 11  date_of_birth             1515 non

In [None]:
# Sorting back with index
concated_farmers.sort_index()

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,0,1545,13137.0,MBOGR,Mashonaland Central,Mbire,15,Rosella,Nyamayaro,Phillip,male,...,NaT,2020-02-05,Own choice,"Mahuwe sch, Box 92, Mbire D",Dropout,2017-12-02 00:00:00.000,False,15.0,,
OGInfo__OGMB_1_df,1,12026,12116.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Funda,Jennie,female,...,2022-04-02,NaT,,"Kasuwo Primary School 346, Mbire",Mabagrown,2022-04-02 00:00:00.000,False,6.0,,
OGInfo__OGMB_1_df,2,9914,23949.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Chingwena,Gladys,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,3,9915,23950.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Marumbe,Tariro,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,4,9916,23951.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Dambudzo,Ketai,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OG_Info_reserve__OGOrg_1_df,774,3529,20006.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Chatambudzika,Silver,male,...,2022-11-06,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-07,False,3.0,,
OG_Info_reserve__OGOrg_1_df,785,3541,20018.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Chiombera,Masimba,male,...,2022-10-06,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,792,3550,20027.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Musiiwa,Admire,male,...,2019-07-31,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,793,3551,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Masodzi,Norah,female,...,2019-07-31,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,


In [None]:
error

NameError: name 'error' is not defined

### Further Cleaning

In [None]:
concated_farmers['organic_status'].value_counts()

Org             9948
Dropout         7700
New             2724
Mabagrown       1113
Reinstated       190
Uncertified      149
dropout           33
Under review      10
Name: organic_status, dtype: int64

In [None]:
concated_farmers[concated_farmers['id'].isna()]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code


In [None]:
df = concated_farmers[~concated_farmers['id_number'].isna()][['id_number']]
df.squeeze().is_unique

False

In [None]:
concated_farmers['area'] = concated_farmers['area'].str.title()

In [None]:
concated_farmers['area'] = concated_farmers['area'].replace('Mt Darwin','Mt Darwin')

In [None]:
concated_farmers.shape

(21868, 22)

In [None]:
concated_farmers['area'].value_counts()

Mbire                   8128
Binga                   2655
Rushinga                1750
Mudzi                   1590
Chimanimani             1369
Buhera                  1152
Chipinge                 977
Mt Darwin                869
Makoni                   549
Beitbridge               529
Kwekwe                   270
Hwange                   260
Uzumbamarambapfungwe     257
Chimanimani Rusitu       196
Mwenezi                  173
Karoi                    163
Triangle                 163
Mutoko                   126
Mberengwa                115
Mt Darwin                 99
Chimanimani Tilbury       95
Chivi                     94
Mushumbi                  89
Beitbridge                84
Matobo                    44
Makoni Mt Shalom          43
Hwange                    29
Name: area, dtype: int64

In [None]:
concated_farmers['area'].unique()
# Correct 'chipinge' first upper

array(['Mbire', 'Triangle', 'Karoi', 'Mushumbi', 'Binga', 'Makoni',
       'Chimanimani', 'Chipinge', 'Chivi', 'Buhera', 'Mt Darwin',
       'Beitbridge', 'Mt Darwin ', 'Mudzi', 'Beitbridge ', 'Rushinga',
       'Kwekwe', 'Mwenezi', 'Uzumbamarambapfungwe', 'Mutoko', 'Mberengwa',
       'Matobo', 'Hwange', 'Hwange ', 'Chimanimani Rusitu',
       'Chimanimani Tilbury', 'Makoni Mt Shalom'], dtype=object)

In [None]:
concated_farmers['production_unit'] = concated_farmers['production_unit'].replace('Mashonaland Centra','Mashonaland Central')

In [None]:
concated_farmers['production_unit'].value_counts()

Mashonaland Central        8216
Northern Eastern region    4691
South Eastern region       3498
Matabeleland North         2945
Southern region             995
Manicaland                  883
Midlands                    270
Masvingo                    163
Mashonaland West            163
Matabeleland South           44
Name: production_unit, dtype: int64

In [None]:
concated_farmers['ward_nr'].unique()

# correct to int

array([15.,  6., 13.,  5.,  7.,  2.,  9., 17., 16.,  3.,  4., 27., 12.,
       11., 10.,  8., 14., 19., 18., 23., 21.,  1., 25., 20., 29., 33.,
       24., 28., 30., 31., 34., 32., nan, 22.])

In [None]:
# 'id_number has not unique and has 3254 duplicates'
#concated_farmers['id_number'].is_unique
concated_farmers['id_number'].duplicated().value_counts()

False    18627
True      3241
Name: id_number, dtype: int64

#### Setting values of species, sex, reason_droppen_out and organic_status to lower case

In [None]:
concated_farmers['species'] = concated_farmers['species'].str.lower()
concated_farmers['sex'] = concated_farmers['sex'].str.lower()

concated_farmers['reason_dropped_out'] = concated_farmers['reason_dropped_out'].str.lower()
concated_farmers['organic_status'] = concated_farmers['organic_status'].str.lower()

In [None]:
concated_farmers['organic_status'].unique()

array(['dropout', 'mabagrown', 'new', 'org', 'reinstated', 'uncertified',
       None, 'under review'], dtype=object)

In [None]:
concated_farmers['species'].unique()

# correct to types rosella, chili, paprika,  

# '5 sinakoma', 'robert' correct to None

array(['rosella', 'roseela', 'rosell', 'rosella, chilli, paprika',
       'ronald', '5 sinakoma', 'paprika', 'rosella, strophantus',
       'chilli, paprika', 'paprika, chilli', 'chilli,paprika', 'chillie',
       'chilli', 'chili', 'chillie/paprica ', 'baobab', 'marula',
       'baobab + ximenia', 'trichillia', 'baobab+kms+ximenia',
       'kalahari melon seed', 'ximenia caffra', 'kms + ximenia',
       'ximenia americana', 'devils claw', 'wild gotu kola'], dtype=object)

In [None]:
concated_farmers['species'] = concated_farmers['species'].replace({'5 sinakoma': 'None', 'ronald':  'None','roseela': 'rosella', 'rosell': 'rosella', 
                                                                   'paprika, chilli': 'chilli, paprika', 'chili': 'chilli',
                                                                    'chilli,paprika':'chilli, paprika', 'chillie': 'chilli', 
                                                                    'chillie/paprica ': 'chilli, paprika',
                                                                    'baobab + ximenia': 'baobab, ximenia', 'baobab+kms+ximenia': 'baobab, kms, ximenia',
                                                                    'kalahari melon seed': 'kalahari, melon, seed', 'kms + ximenia': 'kms, ximenia'
                                                                    })

In [None]:
concated_farmers['wc_nr'].duplicated().value_counts()

False    11518
True     10350
Name: wc_nr, dtype: int64

In [None]:
# 11517 wc_nr
concated_farmers[~concated_farmers['wc_nr'].isna()]['wc_nr']

wc_info_bayoba_1_df  0      21146.0
                     1      21833.0
                     2      21147.0
                     3      22986.0
                     4      21148.0
                             ...   
wc_info__WCOrg_1_df  560     1057.0
                     561    21040.0
                     562     2555.0
                     563     2066.0
                     564     1069.0
Name: wc_nr, Length: 11517, dtype: float64

In [None]:
concated_farmers['ward_nr'].duplicated().value_counts()

True     21834
False       34
Name: ward_nr, dtype: int64

In [None]:
concated_farmers['ward_nr'].is_unique

False

In [None]:
concated_farmers['ward_nr'].unique()

array([15.,  6., 13.,  5.,  7.,  2.,  9., 17., 16.,  3.,  4., 27., 12.,
       11., 10.,  8., 14., 19., 18., 23., 21.,  1., 25., 20., 29., 33.,
       24., 28., 30., 31., 34., 32., nan, 22.])

In [None]:
concated_farmers.iloc[0]

id                                                 1545
og_nr                                           13137.0
og_code                                           MBOGR
production_unit                     Mashonaland Central
area                                              Mbire
ward_nr/name                                         15
species                                         rosella
surname                                       Nyamayaro
first_names                                     Phillip
sex                                                male
id_number                                  47-158929E47
date_of_birth                                       NaT
date_contracted                                     NaT
date_dropped_out                    2020-02-05 00:00:00
reason_dropped_out                           own choice
address                     Mahuwe sch, Box 92, Mbire D
organic_status                                  dropout
organic_status_from_date        2017-12-02 00:00

In [None]:
# concated_farmers has 9094 rows × 22 columns

In [None]:
concated_farmers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9094 entries, ('OGInfo__OGMB_1_df', 0) to ('OG_Info_reserve__OGOrg_1_df', 794)
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        9094 non-null   int64         
 1   og_nr                     9093 non-null   float64       
 2   og_code                   9093 non-null   object        
 3   production_unit           9094 non-null   object        
 4   area                      9094 non-null   object        
 5   ward_nr/name              9093 non-null   object        
 6   species                   9094 non-null   object        
 7   surname                   9094 non-null   object        
 8   first_names               9094 non-null   object        
 9   sex                       9094 non-null   object        
 10  id_number                 7939 non-null   object        
 11  date_of_birth             1

#### Upload the prepared farmer data

table_name = 'all_farmers'

if engine!=None:
    try:
        concated_farmers.to_sql(name=table_name, # Name of SQL table
                                con=engine, # Engine or connection
                                if_exists='replace', # Drop the table before inserting new values 
                                schema=schema, # Use schmea that was defined earlier
                                index=False, # Write DataFrame index as a column
                                chunksize=5000, # Specify the number of rows in each batch to be written at a time
                                method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None