In [12696]:
import pandas as pd
import sql_functions as sf

In [12697]:
schema = 'organic_africa' # UPDATE 'TABLE_SCHEMA' based on schema used in class 
engine = sf.get_engine()

**Importing tables of farmer data as Dataframes**

5 Dfs stored in one Dictionary 

In [12698]:
all_farmers = {}

all_farmers['OGInfo__OGMB_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OGInfo__OGMB_1"')
all_farmers['OG_Info__OGOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OG_Info__OGOrg_1"')
all_farmers['wc_info_bayoba_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_info_bayoba_1"')
all_farmers['wc_info__WCOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_info__WCOrg_1"')
all_farmers['OG_Info_reserve__OGOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OG_Info_reserve__OGOrg_1"')


## Preparing imported farmer dataframes

**Preparing farmer dfs to concat them to one big df**

1. Reduce all farmer df to the necessary ~20 columns 
2. Set all column names to lowercase
3. Renaming equal columns with equal column names
4. Checking and converting data types

In [12699]:
# Reduce all farmer df to the necessary ~20 columns
all_farmers['OGInfo__OGMB_1_df'] = all_farmers['OGInfo__OGMB_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                   'Species', 'Surname', 'First_names', 'Sex', 'ID_Number',
                                                                   'Date_of_Birth', 'date_contracted', 'Date_dropped_out', 'Reason_dropped_out',
                                                                   'Address', 'OG_Organic_Status', 'OG_Organic_Status_from_Date', 'Fairtrade']]

all_farmers['OG_Info__OGOrg_1_df'] = all_farmers['OG_Info__OGOrg_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number', 'Date_of_Birth',
                                                                       'Date_contracted', 'Date_dropped_out', 'Reason_dropped_out', 
                                                                       'OG_Organic_Status', 'Address', 'OG_Organic_Status_from_Date',
                                                                       'Fairtrade','ward_nr']]

all_farmers['wc_info_bayoba_1_df'] = all_farmers['wc_info_bayoba_1_df'][['ID', 'WC_Nr', 'WC_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number', 
                                                                       'Date_contracted', 'Date_dropped_out',
                                                                       'Reason_dropped_out', 'Address', 'Organic_Status', 
                                                                       'Organic_Status_from_Date', 'Fairtrade', 'Date_of_Birth', 'ward_nr']]

all_farmers['wc_info__WCOrg_1_df'] = all_farmers['wc_info__WCOrg_1_df'][['ID', 'WC_Nr', 'WC_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number',
                                                                       'Date_contracted', 'Reason_dropped_out', 'Address', 'Organic_Status', 
                                                                       'Organic_Status_from_Date', 'Fairtrade', 'Date_dropped_out', 'ward_nr']]

all_farmers['OG_Info_reserve__OGOrg_1_df'] = all_farmers['OG_Info_reserve__OGOrg_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 
                                                                                       'Ward_Nr/Name', 'Species', 'Surname', 'First_names',
                                                                                       'Sex', 'ID_Number', 'Date_of_Birth', 'Date_contracted',
                                                                                       'Date_dropped_out', 'Reason_dropped_out', 'Address',
                                                                                       'OG_Organic_Status', 'OG_Organic_Status_from_Date', 
                                                                                       'Fairtrade', 'ward_nr']]

In [12700]:
# Set all column names to lowercase

for df in all_farmers.values():
    #print(df)
    df.columns = map(str.lower, df.columns)

In [12701]:
all_farmers['OGInfo__OGMB_1_df'].columns

Index(['id', 'og_nr', 'og_code', 'production_unit', 'area', 'ward_nr/name',
       'species', 'surname', 'first_names', 'sex', 'id_number',
       'date_of_birth', 'date_contracted', 'date_dropped_out',
       'reason_dropped_out', 'address', 'og_organic_status',
       'og_organic_status_from_date', 'fairtrade'],
      dtype='object')

In [12702]:
# Renaming equal columns with equal column names
all_farmers['OGInfo__OGMB_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)
all_farmers['OG_Info__OGOrg_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)
all_farmers['OG_Info_reserve__OGOrg_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)

**4. Checking and converting data types**

.info() .dtype() type()

pd.to_datetime()

#### OGInfo__OGMB_1

In [12703]:
all_farmers['OGInfo__OGMB_1_df'].info()

# looks good!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8166 entries, 0 to 8165
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        8166 non-null   int64         
 1   og_nr                     8166 non-null   int64         
 2   og_code                   8166 non-null   object        
 3   production_unit           8166 non-null   object        
 4   area                      8166 non-null   object        
 5   ward_nr/name              8166 non-null   int64         
 6   species                   8166 non-null   object        
 7   surname                   8166 non-null   object        
 8   first_names               8166 non-null   object        
 9   sex                       8166 non-null   object        
 10  id_number                 7073 non-null   object        
 11  date_of_birth             657 non-null    datetime64[ns]
 12  date_contracted     

In [12704]:
type(all_farmers['OGInfo__OGMB_1_df']['date_contracted'][2])

pandas._libs.tslibs.timestamps.Timestamp

In [12705]:
all_farmers['OGInfo__OGMB_1_df'][all_farmers['OGInfo__OGMB_1_df']['og_nr']==13190.0]

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade
239,1414,13190,MBOGR,Mashonaland Central,Mbire,6,Rosella,Gurupira,Vengayi,male,71-083974N71,NaT,2147-12-06,2020-03-19,Own choice,"Nyarutomno sch, Box 500, Mbire D",Dropout,2017-12-06 00:00:00.000,False


In [12706]:
# Solve through replacing '2147-12-06' at index 239
all_farmers['OGInfo__OGMB_1_df'].replace('2147-12-06', '2017-12-06', inplace=True)

In [12707]:
all_farmers['OGInfo__OGMB_1_df'][all_farmers['OGInfo__OGMB_1_df']['og_nr']==14304.0]

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade
1383,2587,14304,MBOGR,Mashonaland Central,Mbire,17,Rosella,Makuwerere,Agness,female,63-240806P71,NaT,2018-11-15,2002-05-06,Own choice,Mushumbi Primary School Bag 2016,Dropout,2018-11-15 00:00:00.000,False


In [12708]:
# Solve through replacing '2147-12-06' at index 239
all_farmers['OGInfo__OGMB_1_df'].replace('2002-05-06', '2022-05-06', inplace=True)

In [12709]:
# No wrong birthdate
all_farmers['OGInfo__OGMB_1_df'][all_farmers['OGInfo__OGMB_1_df']['date_of_birth']>'2006-09-02']['date_of_birth']

Series([], Name: date_of_birth, dtype: datetime64[ns])

#### OG_Info__OGOrg_1_df

In [12710]:
all_farmers['OG_Info__OGOrg_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        1403 non-null   int64 
 1   og_nr                     1403 non-null   int64 
 2   og_code                   1403 non-null   object
 3   production_unit           1403 non-null   object
 4   area                      1403 non-null   object
 5   ward_nr/name              1403 non-null   object
 6   species                   1403 non-null   object
 7   surname                   1403 non-null   object
 8   first_names               1403 non-null   object
 9   sex                       1403 non-null   object
 10  id_number                 1310 non-null   object
 11  date_of_birth             930 non-null    object
 12  date_contracted           1403 non-null   object
 13  date_dropped_out          405 non-null    object
 14  reason_dropped_out      

In [12711]:
# converted date_of_birth
all_farmers['OG_Info__OGOrg_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_of_birth'])

Converting 'date_contracted' to datetime

In [12712]:
# all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

# Out of bounds nanosecond timestamp: 218-08-19 00:00:00 present at position 59

# all_farmers['OG_Info__OGOrg_1_df'].iloc[59,]
# all_farmers['OG_Info__OGOrg_1_df'].loc[all_farmers['OG_Info__OGOrg_1_df']['date_contracted']=='218-08-19']

# Did not found 218-08-19 at index 59 but the seems to get corrected in the next steps

In [12713]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[59]['date_contracted']

datetime.date(2012, 12, 3)

In [12714]:
all_farmers['OG_Info__OGOrg_1_df'].replace('218-08-19', '2018-08-19', inplace=True)


In [12715]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[50:60,12]

50    2023-02-24
51    2012-12-06
52    2012-12-06
53    2012-12-06
54    2012-12-06
55    2012-12-06
56    2012-12-03
57    2012-12-03
58    2013-08-08
59    2012-12-03
Name: date_contracted, dtype: object

Solving date_contracted errors with padding strings

In [12716]:
# Function to pad/fill to short dates like 5/3/18 to 05/03/2018

def str_date_pad(val):
    
    if type(val) is str and len(val) < 10: 

        date_splitted = val.split('/')
        
        if len(date_splitted[1]) < 2 : 
            date_splitted[1] = '0' + date_splitted[1]

        if (len(date_splitted[2]) < 4) and (int(date_splitted[2]) < 24) : 
            date_splitted[2] = '20' + date_splitted[2]
        if len(date_splitted[2]) < 4 and int(date_splitted[2]) > 23 : 
            date_splitted[2] = '19' + date_splitted[2]
    
        val = '/'.join(date_splitted)
        #print(val)
        return val
    
    else:
        return val


In [12717]:
# Apply function to df
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = all_farmers['OG_Info__OGOrg_1_df']['date_contracted'].apply(str_date_pad)

In [12718]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[50:60,12]

50    2023-02-24
51    2012-12-06
52    2012-12-06
53    2012-12-06
54    2012-12-06
55    2012-12-06
56    2012-12-03
57    2012-12-03
58    2013-08-08
59    2012-12-03
Name: date_contracted, dtype: object

In [12719]:
# Now after padding convert str to datetime gives still problem !
# all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

In [12720]:
# Problem with date '19/08/19218' at 54 BUT its in position/row_index 741
#all_farmers['OG_Info__OGOrg_1_df'].loc[all_farmers['OG_Info__OGOrg_1_df']['date_contracted']=='19/08/19218']
# cause year '218' padded to '19218'

In [12721]:
# Solve through replacing '19/08/19218' by '19/08/2018' and dropping '01/00/2022'
all_farmers['OG_Info__OGOrg_1_df'].replace('19/08/19218', '19/08/2018', inplace=True)
all_farmers['OG_Info__OGOrg_1_df'].replace('01/00/2022', None, inplace=True)

In [12722]:
# Ready to convert str to datetime
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

In [12723]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[65]['date_contracted']

Timestamp('2012-12-06 00:00:00')

In [12724]:
all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'])

In [12725]:
all_farmers['OG_Info__OGOrg_1_df'].replace('2018-01-09', '2018-09-01', inplace=True)
all_farmers['OG_Info__OGOrg_1_df'][all_farmers['OG_Info__OGOrg_1_df']['og_nr']==19816.0]

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,organic_status,address,organic_status_from_date,fairtrade,ward_nr
756,2174,19816,BNOG,Matabeleland North,Binga,15 Kariangwe,Rosella,Moyo,Joyce,female,06-064378P06,1978-07-07,2018-08-10,2018-09-01,Not externally inspected,Dropout,Manseme Primary School,2018-08-10,False,15


In [12726]:
all_farmers['OG_Info__OGOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        1403 non-null   int64         
 1   og_nr                     1403 non-null   int64         
 2   og_code                   1403 non-null   object        
 3   production_unit           1403 non-null   object        
 4   area                      1403 non-null   object        
 5   ward_nr/name              1403 non-null   object        
 6   species                   1403 non-null   object        
 7   surname                   1403 non-null   object        
 8   first_names               1403 non-null   object        
 9   sex                       1403 non-null   object        
 10  id_number                 1310 non-null   object        
 11  date_of_birth             930 non-null    datetime64[ns]
 12  date_contracted     

In [12727]:
# No wrong birthdate
all_farmers['OG_Info__OGOrg_1_df'][all_farmers['OG_Info__OGOrg_1_df']['date_of_birth']>'2006-09-02']['date_of_birth']

Series([], Name: date_of_birth, dtype: datetime64[ns])

#### wc_info_bayoba_1_df

In [12728]:
bayoba_org_df = all_farmers['wc_info_bayoba_1_df'].copy()

In [12729]:
all_farmers['wc_info_bayoba_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10952 entries, 0 to 10951
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        10952 non-null  int64  
 1   wc_nr                     10952 non-null  int64  
 2   wc_code                   10952 non-null  object 
 3   production_unit           10952 non-null  object 
 4   area                      10952 non-null  object 
 5   ward_nr/name              10535 non-null  object 
 6   species                   10952 non-null  object 
 7   surname                   10951 non-null  object 
 8   first_names               10949 non-null  object 
 9   sex                       10952 non-null  object 
 10  id_number                 10248 non-null  object 
 11  date_contracted           10951 non-null  object 
 12  date_dropped_out          1924 non-null   object 
 13  reason_dropped_out        1054 non-null   object 
 14  addres

In [12730]:
all_farmers['wc_info_bayoba_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_of_birth'])

In [12731]:
all_farmers['wc_info_bayoba_1_df']['date_contracted'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_contracted'])

In [12732]:
all_farmers['wc_info_bayoba_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_dropped_out'])

In [12733]:
all_farmers['wc_info_bayoba_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10952 entries, 0 to 10951
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        10952 non-null  int64         
 1   wc_nr                     10952 non-null  int64         
 2   wc_code                   10952 non-null  object        
 3   production_unit           10952 non-null  object        
 4   area                      10952 non-null  object        
 5   ward_nr/name              10535 non-null  object        
 6   species                   10952 non-null  object        
 7   surname                   10951 non-null  object        
 8   first_names               10949 non-null  object        
 9   sex                       10952 non-null  object        
 10  id_number                 10248 non-null  object        
 11  date_contracted           10951 non-null  datetime64[ns]
 12  date_dropped_out  

In [12734]:
# 37 wrong birthdates from 990 birthdates
all_farmers['wc_info_bayoba_1_df'][all_farmers['wc_info_bayoba_1_df']['date_of_birth']>'2009-09-02']['date_of_birth']

8978    2042-08-09
9011    2049-04-18
9045    2021-07-25
9050    2019-01-18
9436    2021-04-19
9442    2019-01-01
9460    2021-07-23
9638    2016-07-14
9693    2046-08-12
9727    2021-03-18
9857    2045-01-11
9858    2047-08-17
9862    2046-03-02
10194   2048-11-29
10198   2022-03-15
10209   2040-08-10
10224   2049-09-13
10226   2038-02-16
10243   2012-10-22
10255   2046-03-21
10267   2039-03-13
10280   2049-06-17
10282   2046-07-16
10306   2047-02-19
10415   2040-01-10
10438   2045-07-17
10440   2047-08-02
10478   2048-04-09
10484   2048-01-09
10486   2048-08-10
10513   2022-05-17
10544   2022-04-17
10619   2041-08-15
10673   2043-07-07
10758   2047-01-04
10779   2044-07-09
10812   2049-02-11
Name: date_of_birth, dtype: datetime64[ns]

In [12735]:
org_birth = bayoba_org_df[~bayoba_org_df['date_of_birth'].isna()][['wc_nr','date_of_birth']]
merge = pd.merge(org_birth, all_farmers['wc_info_bayoba_1_df'][['wc_nr','date_of_birth']], on='wc_nr', how='left')

# wrong date in original data -> dropping the 37 wrong birthdates
merge[merge['date_of_birth_y']>'2009-09-02'].head()

Unnamed: 0,wc_nr,date_of_birth_x,date_of_birth_y
11,11586,2042-08-09,2042-08-09
24,11555,2049-04-18,2049-04-18
47,11520,2021-07-25,2021-07-25
51,11515,2019-01-18,2019-01-18
68,14026,2021-04-19,2021-04-19


In [12736]:
wc_nr = all_farmers['wc_info_bayoba_1_df'][all_farmers['wc_info_bayoba_1_df']['date_of_birth']>'2009-09-02']['wc_nr']
wc_nr.index

Int64Index([ 8978,  9011,  9045,  9050,  9436,  9442,  9460,  9638,  9693,
             9727,  9857,  9858,  9862, 10194, 10198, 10209, 10224, 10226,
            10243, 10255, 10267, 10280, 10282, 10306, 10415, 10438, 10440,
            10478, 10484, 10486, 10513, 10544, 10619, 10673, 10758, 10779,
            10812],
           dtype='int64')

In [12737]:
#ind = all_farmers['wc_info_bayoba_1_df'][all_farmers['wc_info_bayoba_1_df']['wc_nr'].isin(list(wc_nr))]['date_of_birth'].index

all_farmers['wc_info_bayoba_1_df']['date_of_birth'].iloc[wc_nr.index] = 'NaT'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_farmers['wc_info_bayoba_1_df']['date_of_birth'].iloc[wc_nr.index] = 'NaT'


#### wc_info__WCOrg_1_df

In [12738]:
all_farmers['wc_info__WCOrg_1_df'].info()

# has no date_of_birth column -> 19 columns
# need to convert date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        565 non-null    int64  
 1   wc_nr                     565 non-null    int64  
 2   wc_code                   565 non-null    object 
 3   production_unit           565 non-null    object 
 4   area                      565 non-null    object 
 5   ward_nr/name              563 non-null    object 
 6   species                   565 non-null    object 
 7   surname                   564 non-null    object 
 8   first_names               564 non-null    object 
 9   sex                       565 non-null    object 
 10  id_number                 472 non-null    object 
 11  date_contracted           558 non-null    object 
 12  reason_dropped_out        142 non-null    object 
 13  address                   530 non-null    object 
 14  organic_st

In [12739]:
all_farmers['wc_info__WCOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_contracted'])

In [12740]:
all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'])

  all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'])


In [12741]:
# dropping the date_dropped_out because its before date_contracted and organic_status_from_date
all_farmers['wc_info__WCOrg_1_df'].replace('2016-07-11', 'NaT', inplace=True)
all_farmers['wc_info__WCOrg_1_df'][all_farmers['wc_info__WCOrg_1_df']['wc_nr']==2101.0]

Unnamed: 0,id,wc_nr,wc_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_contracted,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,date_dropped_out,ward_nr
80,3155,2101,CMWC,Manicaland,Chimanimani Tilbury,12,Wild Gotu Kola,Zireni Seda,Florence,female,44-005734T44,2018-10-20,,Tilbury Sch Box Tilbury Chimanimani,Reinstated,2018-09-18,False,NaT,12.0


In [12742]:
all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'])

In [12743]:
all_farmers['wc_info__WCOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        565 non-null    int64         
 1   wc_nr                     565 non-null    int64         
 2   wc_code                   565 non-null    object        
 3   production_unit           565 non-null    object        
 4   area                      565 non-null    object        
 5   ward_nr/name              563 non-null    object        
 6   species                   565 non-null    object        
 7   surname                   564 non-null    object        
 8   first_names               564 non-null    object        
 9   sex                       565 non-null    object        
 10  id_number                 472 non-null    object        
 11  date_contracted           558 non-null    datetime64[ns]
 12  reason_dropped_out    

#### OG_Info_reserve__OGOrg_1_df

In [12744]:
reserve_org_df = all_farmers['OG_Info_reserve__OGOrg_1_df'].copy()

In [12745]:
reserve_org_df['date_contracted'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 795 entries, 0 to 794
Series name: date_contracted
Non-Null Count  Dtype 
--------------  ----- 
795 non-null    object
dtypes: object(1)
memory usage: 6.3+ KB


In [12746]:
all_farmers['OG_Info_reserve__OGOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        795 non-null    int64  
 1   og_nr                     795 non-null    int64  
 2   og_code                   795 non-null    object 
 3   production_unit           795 non-null    object 
 4   area                      795 non-null    object 
 5   ward_nr/name              794 non-null    object 
 6   species                   795 non-null    object 
 7   surname                   795 non-null    object 
 8   first_names               795 non-null    object 
 9   sex                       795 non-null    object 
 10  id_number                 734 non-null    object 
 11  date_of_birth             267 non-null    object 
 12  date_contracted           795 non-null    object 
 13  date_dropped_out          255 non-null    object 
 14  reason_dro

In [12747]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['organic_status_from_date'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['organic_status_from_date'])

In [12748]:
# Apply function to df
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] = all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'].apply(str_date_pad)

In [12749]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'])

In [12750]:
all_farmers['OG_Info_reserve__OGOrg_1_df'].replace('19/08/19218', '19/08/2018', inplace=True)
#all_farmers['OG_Info_reserve__OGOrg_1_df'][all_farmers['OG_Info_reserve__OGOrg_1_df']['og_nr']==2101.0]
# all_farmers['OG_Info_reserve__OGOrg_1_df'].iloc[51]

In [12751]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'])

  all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'])


In [12752]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'])

In [12753]:
all_farmers['OG_Info_reserve__OGOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        795 non-null    int64         
 1   og_nr                     795 non-null    int64         
 2   og_code                   795 non-null    object        
 3   production_unit           795 non-null    object        
 4   area                      795 non-null    object        
 5   ward_nr/name              794 non-null    object        
 6   species                   795 non-null    object        
 7   surname                   795 non-null    object        
 8   first_names               795 non-null    object        
 9   sex                       795 non-null    object        
 10  id_number                 734 non-null    object        
 11  date_of_birth             267 non-null    datetime64[ns]
 12  date_contracted       

#### Correcting date_contracted, that are younger than date_dropped_out 

In [12754]:
reserve_org_df.shape

(795, 20)

In [12755]:
reserve_df = all_farmers['OG_Info_reserve__OGOrg_1_df'].copy()
reserve_df.shape

(795, 20)

In [12756]:
reserve_org_df['date_contracted'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 795 entries, 0 to 794
Series name: date_contracted
Non-Null Count  Dtype 
--------------  ----- 
795 non-null    object
dtypes: object(1)
memory usage: 6.3+ KB


In [12757]:
merged = pd.merge(reserve_df[['og_nr','date_contracted','date_dropped_out','organic_status_from_date']], reserve_org_df[['og_nr','date_contracted','date_dropped_out','organic_status_from_date']],on='og_nr', how='left')

In [12758]:
# x is reserve to_datetime formatted and y is from original reserve table!

merged[['og_nr','date_contracted_x','date_contracted_y']]
# merged[['og_nr','date_dropped_out_x','date_dropped_out_y']]
# merged[['og_nr','organic_status_from_date_x','organic_status_from_date_y']]

Unnamed: 0,og_nr,date_contracted_x,date_contracted_y
0,3463,2013-09-07,09/07/2013
1,4084,2014-02-17,17/02/2014
2,4085,2014-02-17,17/02/2014
3,4088,2014-02-17,17/02/2014
4,5156,2015-08-17,17/08/2015
...,...,...,...
790,20026,2019-01-02,01/02/2019
791,20027,2019-01-02,01/02/2019
792,20028,2019-01-02,01/02/2019
793,14001,2018-08-31,31/8/18


In [12759]:
merged['date_contracted_y'].info()

<class 'pandas.core.series.Series'>
Int64Index: 795 entries, 0 to 794
Series name: date_contracted_y
Non-Null Count  Dtype 
--------------  ----- 
795 non-null    object
dtypes: object(1)
memory usage: 12.4+ KB


In [12760]:
merged[merged['date_contracted_x']!=merged['date_contracted_y']][['og_nr','date_contracted_x','date_contracted_y']]

  merged[merged['date_contracted_x']!=merged['date_contracted_y']][['og_nr','date_contracted_x','date_contracted_y']]


Unnamed: 0,og_nr,date_contracted_x,date_contracted_y
0,3463,2013-09-07,09/07/2013
1,4084,2014-02-17,17/02/2014
2,4085,2014-02-17,17/02/2014
3,4088,2014-02-17,17/02/2014
4,5156,2015-08-17,17/08/2015
...,...,...,...
790,20026,2019-01-02,01/02/2019
791,20027,2019-01-02,01/02/2019
792,20028,2019-01-02,01/02/2019
793,14001,2018-08-31,31/8/18


In [12761]:
negative_duration = reserve_df[reserve_df['date_contracted'] >= reserve_df['date_dropped_out']][['og_nr','date_contracted','date_dropped_out','organic_status_from_date']]
negative_duration

Unnamed: 0,og_nr,date_contracted,date_dropped_out,organic_status_from_date
596,19412,2018-04-10,2018-01-09,2018-10-04
755,19816,2018-10-08,2018-01-09,2018-08-10


In [12762]:
all_farmers['OG_Info_reserve__OGOrg_1_df'][all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] >= all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out']][['og_nr','date_contracted','date_dropped_out','organic_status_from_date']]

Unnamed: 0,og_nr,date_contracted,date_dropped_out,organic_status_from_date
596,19412,2018-04-10,2018-01-09,2018-10-04
755,19816,2018-10-08,2018-01-09,2018-08-10


In [12763]:

# merged = pd.merge(negative_duration,all_farmers['OG_Info_reserve__OGOrg_1_df'][['og_nr','date_contracted','date_dropped_out','organic_status_from_date']],on='og_nr', how='left')
# compared to original table
# its the same
# merged[['og_nr','date_contracted_x','date_contracted_y']]
# merged[['og_nr','date_dropped_out_x','date_dropped_out_y']]
# merged[['og_nr','organic_status_from_date_x','organic_status_from_date_y']]

In [12764]:
og_list = negative_duration['og_nr']
og_list

596    19412
755    19816
Name: og_nr, dtype: int64

In [12765]:
# [19490.0, 19485.0, 16220.0,...]
# 41 og_nr s to correct

# getting index of filtered rows
ind_org = reserve_df[(reserve_df['organic_status_from_date'] > reserve_df['date_dropped_out']) & (reserve_df['og_nr'].isin(og_list))].index
date1 = reserve_df['date_dropped_out'].iloc[ind_org]
date2 = reserve_df['date_contracted'].iloc[ind_org]
# getting rows in original table to change 
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'].iloc[ind_org] = date1
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'].iloc[ind_org] = date2
# all_farmers['OG_Info_reserve__OGOrg_1_df'][(all_farmers['OG_Info_reserve__OGOrg_1_df']['organic_status_from_date'] < all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out']) & all_farmers['OG_Info_reserve__OGOrg_1_df']['og_nr'].isin(og_list)]
# all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'][7] = reserve_df['organic_status_from_date'][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'].iloc[ind_org] = date1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'].iloc[ind_org] = date2


In [12766]:
all_farmers['OG_Info_reserve__OGOrg_1_df'][all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] >= all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out']][['og_nr','date_contracted','date_dropped_out','organic_status_from_date']]

Unnamed: 0,og_nr,date_contracted,date_dropped_out,organic_status_from_date


##### Dropping wrong birthdates

In [12767]:
all_farmers['OG_Info_reserve__OGOrg_1_df'][all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth']>'2000-09-02']

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr
335,1960,14097,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Munsaka,Sarah,female,06-097909L06,2020-02-12,2018-08-31,NaT,,Nsenga Primary School,Org,2018-08-31,False,5.0
588,2055,19404,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Muleya,Cynthia,female,06-080976E06,2019-10-28,2018-04-10,NaT,,Nsenga Primary School,Org,2018-10-04,False,5.0
589,2056,19405,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Siankale,Jessie,female,06-011048X06,2019-04-05,2018-04-10,NaT,,Dongamuse Primary School Box 48,Org,2018-10-04,False,5.0
590,2057,19406,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mwembe,Maria,female,06-010831L06,2019-08-07,2018-04-10,2019-01-09,Not externally inspected,Dongamuse Primary School Box 48 Binga,Dropout,2018-10-04,False,5.0
721,2127,19474,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Mupande,Joseph,male,06-033356V06,2019-09-09,2018-05-10,NaT,,Lubu Primary School Box 5729,Org,2018-10-05,False,5.0


In [12768]:
# reserve_org_df[reserve_org_df['og_nr']==14097][['og_nr','date_of_birth','date_contracted']]
# original table has wrong date_of_birth s like 2020-02-12
# deleting 5 date_of_birth s younger 2018

og_nr = all_farmers['OG_Info_reserve__OGOrg_1_df'][all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth']>'2000-09-02']['og_nr']
ind = all_farmers['OG_Info_reserve__OGOrg_1_df'][all_farmers['OG_Info_reserve__OGOrg_1_df']['og_nr'].isin(og_nr)]['date_of_birth'].index
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'].iloc[ind] = 'NaT'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'].iloc[ind] = 'NaT'


In [12769]:
all_farmers['OG_Info_reserve__OGOrg_1_df'][all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth']>'2000-09-02']

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr


### Adding Birthdates to wc_info_WCOrg_1_df

In [12770]:
all_birthdates_df = sf.get_dataframe(f'SELECT * FROM {schema}."all_birthdates"')

In [12771]:
dic = set(all_birthdates_df["wc_nr"]) & set(all_farmers['wc_info__WCOrg_1_df'][['wc_nr']].squeeze())

In [12772]:
len(dic)

453

#### Merge all_birthdates_df to wc_info__WCOrg_1_df

In [12773]:
all_farmers['wc_info__WCOrg_1_df'] = pd.merge(all_farmers['wc_info__WCOrg_1_df'], all_birthdates_df, how="left", on="wc_nr")

In [12774]:
all_farmers['wc_info__WCOrg_1_df']

Unnamed: 0,id,wc_nr,wc_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_contracted,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,date_dropped_out,ward_nr,date_of_birth
0,4942,3962,HWWC,Matabeleland North,Hwange,6 Gondwa,Devils Claw,Moyo,Pios,male,,2014-07-07,,Secretariat Sch Vic Falls,Org,,False,NaT,6.0,NaT
1,5608,3692,HWWC,Matabeleland North,Hwange,6 Gondwa,Devils Claw,Ncube,Mwayani,male,79-124391X79,2015-03-14,,"Mironga School, Vic Falls",Org,,False,NaT,6.0,NaT
2,10630,1773,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Ncube,Engel,female,08-161126N39,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-11-11,False,2017-11-21,3.0,NaT
3,1763,1772,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Dube,Samson,Male,79-031011X79,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-06-19,False,NaT,3.0,1957-10-02
4,1766,1775,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Ngwenya,Joshua,Male,79-011290L79,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-06-19,False,NaT,3.0,1955-10-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,11146,1057,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Zaranyika,Vaina,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT
561,11147,21040,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Chibawana,Violet,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT
562,11148,2555,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Hofisi,Christine,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,,False,2021-01-29,21.0,NaT
563,11149,2066,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu Kola,Mudada,Marjorie,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT


In [12775]:
all_farmers['wc_info__WCOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 565 entries, 0 to 564
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        565 non-null    int64         
 1   wc_nr                     565 non-null    int64         
 2   wc_code                   565 non-null    object        
 3   production_unit           565 non-null    object        
 4   area                      565 non-null    object        
 5   ward_nr/name              563 non-null    object        
 6   species                   565 non-null    object        
 7   surname                   564 non-null    object        
 8   first_names               564 non-null    object        
 9   sex                       565 non-null    object        
 10  id_number                 472 non-null    object        
 11  date_contracted           558 non-null    datetime64[ns]
 12  reason_dropped_out    

In [12776]:
all_farmers['wc_info__WCOrg_1_df'][all_farmers['wc_info__WCOrg_1_df']['date_of_birth']>'2000-09-02']

Unnamed: 0,id,wc_nr,wc_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_contracted,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,date_dropped_out,ward_nr,date_of_birth


## Final merge of farmer dfs to concated_farmers

Verify shapes of dfs

In [12777]:
for name, df in all_farmers.items():
    print(name)
    print(df.shape)

OGInfo__OGMB_1_df
(8166, 19)
OG_Info__OGOrg_1_df
(1403, 20)
wc_info_bayoba_1_df
(10952, 20)
wc_info__WCOrg_1_df
(565, 20)
OG_Info_reserve__OGOrg_1_df
(795, 20)


In [12778]:
# ['ward_nr'] column still missing:

all_farmers['OGInfo__OGMB_1_df'][['ward_nr']] = all_farmers['OGInfo__OGMB_1_df'][['ward_nr/name']]
all_farmers['OGInfo__OGMB_1_df'].shape

(8166, 20)

In [12779]:
concated_farmers = pd.concat(all_farmers)

In [12780]:
concated_farmers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 21881 entries, ('OGInfo__OGMB_1_df', 0) to ('OG_Info_reserve__OGOrg_1_df', 794)
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        21881 non-null  int64         
 1   og_nr                     10364 non-null  float64       
 2   og_code                   10364 non-null  object        
 3   production_unit           21881 non-null  object        
 4   area                      21881 non-null  object        
 5   ward_nr/name              21461 non-null  object        
 6   species                   21881 non-null  object        
 7   surname                   21879 non-null  object        
 8   first_names               21877 non-null  object        
 9   sex                       21881 non-null  object        
 10  id_number                 19837 non-null  object        
 11  date_of_birth             

In [12781]:
concated_farmers['organic_status_from_date'] = pd.to_datetime(concated_farmers['organic_status_from_date'])

  concated_farmers['organic_status_from_date'] = pd.to_datetime(concated_farmers['organic_status_from_date'])
  concated_farmers['organic_status_from_date'] = pd.to_datetime(concated_farmers['organic_status_from_date'])


In [12782]:
concated_farmers[concated_farmers['date_contracted'].dt.year.astype('Int64')>2023]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,145,1517,13107.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Mutape,Sophia,female,...,2174-12-02,NaT,,"Kasawo sch, Box 346, Mbire D",Mabagrown,2017-12-02,False,6.0,,


In [12783]:
concated_farmers.replace('2174-12-02', '2021-07-06', inplace=True)
concated_farmers.replace('2147-12-06', '2021-07-06', inplace=True)

In [12784]:
concated_farmers[concated_farmers['og_nr']==13190.0]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,239,1414,13190.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Gurupira,Vengayi,male,...,2017-12-06,2020-03-19,Own choice,"Nyarutomno sch, Box 500, Mbire D",Dropout,2017-12-06,False,6.0,,


In [12785]:
concated_farmers

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,0,1545,13137.0,MBOGR,Mashonaland Central,Mbire,15,Rosella,Nyamayaro,Phillip,male,...,NaT,2020-02-05,Own choice,"Mahuwe sch, Box 92, Mbire D",Dropout,2017-12-02,False,15.0,,
OGInfo__OGMB_1_df,1,12026,12116.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Funda,Jennie,female,...,2022-04-02,NaT,,"Kasuwo Primary School 346, Mbire",Mabagrown,2022-04-02,False,6.0,,
OGInfo__OGMB_1_df,2,9914,23949.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Chingwena,Gladys,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23,False,13.0,,
OGInfo__OGMB_1_df,3,9915,23950.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Marumbe,Tariro,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23,False,13.0,,
OGInfo__OGMB_1_df,4,9916,23951.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Dambudzo,Ketai,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23,False,13.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OG_Info_reserve__OGOrg_1_df,790,3549,20026.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Nhete,Stella,female,...,2019-01-02,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,791,3550,20027.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Musiiwa,Admire,male,...,2019-01-02,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,792,3551,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Masodzi,Norah,female,...,2019-01-02,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,793,1861,14001.0,BNOG,Matabeleland North,Binga,11 Lubu,Rosella,Mwembe,Mutale,female,...,2018-08-31,NaT,,Lubu primary School P Bag 5729 Binga,Org,2018-08-31,False,11.0,,


# Clean and analyze concated_farmers
1. Drop duplicates (+ wc_nr and og_nr)
2. Adding more birthdates from all_insp_15_23
2. Count number of records by category
3. Clean column by column

In [12786]:
# Looking for Duplicated ROWS
concated_farmers.duplicated().value_counts()

# 133 duplicates

False    21748
True       133
dtype: int64

In [12787]:
# All duplicates in OG_Info_reserve__OGOrg_1_df
df_dup = concated_farmers[concated_farmers.duplicated()].sort_values('id')

ls_dup_og_nr= df_dup['og_nr']
for ognr in ls_dup_og_nr:
    print(concated_farmers[concated_farmers['og_nr'] == ognr])

                                id   og_nr og_code     production_unit   area  \
OG_Info__OGOrg_1_df         91  67  2968.0    BNOG  Matabeleland North  Binga   
OG_Info_reserve__OGOrg_1_df 62  67  2968.0    BNOG  Matabeleland North  Binga   

                               ward_nr/name  species  surname first_names  \
OG_Info__OGOrg_1_df         91   5 Sinakoma  Rosella  Mugande    Georgina   
OG_Info_reserve__OGOrg_1_df 62   5 Sinakoma  Rosella  Mugande    Georgina   

                                   sex  ... date_contracted date_dropped_out  \
OG_Info__OGOrg_1_df         91  female  ...      2013-07-18       2018-09-24   
OG_Info_reserve__OGOrg_1_df 62  female  ...      2013-07-18       2018-09-24   

                               reason_dropped_out  \
OG_Info__OGOrg_1_df         91         Own choice   
OG_Info_reserve__OGOrg_1_df 62         Own choice   

                                                           address  \
OG_Info__OGOrg_1_df         91  Nsenga Primary School

In [12788]:
# Droping duplicates

concated_farmers.drop_duplicates(subset=None, keep='first', inplace=True)
concated_farmers.duplicated().value_counts()

False    21748
dtype: int64

WC is unique

In [12789]:
# 11517 wc_nr
wc_nr_df = concated_farmers[~concated_farmers['wc_nr'].isna()][['wc_nr']]
wc_nr_df.squeeze().is_unique

True

In [12790]:
# 21868 rows have og_nr or wc_nr!
concated_farmers[~concated_farmers['og_nr'].isna() | ~concated_farmers['wc_nr'].isna()].shape

(21748, 22)

In [12791]:
wc_df = concated_farmers[~concated_farmers['wc_nr'].isna()]
wc_df.shape

(11517, 22)

OG is not unique -> drop duplicates

In [12792]:
# 10351 
og_nr_df = concated_farmers[~concated_farmers['og_nr'].isna()][['og_nr', 'date_contracted']]
og_nr_df.shape

(10231, 2)

In [12793]:
# og_nr NOT unique
og_nr_df['og_nr'].squeeze().is_unique

False

In [12794]:
concated_farmers[~concated_farmers['og_nr'].isna()]['og_nr'].duplicated(keep='first').value_counts()

False    9093
True     1138
Name: og_nr, dtype: int64

In [12795]:
concated_farmers['og_nr'].duplicated(keep='first').value_counts()

True     12654
False     9094
Name: og_nr, dtype: int64

In [12796]:
concated_farmers[~concated_farmers['og_nr'].isna()].shape

(10231, 22)

In [12797]:
concated_farmers.sort_values('date_contracted', ascending=False, inplace=True)

In [12798]:
concated_farmers[concated_farmers['og_nr'] == 20028.0][['og_nr', 'date_contracted']]

Unnamed: 0,Unnamed: 1,og_nr,date_contracted
OGInfo__OGMB_1_df,5506,20028.0,2019-02-01
OG_Info__OGOrg_1_df,958,20028.0,2019-02-01
OG_Info_reserve__OGOrg_1_df,792,20028.0,2019-01-02


In [12799]:
df = concated_farmers.copy()
df.shape

(21748, 22)

In [12800]:
df = df[~df['og_nr'].isna()]
df.shape

(10231, 22)

In [12801]:
df = df[~df['og_nr'].isna()].drop_duplicates('og_nr', keep='first')
df.shape

(9093, 22)

In [12802]:
21868-(10351-9093)

20610

In [12803]:
df[df['og_nr'] == 20028.0]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,5506,5614,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Masodzi,Norah,female,...,2019-02-01,2020-05-07,Area closed,"ARDA Mushumbi, Box 210, Mbire",Dropout,2019-02-01,False,3.0,,


In [12804]:
og_not_na_df = df[~df['og_nr'].isna()]
og_not_na_df['og_nr'].squeeze().is_unique

True

In [12805]:
wc_og_df = pd.concat([df, wc_df])
wc_og_df.shape

(20610, 22)

In [12806]:
concated_farmers = wc_og_df.copy()
concated_farmers[concated_farmers['og_nr'] == 20028.0][['og_nr', 'date_contracted']]

Unnamed: 0,Unnamed: 1,og_nr,date_contracted
OGInfo__OGMB_1_df,5506,20028.0,2019-02-01


In [12807]:
concated_farmers['og_nr'].duplicated().value_counts()

True     11516
False     9094
Name: og_nr, dtype: int64

In [12808]:
# Sorting back with index
concated_farmers.sort_index()

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,0,1545,13137.0,MBOGR,Mashonaland Central,Mbire,15,Rosella,Nyamayaro,Phillip,male,...,NaT,2020-02-05,Own choice,"Mahuwe sch, Box 92, Mbire D",Dropout,2017-12-02,False,15.0,,
OGInfo__OGMB_1_df,1,12026,12116.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Funda,Jennie,female,...,2022-04-02,NaT,,"Kasuwo Primary School 346, Mbire",Mabagrown,2022-04-02,False,6.0,,
OGInfo__OGMB_1_df,2,9914,23949.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Chingwena,Gladys,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23,False,13.0,,
OGInfo__OGMB_1_df,3,9915,23950.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Marumbe,Tariro,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23,False,13.0,,
OGInfo__OGMB_1_df,4,9916,23951.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Dambudzo,Ketai,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23,False,13.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wc_info_bayoba_1_df,10947,1131,,,South Eastern region,Chimanimani,5,Baobab,Mugebe,Tradder,male,...,2012-06-26,NaT,,Chibuwe,Org,NaT,False,5.0,21141.0,CMWC
wc_info_bayoba_1_df,10948,1134,,,South Eastern region,Chimanimani,5,Baobab,Chibuwe,Silvia,female,...,2012-06-26,NaT,,Chibuwe,Org,NaT,False,5.0,21144.0,CMWC
wc_info_bayoba_1_df,10949,32782,,,Northern Eastern region,Mt Darwin,,Baobab + ximenia,Nyatito,Lucia,female,...,2022-08-30,NaT,,"Gungwe Primary School Bag 338, Mt Darwin",New,2022-08-30,False,,16499.0,MTWC
wc_info_bayoba_1_df,10950,32783,,,Northern Eastern region,Mt Darwin,,Baobab + ximenia,Kanocheringa,Abgal,female,...,2022-08-30,NaT,,"Gungwe Primary School Bag 338, Mt Darwin",New,2022-08-30,False,,16500.0,MTWC


-----

In [12809]:
# 11517 wc_nr
concated_farmers[~concated_farmers['wc_nr'].isna()]['wc_nr']

wc_info_bayoba_1_df  0      21146.0
                     1      21833.0
                     2      21147.0
                     3      22986.0
                     4      21148.0
                             ...   
wc_info__WCOrg_1_df  560     1057.0
                     561    21040.0
                     562     2555.0
                     563     2066.0
                     564     1069.0
Name: wc_nr, Length: 11517, dtype: float64

#### Adding more birthdates from all_insp_15_23

In [12810]:
all_insp_15_23 = sf.get_dataframe(f'SELECT * FROM {schema}."all_insp_15_23"')
all_insp = all_insp_15_23[~all_insp_15_23['date_of_birth'].isna()][['og_nr','wc_nr','date_of_birth']]

all_insp['date_of_birth']=pd.to_datetime(all_insp['date_of_birth'])

# 4453 Birthdates from all_insp table
all_insp.shape

(4453, 3)

In [12811]:
all_insp['og_nr'].is_unique

False

In [12812]:
all_insp['wc_nr'].is_unique

False

In [12813]:
all_insp = all_insp.reset_index()

In [12814]:
all_insp = all_insp.drop('index', axis=1)
all_insp

Unnamed: 0,og_nr,wc_nr,date_of_birth
0,3463.0,,1971-06-30
1,3671.0,,1971-10-31
2,15984.0,,1978-06-21
3,13632.0,,1965-10-05
4,13627.0,,1979-02-07
...,...,...,...
4448,,44159.0,1987-06-18
4449,,44173.0,1974-03-18
4450,,44186.0,1973-11-01
4451,,44733.0,1978-08-22


In [12815]:
all_insp[(~all_insp['og_nr'].isna()) & (~all_insp['wc_nr'].isna())]

Unnamed: 0,og_nr,wc_nr,date_of_birth


In [12816]:
all_insp[(~all_insp['og_nr'].isna())].shape

(4205, 3)

In [12817]:
all_insp[(~all_insp['wc_nr'].isna())].shape

(247, 3)

In [12818]:
# taking OUT OG_nr and WC_nr NAs

og = all_insp[~all_insp['og_nr'].isna()]['og_nr']
og.shape
# 4205 rows left of og

(4205,)

In [12819]:
og_nr = og.unique()
og_nr

array([ 3463.,  3671., 15984., ..., 13966., 16191., 16088.])

In [12820]:
# All_insp table there are total 2247 unique og_nrs 
all_insp[~all_insp['og_nr'].isna()]['og_nr'].unique().shape

(2247,)

In [12821]:
# 1958 Duplicates to drop

# .duplicated() marks duplicates as true except for the first occurrence

og[og.duplicated()].shape

(1958,)

In [12822]:
# 638 UNIQUE OG_nr s
og[og.duplicated()].unique().shape

(638,)

In [12823]:
# Sorting 2596 data by OG_nr
dup_og_df = all_insp[all_insp['og_nr'].isin(og[og.duplicated()])].sort_values('og_nr')
dup_og_df

Unnamed: 0,og_nr,wc_nr,date_of_birth
2772,2770.0,,1958-07-17
1326,2770.0,,1960-06-06
1340,2770.0,,1960-06-06
4307,2770.0,,1958-07-17
3944,2770.0,,1958-07-17
...,...,...,...
1856,20375.0,,1966-01-04
1781,21501.0,,1984-07-07
1044,21501.0,,1986-08-20
2243,21519.0,,1982-05-01


In [12824]:
# taking OUT OG_nr and WC_nr NAs

wc = all_insp[~all_insp['wc_nr'].isna()]['wc_nr']
wc.shape
# 247 rows left of wc

(247,)

In [12825]:
wc_nr = wc.unique()

In [12826]:
# All_insp table there are total 209 unique wc_nrs 
all_insp[~all_insp['wc_nr'].isna()]['wc_nr'].unique().shape

(209,)

In [12827]:
# Only 38 WC_nr s are duplicates of 36 unique wc_nrs but in all_insp table there are total 209 unique wc_nrs 
wc[wc.duplicated()].shape

(38,)

In [12828]:
# 36 UNIQUE wc_nr s
wc[wc.duplicated()].unique().shape

(36,)

In [12829]:
# sorting 247 data by WC_nr
dup_wc_df = all_insp[all_insp['wc_nr'].isin(wc[wc.duplicated()])].sort_values('wc_nr')
dup_wc_df

Unnamed: 0,og_nr,wc_nr,date_of_birth
3714,,1772.0,1957-11-02
3859,,1772.0,1957-10-02
3720,,2936.0,1955-10-05
4442,,2936.0,1970-05-09
4443,,3674.0,1968-11-16
...,...,...,...
3746,,42461.0,1969-10-06
3874,,42462.0,1969-03-04
3747,,42462.0,1969-03-04
3879,,42876.0,1957-08-08


In [12830]:
 # taking from all duplicates of og_nr that birthdate with most occurance
 # AND setting it to the first index

 for og in og_nr:
    
    birthday = all_insp[all_insp['og_nr']==og]['date_of_birth'].value_counts().idxmax()

    ind = all_insp[all_insp['og_nr']==og].index.min()

    all_insp.date_of_birth.iat[ind] = birthday

In [12831]:
all_insp.shape

(4453, 3)

In [12832]:
clean_og = all_insp[~all_insp['og_nr'].isna()].drop_duplicates('og_nr', keep='first')
clean_og

Unnamed: 0,og_nr,wc_nr,date_of_birth
0,3463.0,,1971-06-13
1,3671.0,,1971-10-31
2,15984.0,,1978-06-21
3,13632.0,,1965-10-05
4,13627.0,,1979-02-07
...,...,...,...
4434,16203.0,,1983-01-01
4435,16151.0,,1974-01-01
4437,13966.0,,1960-09-06
4439,16191.0,,1956-08-08


In [12833]:
 # taking from all duplicates of wc_nr that birthdate with most occurance
 # AND setting it to the first index

 for wc in wc_nr:
    birthday = all_insp[all_insp['wc_nr']==wc]['date_of_birth'].value_counts().idxmax()
   
    ind = all_insp[all_insp['wc_nr']==wc].index.min()

    all_insp.date_of_birth.iat[ind] = birthday

In [12834]:
clean_wc = all_insp[~all_insp['wc_nr'].isna()].drop_duplicates('wc_nr', keep='first')
clean_wc

Unnamed: 0,og_nr,wc_nr,date_of_birth
683,,21354.0,1968-12-31
742,,20854.0,1961-10-27
754,,20856.0,1981-12-17
770,,20868.0,1972-01-03
862,,21146.0,1968-09-12
...,...,...,...
4448,,44159.0,1987-06-18
4449,,44173.0,1974-03-18
4450,,44186.0,1973-11-01
4451,,44733.0,1978-08-22


In [12835]:
clean_wc.wc_nr

683     21354.0
742     20854.0
754     20856.0
770     20868.0
862     21146.0
         ...   
4448    44159.0
4449    44173.0
4450    44186.0
4451    44733.0
4452    44745.0
Name: wc_nr, Length: 209, dtype: float64

In [12836]:
concated_farmers = concated_farmers.reset_index()

In [12837]:
concated_farmers.rename(columns={"level_0": "org_table", "level_1": "org_table_ind"}, inplace=True)

In [12838]:
# 18101 Farmers WITHOUT date_of_birth
concated_farmers[concated_farmers['date_of_birth'].isna()].shape

(18101, 24)

In [12839]:
# 2509 Farmers WITH date_of_birth = 1517 OG + 992 WC
concated_farmers[~concated_farmers['date_of_birth'].isna()].shape

(2509, 24)

In [12840]:
concated_farmers[~concated_farmers['date_of_birth'].isna() & concated_farmers['og_nr'].isna()].shape

(992, 24)

In [12841]:
# 9 WCs from insp have already birthdates in farmer table -> from 209 birthdates of insp table 200 can be filled in farmer table
con_wc = concated_farmers[(~concated_farmers['date_of_birth'].isna()) & (concated_farmers['wc_nr'].isin(clean_wc.wc_nr))][['wc_nr','date_of_birth']]

# FROM 9 already existing birthdates 2 birthdates are DIFFERENT for equal og_nr in both tables 2930.0 and 2923.0
set(con_wc.sort_values('wc_nr').date_of_birth) - set(clean_wc[clean_wc.wc_nr.isin(con_wc.wc_nr)].sort_values('wc_nr').date_of_birth)

# ignore it, cause its ratio is low 2/3763 < 0,05%

{Timestamp('1952-01-01 00:00:00'),
 Timestamp('1957-10-02 00:00:00'),
 Timestamp('1964-12-05 00:00:00'),
 Timestamp('1970-05-09 00:00:00')}

In [12842]:
136/3754

0.036228023441662226

In [12843]:
concated_farmers['og_nr'].is_unique

False

In [12844]:
2247-1202

1045

In [12845]:
# 1202 OGs from insp have already birthdates in farmer table -> from 2247 birthdates of insp table only 1045 can be filled in farmer table
con_og = concated_farmers[(~concated_farmers['date_of_birth'].isna()) & (concated_farmers['og_nr'].isin(clean_og.og_nr))][['og_nr','date_of_birth']]

# FROM 1202 already existing birthdates 136 birthdates are DIFFERENT for equal og_nr in both tables
same_ogs = set(con_og.sort_values('og_nr').date_of_birth) - set(clean_og[clean_og.og_nr.isin(con_og.og_nr)].sort_values('og_nr').date_of_birth)
len(same_ogs)

# ignoring it even its ratio is 148/3763 < 3,6% of total, cause cant find out which table is right

136

In [12846]:
# 1029 birthdates to fill out from clean_og - 1045
concated_farmers[(concated_farmers['date_of_birth'].isna()) & (concated_farmers['og_nr'].isin(clean_og.og_nr))][['og_nr','date_of_birth']]

Unnamed: 0,og_nr,date_of_birth
38,20245.0,NaT
162,20202.0,NaT
165,20204.0,NaT
213,20398.0,NaT
345,20120.0,NaT
...,...,...
9084,11548.0,NaT
9085,11549.0,NaT
9086,11547.0,NaT
9087,11543.0,NaT


In [12847]:
# 169 birthdates to fill out from clean_wc - 200
concated_farmers[(concated_farmers['date_of_birth'].isna()) & (concated_farmers['wc_nr'].isin(clean_wc.wc_nr))][['wc_nr','date_of_birth']].shape

(169, 2)

In [12848]:
# 1214 birthdates to add
1045+169

1214

In [12849]:
# 2509 existing birthdates
concated_farmers[~concated_farmers['date_of_birth'].isna()].shape

(2509, 24)

In [12850]:
ind = concated_farmers[(concated_farmers['date_of_birth'].isna()) & (concated_farmers['wc_nr'].isin(clean_wc.wc_nr))].index

wc = concated_farmers[(concated_farmers['date_of_birth'].isna()) & (concated_farmers['wc_nr'].isin(clean_wc.wc_nr))]
wc = wc.wc_nr.to_list()

for i in range(len(ind)):
    birthday = clean_wc[clean_wc['wc_nr'] == wc[i]]['date_of_birth'].to_list()[0]
    concated_farmers.date_of_birth.iat[ind[i]] = birthday

In [12851]:
ind = concated_farmers[(concated_farmers['date_of_birth'].isna()) & (concated_farmers['og_nr'].isin(clean_og.og_nr))].index

og = concated_farmers[(concated_farmers['date_of_birth'].isna()) & (concated_farmers['og_nr'].isin(clean_og.og_nr))]
og = og.og_nr.to_list()

for i in range(len(ind)):
    birthday = clean_og[clean_og['og_nr'] == og[i]]['date_of_birth'].to_list()[0]
    concated_farmers.date_of_birth.iat[ind[i]] = birthday

In [12852]:
# 1198 new added birthdates from all_insp
concated_farmers[~concated_farmers['date_of_birth'].isna()].shape

(3707, 24)

In [12853]:
3707-2509

1198

In [12854]:
concated_farmers.set_index(['org_table','org_table_ind'])

Unnamed: 0_level_0,Unnamed: 1_level_0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
org_table,org_table_ind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
OG_Info__OGOrg_1_df,1052,3886,20183.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Muleya,Adolia,female,...,2023-02-24,NaT,,"Nsenga Primary School Box 46, Binga",New,2023-02-24,True,5.0,,
OG_Info__OGOrg_1_df,1151,3993,20290.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Chuma,Sophia,female,...,2023-02-24,NaT,,,New,2023-02-24,True,5.0,,
OG_Info__OGOrg_1_df,1131,3957,20254.0,BNOG,Matabeleland North,Binga,5 Sinakoma,5 Sinakoma,Mukucha,Joyce,female,...,2023-02-24,NaT,,"Nsungwale Primary School, Binga",New,2023-02-24,True,5.0,,
OG_Info__OGOrg_1_df,1132,3958,20255.0,BNOG,Matabeleland North,Binga,5 Sinakoma,5 Sinakoma,Mweembe,Leziya,female,...,2023-02-24,NaT,,"Nsungwale Primary School, Binga",New,2023-02-24,True,5.0,,
OG_Info__OGOrg_1_df,1133,3959,20256.0,BNOG,Matabeleland North,Binga,5 Sinakoma,5 Sinakoma,Muchindu,Sophia,female,...,2023-02-24,NaT,,"Nsungwale Primary School, Binga",New,2023-02-24,True,5.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wc_info__WCOrg_1_df,560,11146,,,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Zaranyika,Vaina,female,...,NaT,2021-01-29,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,21.0,1057.0,CMWC
wc_info__WCOrg_1_df,561,11147,,,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Chibawana,Violet,female,...,NaT,2021-01-29,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,21.0,21040.0,CMWC
wc_info__WCOrg_1_df,562,11148,,,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Hofisi,Christine,female,...,NaT,2021-01-29,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,NaT,False,21.0,2555.0,CMWC
wc_info__WCOrg_1_df,563,11149,,,Manicaland,Chimanimani Rusitu,21,Wild Gotu Kola,Mudada,Marjorie,female,...,NaT,2021-01-29,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,21.0,2066.0,CMWC


### Adding column contract duration - about 9289 durations to calculate

In [12855]:
# 9289 row with date_contracted and date_dropped_out
concated_farmers[~concated_farmers['date_contracted'].isna() & ~concated_farmers['date_dropped_out'].isna()][['date_contracted','date_dropped_out']]

Unnamed: 0,date_contracted,date_dropped_out
613,2022-06-02,2022-06-02
766,2022-02-14,2022-05-04
775,2022-02-07,2022-05-04
777,2022-02-07,2022-05-04
779,2022-01-24,2022-07-18
...,...,...
20598,2018-10-20,2021-01-29
20599,2018-10-20,2021-01-29
20600,2018-10-20,2021-01-29
20601,2018-10-20,2021-01-29


In [12856]:
concated_farmers['contract_duration'] = concated_farmers['date_dropped_out'] - concated_farmers['date_contracted']
concated_farmers[~concated_farmers['contract_duration'].isna()][['og_nr','wc_nr','date_contracted','date_dropped_out','contract_duration','organic_status_from_date']]

Unnamed: 0,og_nr,wc_nr,date_contracted,date_dropped_out,contract_duration,organic_status_from_date
613,26419.0,,2022-06-02,2022-06-02,0 days,2022-06-02
766,11848.0,,2022-02-14,2022-05-04,79 days,2022-02-14
775,11836.0,,2022-02-07,2022-05-04,86 days,2022-03-07
777,11833.0,,2022-02-07,2022-05-04,86 days,2022-02-07
779,11825.0,,2022-01-24,2022-07-18,175 days,2022-01-24
...,...,...,...,...,...,...
20598,,4261.0,2018-10-20,2021-01-29,832 days,2018-10-20
20599,,4262.0,2018-10-20,2021-01-29,832 days,2018-10-20
20600,,4263.0,2018-10-20,2021-01-29,832 days,2018-10-20
20601,,4264.0,2018-10-20,2021-01-29,832 days,2018-10-20


In [12857]:
concated_farmers['contract_duration'].mean()

Timedelta('627 days 06:18:17.201894104')

### Adding column age at date_contracted - about 3707 ages

In [12858]:
concated_farmers['age_at_date_contracted'] =  2023 - pd.DatetimeIndex(concated_farmers['date_of_birth']).year
concated_farmers[~concated_farmers['age_at_date_contracted'].isna()][['og_nr','wc_nr','age_at_date_contracted','date_of_birth','date_contracted']]

Unnamed: 0,og_nr,wc_nr,age_at_date_contracted,date_of_birth,date_contracted
1,20290.0,,66.0,1957-07-10,2023-02-24
2,20254.0,,43.0,1980-03-03,2023-02-24
6,20258.0,,27.0,1996-07-01,2023-02-24
7,20259.0,,26.0,1997-10-08,2023-02-24
8,20260.0,,53.0,1970-12-23,2023-02-24
...,...,...,...,...,...
20462,,2936.0,53.0,1970-05-09,2013-07-13
20464,,4143.0,60.0,1963-02-02,2018-09-18
20465,,4144.0,51.0,1972-09-01,2017-07-01
20466,,4145.0,61.0,1962-04-04,2017-07-01


### Further Cleaning

In [12859]:
concated_farmers['organic_status'].value_counts()

Org             9532
Dropout         6951
New             2697
Mabagrown       1047
Reinstated       190
Uncertified      149
dropout           33
Under review      10
Name: organic_status, dtype: int64

In [12860]:
concated_farmers[concated_farmers['id'].isna()]

Unnamed: 0,org_table,org_table_ind,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,...,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code,contract_duration,age_at_date_contracted


In [12861]:
df = concated_farmers[~concated_farmers['id_number'].isna()][['id_number']]
df.squeeze().is_unique

False

In [12862]:
concated_farmers['area'] = concated_farmers['area'].str.title()

In [12863]:
concated_farmers['area'].replace('Mt Darwin ','Mt Darwin', inplace=True)
concated_farmers['area'].replace('Beitbridge ','Beitbridge', inplace=True)
concated_farmers['area'].replace('Hwange ','Hwange', inplace=True)
concated_farmers['area'].replace('Makoni Mt Shalom','Makoni', inplace=True)
concated_farmers['area'].replace('Chimanimani Rusitu','Chimanimani', inplace=True)
concated_farmers['area'].replace('Chimanimani Tilbury','Chimanimani', inplace=True)

In [12864]:
concated_farmers['area'].value_counts()

Mbire                   7683
Binga                   2197
Rushinga                1750
Chimanimani             1660
Mudzi                   1590
Buhera                  1152
Chipinge                 977
Mt Darwin                968
Beitbridge               613
Makoni                   297
Hwange                   289
Kwekwe                   270
Uzumbamarambapfungwe     257
Mwenezi                  173
Karoi                    163
Triangle                 163
Mutoko                   126
Mberengwa                115
Chivi                     94
Matobo                    44
Mushumbi                  29
Name: area, dtype: int64

In [12865]:
concated_farmers['area'].unique()
# Correct 'chipinge' first upper

array(['Binga', 'Mbire', 'Mushumbi', 'Triangle', 'Karoi', 'Makoni',
       'Chimanimani', 'Chipinge', 'Chivi', 'Buhera', 'Mt Darwin',
       'Beitbridge', 'Mudzi', 'Rushinga', 'Kwekwe', 'Mwenezi',
       'Uzumbamarambapfungwe', 'Mutoko', 'Mberengwa', 'Matobo', 'Hwange'],
      dtype=object)

In [12866]:
concated_farmers['production_unit'] = concated_farmers['production_unit'].replace('Mashonaland Centra','Mashonaland Central')

In [12867]:
concated_farmers['production_unit'].value_counts()

Mashonaland Central        7711
Northern Eastern region    4691
South Eastern region       3498
Matabeleland North         2487
Southern region             995
Manicaland                  588
Midlands                    270
Masvingo                    163
Mashonaland West            163
Matabeleland South           44
Name: production_unit, dtype: int64

In [12868]:
concated_farmers['ward_nr'].unique()

# correct to int

array([ 5., 11.,  7., 15.,  2.,  3., 16., 10.,  8.,  9.,  6., 17., 12.,
        4., 13., 14., 27., 19., 21., 18., 23.,  1., 25., 20., 29., 33.,
       24., 28., 30., 31., 34., 32., nan, 22.])

In [12869]:
# 'id_number has not unique and has 3254 duplicates'
#concated_farmers['id_number'].is_unique
concated_farmers['id_number'].duplicated().value_counts()

False    18206
True      2404
Name: id_number, dtype: int64

#### Setting values of species, sex, reason_droppen_out and organic_status to lower case

In [12870]:
concated_farmers['species'] = concated_farmers['species'].str.lower()
concated_farmers['sex'] = concated_farmers['sex'].str.lower()

concated_farmers['reason_dropped_out'] = concated_farmers['reason_dropped_out'].str.lower()
concated_farmers['organic_status'] = concated_farmers['organic_status'].str.lower()

In [12871]:
concated_farmers['organic_status'].unique()

array(['new', 'mabagrown', 'dropout', 'org', 'under review', 'reinstated',
       'uncertified', None], dtype=object)

In [12872]:
concated_farmers['species'].unique()

# correct to types rosella, chili, paprika,  

# '5 sinakoma', 'robert' correct to None

array(['rosella', '5 sinakoma', 'rosell', 'ronald', 'roseela',
       'rosella, chilli, paprika', 'paprika', 'chillie',
       'chillie/paprica ', 'chilli', 'chili', 'paprika, chilli',
       'chilli, paprika', 'chilli,paprika', 'rosella, strophantus',
       'baobab', 'marula', 'baobab + ximenia', 'trichillia',
       'baobab+kms+ximenia', 'kalahari melon seed', 'ximenia caffra',
       'kms + ximenia', 'ximenia americana', 'devils claw',
       'wild gotu kola'], dtype=object)

In [12873]:
concated_farmers['species'] = concated_farmers['species'].replace({'5 sinakoma': 'None', 'ronald':  'None','roseela': 'rosella', 'rosell': 'rosella', 
                                                                   'paprika, chilli': 'chilli, paprika', 'chili': 'chilli',
                                                                    'chilli,paprika':'chilli, paprika', 'chillie': 'chilli', 
                                                                    'chillie/paprica ': 'chilli, paprika',
                                                                    'baobab + ximenia': 'baobab, ximenia', 'baobab+kms+ximenia': 'baobab, kms, ximenia',
                                                                    'kalahari melon seed': 'kalahari, melon, seed', 'kms + ximenia': 'kms, ximenia'
                                                                    })

In [12874]:
concated_farmers['wc_nr'].duplicated().value_counts()

False    11518
True      9092
Name: wc_nr, dtype: int64

In [12875]:
# 11517 wc_nr
concated_farmers[~concated_farmers['wc_nr'].isna()]['wc_nr']

9093     21146.0
9094     21833.0
9095     21147.0
9096     22986.0
9097     21148.0
          ...   
20605     1057.0
20606    21040.0
20607     2555.0
20608     2066.0
20609     1069.0
Name: wc_nr, Length: 11517, dtype: float64

In [12876]:
concated_farmers['ward_nr'].duplicated().value_counts()

True     20576
False       34
Name: ward_nr, dtype: int64

In [12877]:
concated_farmers['ward_nr'].is_unique

False

In [12878]:
concated_farmers['ward_nr'].unique()

array([ 5., 11.,  7., 15.,  2.,  3., 16., 10.,  8.,  9.,  6., 17., 12.,
        4., 13., 14., 27., 19., 21., 18., 23.,  1., 25., 20., 29., 33.,
       24., 28., 30., 31., 34., 32., nan, 22.])

In [12879]:
concated_farmers.iloc[0]

org_table                                   OG_Info__OGOrg_1_df
org_table_ind                                              1052
id                                                         3886
og_nr                                                   20183.0
og_code                                                    BNOG
production_unit                              Matabeleland North
area                                                      Binga
ward_nr/name                                         5 Sinakoma
species                                                 rosella
surname                                                  Muleya
first_names                                              Adolia
sex                                                      female
id_number                                          06-093949G06
date_of_birth                                               NaT
date_contracted                             2023-02-24 00:00:00
date_dropped_out                        

In [12880]:
concated_farmers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20610 entries, 0 to 20609
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype          
---  ------                    --------------  -----          
 0   org_table                 20610 non-null  object         
 1   org_table_ind             20610 non-null  int64          
 2   id                        20610 non-null  int64          
 3   og_nr                     9093 non-null   float64        
 4   og_code                   9093 non-null   object         
 5   production_unit           20610 non-null  object         
 6   area                      20610 non-null  object         
 7   ward_nr/name              20191 non-null  object         
 8   species                   20610 non-null  object         
 9   surname                   20608 non-null  object         
 10  first_names               20606 non-null  object         
 11  sex                       20610 non-null  object         
 12  id_n

In [12881]:
concated_farmers.head()

Unnamed: 0,org_table,org_table_ind,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,...,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code,contract_duration,age_at_date_contracted
0,OG_Info__OGOrg_1_df,1052,3886,20183.0,BNOG,Matabeleland North,Binga,5 Sinakoma,rosella,Muleya,...,,"Nsenga Primary School Box 46, Binga",new,2023-02-24,True,5.0,,,NaT,
1,OG_Info__OGOrg_1_df,1151,3993,20290.0,BNOG,Matabeleland North,Binga,5 Sinakoma,rosella,Chuma,...,,,new,2023-02-24,True,5.0,,,NaT,66.0
2,OG_Info__OGOrg_1_df,1131,3957,20254.0,BNOG,Matabeleland North,Binga,5 Sinakoma,,Mukucha,...,,"Nsungwale Primary School, Binga",new,2023-02-24,True,5.0,,,NaT,43.0
3,OG_Info__OGOrg_1_df,1132,3958,20255.0,BNOG,Matabeleland North,Binga,5 Sinakoma,,Mweembe,...,,"Nsungwale Primary School, Binga",new,2023-02-24,True,5.0,,,NaT,
4,OG_Info__OGOrg_1_df,1133,3959,20256.0,BNOG,Matabeleland North,Binga,5 Sinakoma,,Muchindu,...,,"Nsungwale Primary School, Binga",new,2023-02-24,True,5.0,,,NaT,


#### Upload the prepared farmer data

In [2975]:
table_name = 'all_farmers'

if engine!=None:
    try:
        concated_farmers.to_sql(name=table_name, # Name of SQL table
                                con=engine, # Engine or connection
                                if_exists='replace', # Drop the table before inserting new values 
                                schema=schema, # Use schmea that was defined earlier
                                index=False, # Write DataFrame index as a column
                                chunksize=5000, # Specify the number of rows in each batch to be written at a time
                                method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

  concated_farmers.to_sql(name=table_name, # Name of SQL table


The all_farmers table was imported successfully.
