In [2850]:
import pandas as pd
import sql_functions as sf

In [2851]:
schema = 'organic_africa' # UPDATE 'TABLE_SCHEMA' based on schema used in class 
engine = sf.get_engine()

**Importing tables of farmer data as Dataframes**

5 Dfs stored in one Dictionary 

In [2852]:
all_farmers = {}

all_farmers['OGInfo__OGMB_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OGInfo__OGMB_1"')
all_farmers['OG_Info__OGOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OG_Info__OGOrg_1"')
all_farmers['wc_info_bayoba_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_info_bayoba_1"')
all_farmers['wc_info__WCOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."wc_info__WCOrg_1"')
all_farmers['OG_Info_reserve__OGOrg_1_df'] = sf.get_dataframe(f'SELECT * FROM {schema}."OG_Info_reserve__OGOrg_1"')


## Preparing imported farmer dataframes

**Preparing farmer dfs to concat them to one big df**

1. Reduce all farmer df to the necessary ~20 columns 
2. Set all column names to lowercase
3. Renaming equal columns with equal column names
4. Checking and converting data types

In [2853]:
# Reduce all farmer df to the necessary ~20 columns
all_farmers['OGInfo__OGMB_1_df'] = all_farmers['OGInfo__OGMB_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                   'Species', 'Surname', 'First_names', 'Sex', 'ID_Number',
                                                                   'Date_of_Birth', 'date_contracted', 'Date_dropped_out', 'Reason_dropped_out',
                                                                   'Address', 'OG_Organic_Status', 'OG_Organic_Status_from_Date', 'Fairtrade']]

all_farmers['OG_Info__OGOrg_1_df'] = all_farmers['OG_Info__OGOrg_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number', 'Date_of_Birth',
                                                                       'Date_contracted', 'Date_dropped_out', 'Reason_dropped_out', 
                                                                       'OG_Organic_Status', 'Address', 'OG_Organic_Status_from_Date',
                                                                       'Fairtrade','ward_nr']]

all_farmers['wc_info_bayoba_1_df'] = all_farmers['wc_info_bayoba_1_df'][['ID', 'WC_Nr', 'WC_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number', 
                                                                       'Date_contracted', 'Date_dropped_out',
                                                                       'Reason_dropped_out', 'Address', 'Organic_Status', 
                                                                       'Organic_Status_from_Date', 'Fairtrade', 'Date_of_Birth', 'ward_nr']]

all_farmers['wc_info__WCOrg_1_df'] = all_farmers['wc_info__WCOrg_1_df'][['ID', 'WC_Nr', 'WC_code', 'Production_Unit', 'Area', 'Ward_Nr/Name',
                                                                       'Species', 'Surname', 'First_names', 'Sex', 'ID_Number',
                                                                       'Date_contracted', 'Reason_dropped_out', 'Address', 'Organic_Status', 
                                                                       'Organic_Status_from_Date', 'Fairtrade', 'Date_dropped_out', 'ward_nr']]

all_farmers['OG_Info_reserve__OGOrg_1_df'] = all_farmers['OG_Info_reserve__OGOrg_1_df'][['ID', 'OG_Nr', 'OG_code', 'Production_Unit', 'Area', 
                                                                                       'Ward_Nr/Name', 'Species', 'Surname', 'First_names',
                                                                                       'Sex', 'ID_Number', 'Date_of_Birth', 'Date_contracted',
                                                                                       'Date_dropped_out', 'Reason_dropped_out', 'Address',
                                                                                       'OG_Organic_Status', 'OG_Organic_Status_from_Date', 
                                                                                       'Fairtrade', 'ward_nr']]

In [2854]:
# Set all column names to lowercase

for df in all_farmers.values():
    #print(df)
    df.columns = map(str.lower, df.columns)

In [2855]:
all_farmers['OGInfo__OGMB_1_df'].columns

Index(['id', 'og_nr', 'og_code', 'production_unit', 'area', 'ward_nr/name',
       'species', 'surname', 'first_names', 'sex', 'id_number',
       'date_of_birth', 'date_contracted', 'date_dropped_out',
       'reason_dropped_out', 'address', 'og_organic_status',
       'og_organic_status_from_date', 'fairtrade'],
      dtype='object')

In [2856]:
# Renaming equal columns with equal column names
all_farmers['OGInfo__OGMB_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)
all_farmers['OG_Info__OGOrg_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)
all_farmers['OG_Info_reserve__OGOrg_1_df'].rename(columns={'og_organic_status':'organic_status', 'og_organic_status_from_date':'organic_status_from_date'}, inplace=True)

**4. Checking and converting data types**

.info() .dtype() type()

pd.to_datetime()

#### OGInfo__OGMB_1

In [2857]:
all_farmers['OGInfo__OGMB_1_df'].info()

# looks good!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8166 entries, 0 to 8165
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        8166 non-null   int64         
 1   og_nr                     8166 non-null   int64         
 2   og_code                   8166 non-null   object        
 3   production_unit           8166 non-null   object        
 4   area                      8166 non-null   object        
 5   ward_nr/name              8166 non-null   int64         
 6   species                   8166 non-null   object        
 7   surname                   8166 non-null   object        
 8   first_names               8166 non-null   object        
 9   sex                       8166 non-null   object        
 10  id_number                 7073 non-null   object        
 11  date_of_birth             657 non-null    datetime64[ns]
 12  date_contracted     

In [2858]:
type(all_farmers['OGInfo__OGMB_1_df']['date_contracted'][2])

pandas._libs.tslibs.timestamps.Timestamp

In [2859]:
# all_farmers['OGInfo__OGMB_1_df'][all_farmers['OGInfo__OGMB_1_df']['date_contracted']=='2147-12-06' | all_farmers['OGInfo__OGMB_1_df']['og_nr']=='']

In [2860]:
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'][0]

datetime.date(2023, 2, 24)

#### OG_Info__OGOrg_1_df

In [2861]:
all_farmers['OG_Info__OGOrg_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        1403 non-null   int64 
 1   og_nr                     1403 non-null   int64 
 2   og_code                   1403 non-null   object
 3   production_unit           1403 non-null   object
 4   area                      1403 non-null   object
 5   ward_nr/name              1403 non-null   object
 6   species                   1403 non-null   object
 7   surname                   1403 non-null   object
 8   first_names               1403 non-null   object
 9   sex                       1403 non-null   object
 10  id_number                 1310 non-null   object
 11  date_of_birth             930 non-null    object
 12  date_contracted           1403 non-null   object
 13  date_dropped_out          405 non-null    object
 14  reason_dropped_out      

In [2862]:
# converted date_of_birth
all_farmers['OG_Info__OGOrg_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_of_birth'])

Converting 'date_contracted' to datetime

In [2863]:
# all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

# Out of bounds nanosecond timestamp: 218-08-19 00:00:00 present at position 59

# all_farmers['OG_Info__OGOrg_1_df'].iloc[59,]
# all_farmers['OG_Info__OGOrg_1_df'].loc[all_farmers['OG_Info__OGOrg_1_df']['date_contracted']=='218-08-19']

# Did not found 218-08-19 at index 59 but the seems to get corrected in the next steps

In [2864]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[59]['date_contracted']

datetime.date(2012, 12, 3)

In [2865]:
all_farmers['OG_Info__OGOrg_1_df'].replace('218-08-19', '2018-08-19', inplace=True)


In [2866]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[50:60,12]

50    2023-02-24
51    2012-12-06
52    2012-12-06
53    2012-12-06
54    2012-12-06
55    2012-12-06
56    2012-12-03
57    2012-12-03
58    2013-08-08
59    2012-12-03
Name: date_contracted, dtype: object

Solving date_contracted errors with padding strings

In [2867]:
# Function to pad/fill to short dates like 5/3/18 to 05/03/2018

def str_date_pad(val):
    
    if type(val) is str and len(val) < 10: 

        date_splitted = val.split('/')
        
        if len(date_splitted[1]) < 2 : 
            date_splitted[1] = '0' + date_splitted[1]

        if (len(date_splitted[2]) < 4) and (int(date_splitted[2]) < 24) : 
            date_splitted[2] = '20' + date_splitted[2]
        if len(date_splitted[2]) < 4 and int(date_splitted[2]) > 23 : 
            date_splitted[2] = '19' + date_splitted[2]
    
        val = '/'.join(date_splitted)
        #print(val)
        return val
    
    else:
        return val


In [2868]:
# Apply function to df
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = all_farmers['OG_Info__OGOrg_1_df']['date_contracted'].apply(str_date_pad)

In [2869]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[50:60,12]

50    2023-02-24
51    2012-12-06
52    2012-12-06
53    2012-12-06
54    2012-12-06
55    2012-12-06
56    2012-12-03
57    2012-12-03
58    2013-08-08
59    2012-12-03
Name: date_contracted, dtype: object

In [2870]:
# Now after padding convert str to datetime gives still problem !
# all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

In [2871]:
# Problem with date '19/08/19218' at 54 BUT its in position/row_index 741
all_farmers['OG_Info__OGOrg_1_df'].loc[all_farmers['OG_Info__OGOrg_1_df']['date_contracted']=='19/08/19218']
# cause year '218' padded to '19218'

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,organic_status,address,organic_status_from_date,fairtrade,ward_nr


In [2872]:
# Solve through replacing '19/08/19218' by '19/08/2018' and dropping '01/00/2022'
all_farmers['OG_Info__OGOrg_1_df'].replace('19/08/19218', '19/08/2018', inplace=True)
all_farmers['OG_Info__OGOrg_1_df'].replace('01/00/2022', None, inplace=True)

In [2873]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[741]['date_contracted']

datetime.date(2018, 8, 10)

In [2874]:
#[16122.0, 16094.0, 16086.0, 16069.0]

In [2875]:
all_farmers['OG_Info__OGOrg_1_df'][all_farmers['OG_Info__OGOrg_1_df']['og_nr']==16122.0]

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,organic_status,address,organic_status_from_date,fairtrade,ward_nr
491,1228,16122,MAOG,Manicaland,Makoni,23,Paprika,Kanjanda,Prisca,Female,42-215210-S42,NaT,2017-08-31,2017-12-04,Not externally inspected,Dropout,"Dope Secondary School Box 8060, Rusape",2017-08-31,False,23


In [2876]:
# Ready to convert str to datetime
all_farmers['OG_Info__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

In [2877]:
all_farmers['OG_Info__OGOrg_1_df'][all_farmers['OG_Info__OGOrg_1_df']['og_nr']==16122.0]

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,organic_status,address,organic_status_from_date,fairtrade,ward_nr
491,1228,16122,MAOG,Manicaland,Makoni,23,Paprika,Kanjanda,Prisca,Female,42-215210-S42,NaT,2017-08-31,2017-12-04,Not externally inspected,Dropout,"Dope Secondary School Box 8060, Rusape",2017-08-31,False,23


In [2878]:
all_farmers['OG_Info__OGOrg_1_df'].iloc[65]['date_contracted']

Timestamp('2012-12-06 00:00:00')

In [2879]:
all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_dropped_out'])

In [2880]:
all_farmers['OG_Info__OGOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        1403 non-null   int64         
 1   og_nr                     1403 non-null   int64         
 2   og_code                   1403 non-null   object        
 3   production_unit           1403 non-null   object        
 4   area                      1403 non-null   object        
 5   ward_nr/name              1403 non-null   object        
 6   species                   1403 non-null   object        
 7   surname                   1403 non-null   object        
 8   first_names               1403 non-null   object        
 9   sex                       1403 non-null   object        
 10  id_number                 1310 non-null   object        
 11  date_of_birth             930 non-null    datetime64[ns]
 12  date_contracted     

#### wc_info_bayoba_1_df

In [2881]:
all_farmers['wc_info_bayoba_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10952 entries, 0 to 10951
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        10952 non-null  int64  
 1   wc_nr                     10952 non-null  int64  
 2   wc_code                   10952 non-null  object 
 3   production_unit           10952 non-null  object 
 4   area                      10952 non-null  object 
 5   ward_nr/name              10535 non-null  object 
 6   species                   10952 non-null  object 
 7   surname                   10951 non-null  object 
 8   first_names               10949 non-null  object 
 9   sex                       10952 non-null  object 
 10  id_number                 10248 non-null  object 
 11  date_contracted           10951 non-null  object 
 12  date_dropped_out          1924 non-null   object 
 13  reason_dropped_out        1054 non-null   object 
 14  addres

In [2882]:
all_farmers['wc_info_bayoba_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_of_birth'])

In [2883]:
all_farmers['wc_info_bayoba_1_df']['date_contracted'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_contracted'])

In [2884]:
all_farmers['wc_info_bayoba_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info_bayoba_1_df']['date_dropped_out'])

In [2885]:
all_farmers['wc_info_bayoba_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10952 entries, 0 to 10951
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        10952 non-null  int64         
 1   wc_nr                     10952 non-null  int64         
 2   wc_code                   10952 non-null  object        
 3   production_unit           10952 non-null  object        
 4   area                      10952 non-null  object        
 5   ward_nr/name              10535 non-null  object        
 6   species                   10952 non-null  object        
 7   surname                   10951 non-null  object        
 8   first_names               10949 non-null  object        
 9   sex                       10952 non-null  object        
 10  id_number                 10248 non-null  object        
 11  date_contracted           10951 non-null  datetime64[ns]
 12  date_dropped_out  

#### wc_info__WCOrg_1_df

In [2886]:
all_farmers['wc_info__WCOrg_1_df'].info()

# has no date_of_birth column -> 19 columns
# need to convert date_contracted and date_dropped_out

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        565 non-null    int64  
 1   wc_nr                     565 non-null    int64  
 2   wc_code                   565 non-null    object 
 3   production_unit           565 non-null    object 
 4   area                      565 non-null    object 
 5   ward_nr/name              563 non-null    object 
 6   species                   565 non-null    object 
 7   surname                   564 non-null    object 
 8   first_names               564 non-null    object 
 9   sex                       565 non-null    object 
 10  id_number                 472 non-null    object 
 11  date_contracted           558 non-null    object 
 12  reason_dropped_out        142 non-null    object 
 13  address                   530 non-null    object 
 14  organic_st

In [2887]:
all_farmers['wc_info__WCOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_contracted'])

In [2888]:
all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'])

  all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['wc_info__WCOrg_1_df']['date_dropped_out'])


In [2889]:
all_farmers['wc_info__WCOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        565 non-null    int64         
 1   wc_nr                     565 non-null    int64         
 2   wc_code                   565 non-null    object        
 3   production_unit           565 non-null    object        
 4   area                      565 non-null    object        
 5   ward_nr/name              563 non-null    object        
 6   species                   565 non-null    object        
 7   surname                   564 non-null    object        
 8   first_names               564 non-null    object        
 9   sex                       565 non-null    object        
 10  id_number                 472 non-null    object        
 11  date_contracted           558 non-null    datetime64[ns]
 12  reason_dropped_out    

#### OG_Info_reserve__OGOrg_1_df

In [2890]:
all_farmers['OG_Info_reserve__OGOrg_1_df'][all_farmers['OG_Info_reserve__OGOrg_1_df']['og_nr']==5157.0]

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr
209,306,5157,BNOG,Matabeleland North,Binga,15 Kariangwe,Rosella,Mungombe,Sibongile,female,06-033379V06,,17/08/2015,2017-12-04,Not externally inspected,Manseme School Binga,Dropout,2015-11-17,True,15.0


In [2891]:
# all_farmers['OG_Info_reserve__OGOrg_1_df'].info()

# need to convert date_of_birth, date_contracted and date_dropped_out

In [2892]:
#all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'], format='%d/%m/%Y')

In [2893]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_of_birth'])

In [2894]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_contracted'] = pd.to_datetime(all_farmers['OG_Info__OGOrg_1_df']['date_contracted'])

In [2895]:
all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'] = pd.to_datetime(all_farmers['OG_Info_reserve__OGOrg_1_df']['date_dropped_out'])

In [2896]:
# all_farmers['OG_Info_reserve__OGOrg_1_df'].info()

In [2897]:

all_farmers['OG_Info_reserve__OGOrg_1_df'][all_farmers['OG_Info_reserve__OGOrg_1_df']['og_nr']==5157.0]

Unnamed: 0,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_of_birth,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr
209,306,5157,BNOG,Matabeleland North,Binga,15 Kariangwe,Rosella,Mungombe,Sibongile,female,06-033379V06,NaT,2015-08-04,2017-12-04,Not externally inspected,Manseme School Binga,Dropout,2015-11-17,True,15.0


### Adding Birthdates to wc_info_WCOrg_1_df

In [2898]:
all_birthdates_df = sf.get_dataframe(f'SELECT * FROM {schema}."all_birthdates"')

In [2899]:
len(list(set(all_birthdates_df["wc_nr"]) & set(all_farmers['wc_info__WCOrg_1_df'][['wc_nr']].squeeze())))

453

#### Merge all_birthdates_df to wc_info__WCOrg_1_df

In [2900]:
all_farmers['wc_info__WCOrg_1_df'] = pd.merge(all_farmers['wc_info__WCOrg_1_df'], all_birthdates_df, how="left", on="wc_nr")

In [2901]:
all_farmers['wc_info__WCOrg_1_df']

Unnamed: 0,id,wc_nr,wc_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,id_number,date_contracted,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,date_dropped_out,ward_nr,date_of_birth
0,4942,3962,HWWC,Matabeleland North,Hwange,6 Gondwa,Devils Claw,Moyo,Pios,male,,2014-07-07,,Secretariat Sch Vic Falls,Org,,False,NaT,6.0,NaT
1,5608,3692,HWWC,Matabeleland North,Hwange,6 Gondwa,Devils Claw,Ncube,Mwayani,male,79-124391X79,2015-03-14,,"Mironga School, Vic Falls",Org,,False,NaT,6.0,NaT
2,10630,1773,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Ncube,Engel,female,08-161126N39,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-11-11,False,2017-11-21,3.0,NaT
3,1763,1772,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Dube,Samson,Male,79-031011X79,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-06-19,False,NaT,3.0,1957-10-02
4,1766,1775,HWWC,Matabeleland North,Hwange,3 Kachechete,Devils Claw,Ngwenya,Joshua,Male,79-011290L79,2012-10-06,,"Bethesda Sch, Vic-Falls",Org,2017-06-19,False,NaT,3.0,1955-10-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,11146,1057,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Zaranyika,Vaina,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT
561,11147,21040,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Chibawana,Violet,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT
562,11148,2555,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu kola,Hofisi,Christine,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,,False,2021-01-29,21.0,NaT
563,11149,2066,CMWC,Manicaland,Chimanimani Rusitu,21,Wild Gotu Kola,Mudada,Marjorie,female,,NaT,,"Hode Pry Sch, Bag 2051, Chimanimani",Reinstated,2019-03-09,False,2021-01-29,21.0,NaT


In [2902]:
all_farmers['wc_info__WCOrg_1_df'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 565 entries, 0 to 564
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        565 non-null    int64         
 1   wc_nr                     565 non-null    int64         
 2   wc_code                   565 non-null    object        
 3   production_unit           565 non-null    object        
 4   area                      565 non-null    object        
 5   ward_nr/name              563 non-null    object        
 6   species                   565 non-null    object        
 7   surname                   564 non-null    object        
 8   first_names               564 non-null    object        
 9   sex                       565 non-null    object        
 10  id_number                 472 non-null    object        
 11  date_contracted           558 non-null    datetime64[ns]
 12  reason_dropped_out    

## Final merge of farmer dfs to concated_farmers

Verify shapes of dfs

In [2903]:
for name, df in all_farmers.items():
    print(name)
    print(df.shape)

OGInfo__OGMB_1_df
(8166, 19)
OG_Info__OGOrg_1_df
(1403, 20)
wc_info_bayoba_1_df
(10952, 20)
wc_info__WCOrg_1_df
(565, 20)
OG_Info_reserve__OGOrg_1_df
(795, 20)


In [2904]:
# ['ward_nr'] column still missing:

all_farmers['OGInfo__OGMB_1_df'][['ward_nr']] = all_farmers['OGInfo__OGMB_1_df'][['ward_nr/name']]
all_farmers['OGInfo__OGMB_1_df'].shape

(8166, 20)

In [2905]:
concated_farmers = pd.concat(all_farmers)

In [2906]:
concated_farmers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 21881 entries, ('OGInfo__OGMB_1_df', 0) to ('OG_Info_reserve__OGOrg_1_df', 794)
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        21881 non-null  int64         
 1   og_nr                     10364 non-null  float64       
 2   og_code                   10364 non-null  object        
 3   production_unit           21881 non-null  object        
 4   area                      21881 non-null  object        
 5   ward_nr/name              21461 non-null  object        
 6   species                   21881 non-null  object        
 7   surname                   21879 non-null  object        
 8   first_names               21877 non-null  object        
 9   sex                       21881 non-null  object        
 10  id_number                 19837 non-null  object        
 11  date_of_birth             

In [2907]:
concated_farmers[concated_farmers['date_contracted'].dt.year.astype('Int64')>2023]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,145,1517,13107.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Mutape,Sophia,female,...,2174-12-02,NaT,,"Kasawo sch, Box 346, Mbire D",Mabagrown,2017-12-02 00:00:00.000,False,6.0,,
OGInfo__OGMB_1_df,239,1414,13190.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Gurupira,Vengayi,male,...,2147-12-06,2020-03-19,Own choice,"Nyarutomno sch, Box 500, Mbire D",Dropout,2017-12-06 00:00:00.000,False,6.0,,


In [2908]:
concated_farmers.replace('2174-12-02', '2021-07-06', inplace=True)
concated_farmers.replace('2147-12-06', '2021-07-06', inplace=True)

In [2909]:
concated_farmers[concated_farmers['og_nr']==13190.0]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,239,1414,13190.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Gurupira,Vengayi,male,...,2021-07-06,2020-03-19,Own choice,"Nyarutomno sch, Box 500, Mbire D",Dropout,2017-12-06 00:00:00.000,False,6.0,,


In [2910]:
concated_farmers

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,0,1545,13137.0,MBOGR,Mashonaland Central,Mbire,15,Rosella,Nyamayaro,Phillip,male,...,NaT,2020-02-05,Own choice,"Mahuwe sch, Box 92, Mbire D",Dropout,2017-12-02 00:00:00.000,False,15.0,,
OGInfo__OGMB_1_df,1,12026,12116.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Funda,Jennie,female,...,2022-04-02,NaT,,"Kasuwo Primary School 346, Mbire",Mabagrown,2022-04-02 00:00:00.000,False,6.0,,
OGInfo__OGMB_1_df,2,9914,23949.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Chingwena,Gladys,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,3,9915,23950.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Marumbe,Tariro,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,4,9916,23951.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Dambudzo,Ketai,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OG_Info_reserve__OGOrg_1_df,790,3549,20026.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Nhete,Stella,female,...,2022-06-11,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,791,3550,20027.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Musiiwa,Admire,male,...,2022-06-11,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,792,3551,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Masodzi,Norah,female,...,2022-06-11,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,
OG_Info_reserve__OGOrg_1_df,793,1861,14001.0,BNOG,Matabeleland North,Binga,11 Lubu,Rosella,Mwembe,Mutale,female,...,2022-06-11,NaT,,Lubu primary School P Bag 5729 Binga,Org,2018-08-31,False,11.0,,


In [2911]:
# 11517 wc_nr
concated_farmers[~concated_farmers['wc_nr'].isna()]['wc_nr']

wc_info_bayoba_1_df  0      21146.0
                     1      21833.0
                     2      21147.0
                     3      22986.0
                     4      21148.0
                             ...   
wc_info__WCOrg_1_df  560     1057.0
                     561    21040.0
                     562     2555.0
                     563     2066.0
                     564     1069.0
Name: wc_nr, Length: 11517, dtype: float64

# Clean and analyze concated_farmers

1. Drop duplicates (+ wc_nr and og_nr)
2. Count number of records by category
3. Clean column by column

In [2912]:
# Looking for Duplicates
concated_farmers.duplicated().value_counts()

# 73 duplicates

False    21832
True        49
dtype: int64

In [2913]:
# All duplicates in OG_Info_reserve__OGOrg_1_df
df_dup = concated_farmers[concated_farmers.duplicated()].sort_values('id')
df_dup

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OG_Info_reserve__OGOrg_1_df,43,47,2869.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Muchimba,Magret,female,...,2012-12-06,NaT,,Nsenga Primary School Box 46 Binga,Org,2016-10-25,True,5.0,,
OG_Info_reserve__OGOrg_1_df,53,57,2896.0,BNOG,Matabeleland North,Binga,5 Sinakoma,Rosella,Munkuli,Betty,female,...,2012-12-06,NaT,,Nsenga Primary School Box 46 Binga,Org,2016-10-25,True,5.0,,
OG_Info_reserve__OGOrg_1_df,189,133,3788.0,BNOG,Matabeleland North,Binga,15 Kariangwe,Rosella,Mumpande,Julia,female,...,2015-08-17,NaT,,Manseme School Binga,Org,2016-10-25,True,15.0,,
OG_Info_reserve__OGOrg_1_df,266,275,12265.0,MAOG,Manicaland,Makoni,17,"Chilli, Paprika",Mukoyi,James,male,...,2012-08-10,2018-10-18,Not inspected,Chirimutsitu School Bag 8147 Rusape,Dropout,2016-03-30,True,17.0,,
OG_Info_reserve__OGOrg_1_df,206,315,5153.0,BNOG,Matabeleland North,Binga,15 Kariangwe,Rosella,Mwembe,Esnath,female,...,2015-08-17,NaT,,Manseme School Binga,Org,2015-11-17,True,15.0,,
OG_Info_reserve__OGOrg_1_df,580,1041,16270.0,MAOG,Manicaland,Makoni,23,Paprika,Kabanda,Loveness,Female,...,2017-08-31,2017-12-04,Not externally inspected,"Dope Secondary School Box 8060, Rusape",Dropout,2017-08-31,False,23.0,,
OG_Info_reserve__OGOrg_1_df,582,1046,16272.0,MAOG,Manicaland,Makoni,23,Paprika,Kamusasa,Susan,Female,...,2017-08-31,2017-12-04,Not externally inspected,"Dope Secondary School Box 8060, Rusape",Dropout,2017-08-31,False,23.0,,
OG_Info_reserve__OGOrg_1_df,447,1055,16130.0,MAOG,Manicaland,Makoni,23,Paprika,Mabhutu,Gift,male,...,2017-08-31,2017-12-04,Not externally inspected,"Dope Secondary School Box 8060, Rusape",Dropout,2017-08-31,False,23.0,,
OG_Info_reserve__OGOrg_1_df,454,1063,16138.0,MAOG,Manicaland,Makoni,21,Chillie,Magondwa,Patience,Female,...,2017-08-31,2017-10-19,High risk,"Chirimutsitu Primary P Bag 8147, Rusape",Dropout,2017-10-19,False,21.0,,
OG_Info_reserve__OGOrg_1_df,462,1071,16146.0,MAOG,Manicaland,Makoni,21,Chillie,Makonde,Esther,Female,...,2017-08-31,2017-12-04,Not externally inspected,"Chirimutsitu Primary P Bag 8147, Rusape",Dropout,2017-10-25,False,21.0,,


In [2914]:
ls_dup_og_nr= list(df_dup['og_nr'])

In [2915]:
for ognr in ls_dup_og_nr:
    print(concated_farmers[concated_farmers['og_nr'] == ognr])

# duplicates trough OG_Info__OGOrg_1_df und OG_Info_reserve__OGOrg_1_df -> DROP duplicates in allfarmers

                                id   og_nr og_code     production_unit   area  \
OG_Info__OGOrg_1_df         54  47  2869.0    BNOG  Matabeleland North  Binga   
OG_Info_reserve__OGOrg_1_df 43  47  2869.0    BNOG  Matabeleland North  Binga   

                               ward_nr/name  species   surname first_names  \
OG_Info__OGOrg_1_df         54   5 Sinakoma  Rosella  Muchimba      Magret   
OG_Info_reserve__OGOrg_1_df 43   5 Sinakoma  Rosella  Muchimba      Magret   

                                   sex  ... date_contracted date_dropped_out  \
OG_Info__OGOrg_1_df         54  female  ...      2012-12-06              NaT   
OG_Info_reserve__OGOrg_1_df 43  female  ...      2012-12-06              NaT   

                               reason_dropped_out  \
OG_Info__OGOrg_1_df         54               None   
OG_Info_reserve__OGOrg_1_df 43               None   

                                                           address  \
OG_Info__OGOrg_1_df         54  Nsenga Primary Sch

In [2916]:
# Droping duplicates
concated_farmers.drop_duplicates(subset=None, keep='first', inplace=True)

In [2917]:
concated_farmers.duplicated().value_counts()

False    21832
dtype: int64

### Dropping duplicates of wc_nr

In [2918]:
# 11517 wc_nr
wc_nr_df = concated_farmers[~concated_farmers['wc_nr'].isna()][['wc_nr']]
wc_nr_df

Unnamed: 0,Unnamed: 1,wc_nr
wc_info_bayoba_1_df,0,21146.0
wc_info_bayoba_1_df,1,21833.0
wc_info_bayoba_1_df,2,21147.0
wc_info_bayoba_1_df,3,22986.0
wc_info_bayoba_1_df,4,21148.0
...,...,...
wc_info__WCOrg_1_df,560,1057.0
wc_info__WCOrg_1_df,561,21040.0
wc_info__WCOrg_1_df,562,2555.0
wc_info__WCOrg_1_df,563,2066.0


#### wc_nr is unique!

In [2919]:
wc_nr_df.squeeze().is_unique

True

In [2920]:
# 21868 rows have og_nr or wc_nr!
concated_farmers[~concated_farmers['og_nr'].isna() | ~concated_farmers['wc_nr'].isna()].shape

(21832, 22)

In [2921]:
wc_df = concated_farmers[~concated_farmers['wc_nr'].isna()]

In [2922]:
wc_df.shape

(11517, 22)

### Dropping duplicates of og_nr

In [2923]:
# 10351 
og_nr_df = concated_farmers[~concated_farmers['og_nr'].isna()][['og_nr', 'date_contracted']]
og_nr_df

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


Unnamed: 0,Unnamed: 1,og_nr,date_contracted
OGInfo__OGMB_1_df,0,13137.0,NaT
OGInfo__OGMB_1_df,1,12116.0,2022-04-02
OGInfo__OGMB_1_df,2,23949.0,2020-03-23
OGInfo__OGMB_1_df,3,23950.0,2020-03-23
OGInfo__OGMB_1_df,4,23951.0,2020-03-23
...,...,...,...
OG_Info_reserve__OGOrg_1_df,790,20026.0,2022-06-11
OG_Info_reserve__OGOrg_1_df,791,20027.0,2022-06-11
OG_Info_reserve__OGOrg_1_df,792,20028.0,2022-06-11
OG_Info_reserve__OGOrg_1_df,793,14001.0,2022-06-11


In [2924]:
# og_nr NOT unique
og_nr_df['og_nr'].squeeze().is_unique

False

In [2925]:
concated_farmers[~concated_farmers['og_nr'].isna()]['og_nr'].duplicated(keep='first').value_counts()

False    9093
True     1222
Name: og_nr, dtype: int64

In [2926]:
concated_farmers['og_nr'].duplicated(keep='first').value_counts()

True     12738
False     9094
Name: og_nr, dtype: int64

In [2927]:
concated_farmers[~concated_farmers['og_nr'].isna()].shape

(10315, 22)

In [2928]:
concated_farmers.sort_values('date_contracted', ascending=False, inplace=True)

In [2929]:
concated_farmers[concated_farmers['og_nr'] == 20028.0][['og_nr', 'date_contracted']]

Unnamed: 0,Unnamed: 1,og_nr,date_contracted
OG_Info_reserve__OGOrg_1_df,792,20028.0,2022-06-11
OGInfo__OGMB_1_df,5506,20028.0,2019-02-01
OG_Info__OGOrg_1_df,958,20028.0,2019-02-01


In [2930]:
#concated_farmers[concated_farmers['og_nr'].isna()] 

In [2931]:
df = concated_farmers.copy()
df.shape

(21832, 22)

In [2932]:
df = df[~df['og_nr'].isna()]

In [2933]:
df.shape

(10315, 22)

In [2934]:
df = df[~df['og_nr'].isna()].drop_duplicates('og_nr', keep='first')

In [2935]:
df.shape

(9093, 22)

In [2936]:
21868-(10351-9093)

20610

In [2937]:
df[df['og_nr'] == 20028.0]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OG_Info_reserve__OGOrg_1_df,792,3551,20028.0,MUOG,Mashonaland Central,Mushumbi,3,"Rosella, Chilli, Paprika",Masodzi,Norah,female,...,2022-06-11,NaT,,"ARDA Mushumbi, Box 210, Mbire",New,2019-02-01,False,3.0,,


#### Now og_nr is unique!

In [2938]:
og_not_na_df = df[~df['og_nr'].isna()]
og_not_na_df['og_nr'].squeeze().is_unique

True

In [2939]:
wc_og_df = pd.concat([df, wc_df])
wc_og_df.shape

(20610, 22)

In [2940]:
concated_farmers = wc_og_df.copy()

In [2941]:
concated_farmers[concated_farmers['og_nr'] == 20028.0][['og_nr', 'date_contracted']]

Unnamed: 0,Unnamed: 1,og_nr,date_contracted
OG_Info_reserve__OGOrg_1_df,792,20028.0,2022-06-11


In [2942]:
concated_farmers['og_nr'].duplicated().value_counts()

True     11516
False     9094
Name: og_nr, dtype: int64

In [2943]:
concated_farmers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 20610 entries, ('OG_Info__OGOrg_1_df', 1273) to ('wc_info__WCOrg_1_df', 564)
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        20610 non-null  int64         
 1   og_nr                     9093 non-null   float64       
 2   og_code                   9093 non-null   object        
 3   production_unit           20610 non-null  object        
 4   area                      20610 non-null  object        
 5   ward_nr/name              20190 non-null  object        
 6   species                   20610 non-null  object        
 7   surname                   20608 non-null  object        
 8   first_names               20606 non-null  object        
 9   sex                       20610 non-null  object        
 10  id_number                 18658 non-null  object        
 11  date_of_birth             255

## Adding column contract duration

In [2944]:
# Sorting back with index
concated_farmers.sort_index()

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_contracted,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code
OGInfo__OGMB_1_df,0,1545,13137.0,MBOGR,Mashonaland Central,Mbire,15,Rosella,Nyamayaro,Phillip,male,...,NaT,2020-02-05,Own choice,"Mahuwe sch, Box 92, Mbire D",Dropout,2017-12-02 00:00:00.000,False,15.0,,
OGInfo__OGMB_1_df,1,12026,12116.0,MBOGR,Mashonaland Central,Mbire,6,Rosella,Funda,Jennie,female,...,2022-04-02,NaT,,"Kasuwo Primary School 346, Mbire",Mabagrown,2022-04-02 00:00:00.000,False,6.0,,
OGInfo__OGMB_1_df,2,9914,23949.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Chingwena,Gladys,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,3,9915,23950.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Marumbe,Tariro,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
OGInfo__OGMB_1_df,4,9916,23951.0,MBOGK,Mashonaland Central,Mbire,13,Rosella,Dambudzo,Ketai,female,...,2020-03-23,2021-07-26,Area closed,Sapa Primary School P O Box 203,Dropout,2020-03-23 00:00:00.000,False,13.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wc_info_bayoba_1_df,10947,1131,,,South Eastern region,Chimanimani,5,Baobab,Mugebe,Tradder,male,...,2012-06-26,NaT,,Chibuwe,Org,,False,5.0,21141.0,CMWC
wc_info_bayoba_1_df,10948,1134,,,South Eastern region,Chimanimani,5,Baobab,Chibuwe,Silvia,female,...,2012-06-26,NaT,,Chibuwe,Org,,False,5.0,21144.0,CMWC
wc_info_bayoba_1_df,10949,32782,,,Northern Eastern region,Mt Darwin,,Baobab + ximenia,Nyatito,Lucia,female,...,2022-08-30,NaT,,"Gungwe Primary School Bag 338, Mt Darwin",New,2022-08-30,False,,16499.0,MTWC
wc_info_bayoba_1_df,10950,32783,,,Northern Eastern region,Mt Darwin,,Baobab + ximenia,Kanocheringa,Abgal,female,...,2022-08-30,NaT,,"Gungwe Primary School Bag 338, Mt Darwin",New,2022-08-30,False,,16500.0,MTWC


In [2945]:
# 9289 row with date_contracted and date_dropped_out
concated_farmers[~concated_farmers['date_contracted'].isna() & ~concated_farmers['date_dropped_out'].isna()][['date_contracted','date_dropped_out']]

Unnamed: 0,Unnamed: 1,date_contracted,date_dropped_out
OG_Info_reserve__OGOrg_1_df,14,2023-02-24,2019-01-09
OG_Info_reserve__OGOrg_1_df,7,2023-02-24,2018-11-15
OG_Info_reserve__OGOrg_1_df,533,2023-02-24,2018-01-15
OG_Info_reserve__OGOrg_1_df,413,2023-02-23,2017-12-04
OG_Info_reserve__OGOrg_1_df,746,2022-06-11,2019-01-09
...,...,...,...
wc_info__WCOrg_1_df,553,2018-10-20,2021-01-29
wc_info__WCOrg_1_df,554,2018-10-20,2021-01-29
wc_info__WCOrg_1_df,555,2018-10-20,2021-01-29
wc_info__WCOrg_1_df,556,2018-10-20,2021-01-29


## Problem with negative duration, 'Cause date_contracted is overwritten with wrong date 

In [2946]:
concated_farmers['contract_duration'] = concated_farmers['date_dropped_out'] - concated_farmers['date_contracted']
concated_farmers[~concated_farmers['contract_duration'].isna()][['og_nr','wc_nr','date_contracted','date_dropped_out','contract_duration']]

Unnamed: 0,Unnamed: 1,og_nr,wc_nr,date_contracted,date_dropped_out,contract_duration
OG_Info_reserve__OGOrg_1_df,14,19490.0,,2023-02-24,2019-01-09,-1507 days
OG_Info_reserve__OGOrg_1_df,7,19485.0,,2023-02-24,2018-11-15,-1562 days
OG_Info_reserve__OGOrg_1_df,533,16220.0,,2023-02-24,2018-01-15,-1866 days
OG_Info_reserve__OGOrg_1_df,413,16089.0,,2023-02-23,2017-12-04,-1907 days
OG_Info_reserve__OGOrg_1_df,746,19807.0,,2022-06-11,2019-01-09,-1249 days
...,...,...,...,...,...,...
wc_info__WCOrg_1_df,553,,4261.0,2018-10-20,2021-01-29,832 days
wc_info__WCOrg_1_df,554,,4262.0,2018-10-20,2021-01-29,832 days
wc_info__WCOrg_1_df,555,,4263.0,2018-10-20,2021-01-29,832 days
wc_info__WCOrg_1_df,556,,4264.0,2018-10-20,2021-01-29,832 days


In [2947]:
import numpy as np

def unique(list1):
    x = np.array(list1)
    print(np.unique(x))


In [2948]:
negative_duration = concated_farmers[concated_farmers['date_contracted'] > concated_farmers['date_dropped_out']][['og_nr','wc_nr','date_contracted','date_dropped_out']]
negative_duration

Unnamed: 0,Unnamed: 1,og_nr,wc_nr,date_contracted,date_dropped_out
OG_Info_reserve__OGOrg_1_df,14,19490.0,,2023-02-24,2019-01-09
OG_Info_reserve__OGOrg_1_df,7,19485.0,,2023-02-24,2018-11-15
OG_Info_reserve__OGOrg_1_df,533,16220.0,,2023-02-24,2018-01-15
OG_Info_reserve__OGOrg_1_df,413,16089.0,,2023-02-23,2017-12-04
OG_Info_reserve__OGOrg_1_df,746,19807.0,,2022-06-11,2019-01-09
OG_Info_reserve__OGOrg_1_df,761,19823.0,,2022-06-10,2019-01-09
OGInfo__OGMB_1_df,239,13190.0,,2021-07-06,2020-03-19
OGInfo__OGMB_1_df,1383,14304.0,,2018-11-15,2002-05-06
OG_Info_reserve__OGOrg_1_df,711,12939.0,,2018-10-05,2017-10-09
OG_Info_reserve__OGOrg_1_df,712,16107.0,,2018-10-05,2018-01-15


In [2949]:
negative_duration[negative_duration['og_nr']==20110.0]

Unnamed: 0,Unnamed: 1,og_nr,wc_nr,date_contracted,date_dropped_out


In [2950]:
table_list = []
for i in range(len(negative_duration.index)):
    table_list.append(negative_duration.index[i][0])
unique(table_list)

['OGInfo__OGMB_1_df' 'OG_Info__OGOrg_1_df' 'OG_Info_reserve__OGOrg_1_df'
 'wc_info__WCOrg_1_df']


In [2951]:
concated_farmers['contract_duration'].mean()

Timedelta('622 days 08:52:40.370450136')

In [2952]:
concated_farmers[concated_farmers['contract_duration']< '0 days'][['og_nr','wc_nr','date_contracted','date_dropped_out','contract_duration']]

Unnamed: 0,Unnamed: 1,og_nr,wc_nr,date_contracted,date_dropped_out,contract_duration
OG_Info_reserve__OGOrg_1_df,14,19490.0,,2023-02-24,2019-01-09,-1507 days
OG_Info_reserve__OGOrg_1_df,7,19485.0,,2023-02-24,2018-11-15,-1562 days
OG_Info_reserve__OGOrg_1_df,533,16220.0,,2023-02-24,2018-01-15,-1866 days
OG_Info_reserve__OGOrg_1_df,413,16089.0,,2023-02-23,2017-12-04,-1907 days
OG_Info_reserve__OGOrg_1_df,746,19807.0,,2022-06-11,2019-01-09,-1249 days
OG_Info_reserve__OGOrg_1_df,761,19823.0,,2022-06-10,2019-01-09,-1248 days
OGInfo__OGMB_1_df,239,13190.0,,2021-07-06,2020-03-19,-474 days
OGInfo__OGMB_1_df,1383,14304.0,,2018-11-15,2002-05-06,-6037 days
OG_Info_reserve__OGOrg_1_df,711,12939.0,,2018-10-05,2017-10-09,-361 days
OG_Info_reserve__OGOrg_1_df,712,16107.0,,2018-10-05,2018-01-15,-263 days


### Further Cleaning

In [2953]:
concated_farmers['organic_status'].value_counts()

Org             9526
Dropout         6944
New             2712
Mabagrown       1047
Reinstated       190
Uncertified      149
dropout           33
Under review       8
Name: organic_status, dtype: int64

In [2954]:
concated_farmers[concated_farmers['id'].isna()]

Unnamed: 0,Unnamed: 1,id,og_nr,og_code,production_unit,area,ward_nr/name,species,surname,first_names,sex,...,date_dropped_out,reason_dropped_out,address,organic_status,organic_status_from_date,fairtrade,ward_nr,wc_nr,wc_code,contract_duration


In [2955]:
df = concated_farmers[~concated_farmers['id_number'].isna()][['id_number']]
df.squeeze().is_unique

False

In [2956]:
concated_farmers['area'] = concated_farmers['area'].str.title()

In [2957]:
concated_farmers['area'].replace('Mt Darwin ','Mt Darwin', inplace=True)
concated_farmers['area'].replace('Beitbridge ','Beitbridge', inplace=True)
concated_farmers['area'].replace('Hwange ','Hwange', inplace=True)
concated_farmers['area'].replace('Makoni Mt Shalom','Makoni', inplace=True)
concated_farmers['area'].replace('Chimanimani Rusitu','Chimanimani', inplace=True)
concated_farmers['area'].replace('Chimanimani Tilbury','Chimanimani', inplace=True)

In [2958]:
concated_farmers['area'].value_counts()

Mbire                   7683
Binga                   2197
Rushinga                1750
Chimanimani             1660
Mudzi                   1590
Buhera                  1152
Chipinge                 977
Mt Darwin                968
Beitbridge               613
Makoni                   297
Hwange                   289
Kwekwe                   270
Uzumbamarambapfungwe     257
Mwenezi                  173
Karoi                    163
Triangle                 163
Mutoko                   126
Mberengwa                115
Chivi                     94
Matobo                    44
Mushumbi                  29
Name: area, dtype: int64

In [2959]:
concated_farmers['area'].unique()
# Correct 'chipinge' first upper

array(['Binga', 'Mushumbi', 'Makoni', 'Mbire', 'Triangle', 'Karoi',
       'Chimanimani', 'Chipinge', 'Chivi', 'Buhera', 'Mt Darwin',
       'Beitbridge', 'Mudzi', 'Rushinga', 'Kwekwe', 'Mwenezi',
       'Uzumbamarambapfungwe', 'Mutoko', 'Mberengwa', 'Matobo', 'Hwange'],
      dtype=object)

In [2960]:
concated_farmers['production_unit'] = concated_farmers['production_unit'].replace('Mashonaland Centra','Mashonaland Central')

In [2961]:
concated_farmers['production_unit'].value_counts()

Mashonaland Central        7711
Northern Eastern region    4691
South Eastern region       3498
Matabeleland North         2487
Southern region             995
Manicaland                  588
Midlands                    270
Masvingo                    163
Mashonaland West            163
Matabeleland South           44
Name: production_unit, dtype: int64

In [2962]:
concated_farmers['ward_nr'].unique()

# correct to int

array([11.,  5.,  3., 18.,  7., 15.,  2., 19., 10., 16.,  8.,  9.,  6.,
       17., 12.,  4., 13., 14., nan, 23., 21., 27.,  1., 25., 20., 29.,
       33., 24., 28., 30., 31., 34., 32., 22.])

In [2963]:
# 'id_number has not unique and has 3254 duplicates'
#concated_farmers['id_number'].is_unique
concated_farmers['id_number'].duplicated().value_counts()

False    18205
True      2405
Name: id_number, dtype: int64

#### Setting values of species, sex, reason_droppen_out and organic_status to lower case

In [2964]:
concated_farmers['species'] = concated_farmers['species'].str.lower()
concated_farmers['sex'] = concated_farmers['sex'].str.lower()

concated_farmers['reason_dropped_out'] = concated_farmers['reason_dropped_out'].str.lower()
concated_farmers['organic_status'] = concated_farmers['organic_status'].str.lower()

In [2965]:
concated_farmers['organic_status'].unique()

array(['new', 'org', 'dropout', 'under review', 'mabagrown', 'reinstated',
       'uncertified', None], dtype=object)

In [2966]:
concated_farmers['species'].unique()

# correct to types rosella, chili, paprika,  

# '5 sinakoma', 'robert' correct to None

array(['rosella', '5 sinakoma', 'rosella, chilli, paprika', 'paprika',
       'chilli, paprika', 'rosell', 'ronald', 'roseela',
       'paprika, chilli', 'chillie', 'chilli', 'chilli,paprika', 'chili',
       'chillie/paprica ', 'rosella, strophantus', 'baobab', 'marula',
       'baobab + ximenia', 'trichillia', 'baobab+kms+ximenia',
       'kalahari melon seed', 'ximenia caffra', 'kms + ximenia',
       'ximenia americana', 'devils claw', 'wild gotu kola'], dtype=object)

In [2967]:
concated_farmers['species'] = concated_farmers['species'].replace({'5 sinakoma': 'None', 'ronald':  'None','roseela': 'rosella', 'rosell': 'rosella', 
                                                                   'paprika, chilli': 'chilli, paprika', 'chili': 'chilli',
                                                                    'chilli,paprika':'chilli, paprika', 'chillie': 'chilli', 
                                                                    'chillie/paprica ': 'chilli, paprika',
                                                                    'baobab + ximenia': 'baobab, ximenia', 'baobab+kms+ximenia': 'baobab, kms, ximenia',
                                                                    'kalahari melon seed': 'kalahari, melon, seed', 'kms + ximenia': 'kms, ximenia'
                                                                    })

In [2968]:
concated_farmers['wc_nr'].duplicated().value_counts()

False    11518
True      9092
Name: wc_nr, dtype: int64

In [2969]:
# 11517 wc_nr
concated_farmers[~concated_farmers['wc_nr'].isna()]['wc_nr']

wc_info_bayoba_1_df  0      21146.0
                     1      21833.0
                     2      21147.0
                     3      22986.0
                     4      21148.0
                             ...   
wc_info__WCOrg_1_df  560     1057.0
                     561    21040.0
                     562     2555.0
                     563     2066.0
                     564     1069.0
Name: wc_nr, Length: 11517, dtype: float64

In [2970]:
concated_farmers['ward_nr'].duplicated().value_counts()

True     20576
False       34
Name: ward_nr, dtype: int64

In [2971]:
concated_farmers['ward_nr'].is_unique

False

In [2972]:
concated_farmers['ward_nr'].unique()

array([11.,  5.,  3., 18.,  7., 15.,  2., 19., 10., 16.,  8.,  9.,  6.,
       17., 12.,  4., 13., 14., nan, 23., 21., 27.,  1., 25., 20., 29.,
       33., 24., 28., 30., 31., 34., 32., 22.])

In [2973]:
concated_farmers.iloc[0]

id                                         4169
og_nr                                   20459.0
og_code                                    BNOG
production_unit              Matabeleland North
area                                      Binga
ward_nr/name                            11 Lubu
species                                 rosella
surname                                  Muleya
first_names                              Jenete
sex                                      female
id_number                          06-016111A06
date_of_birth               1972-07-17 00:00:00
date_contracted             2023-02-24 00:00:00
date_dropped_out                            NaT
reason_dropped_out                         None
address                                    None
organic_status                              new
organic_status_from_date             2023-02-24
fairtrade                                  True
ward_nr                                    11.0
wc_nr                                   

In [2974]:
concated_farmers.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 20610 entries, ('OG_Info__OGOrg_1_df', 1273) to ('wc_info__WCOrg_1_df', 564)
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype          
---  ------                    --------------  -----          
 0   id                        20610 non-null  int64          
 1   og_nr                     9093 non-null   float64        
 2   og_code                   9093 non-null   object         
 3   production_unit           20610 non-null  object         
 4   area                      20610 non-null  object         
 5   ward_nr/name              20190 non-null  object         
 6   species                   20610 non-null  object         
 7   surname                   20608 non-null  object         
 8   first_names               20606 non-null  object         
 9   sex                       20610 non-null  object         
 10  id_number                 18658 non-null  object         
 11  date_of_birth   

#### Upload the prepared farmer data

In [2975]:
table_name = 'all_farmers'

if engine!=None:
    try:
        concated_farmers.to_sql(name=table_name, # Name of SQL table
                                con=engine, # Engine or connection
                                if_exists='replace', # Drop the table before inserting new values 
                                schema=schema, # Use schmea that was defined earlier
                                index=False, # Write DataFrame index as a column
                                chunksize=5000, # Specify the number of rows in each batch to be written at a time
                                method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

  concated_farmers.to_sql(name=table_name, # Name of SQL table


The all_farmers table was imported successfully.
