In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sql_functions as sf

schema = 'organic_africa' # UPDATE 'TABLE_SCHEMA' based on schema used in class 
engine = sf.get_engine()

In [58]:
farmer = sf.get_dataframe(f'SELECT * FROM {schema}.all_farmers')

purchase = sf.get_dataframe(f'SELECT * FROM {schema}.all_purchase')

field = sf.get_dataframe(f'SELECT * FROM {schema}.all_fields')

#### Reducing columns

In [59]:
farmer = farmer[['type','og_nr','wc_nr','sex','age_at_date_contracted','area','species',
                   'date_contracted','date_dropped_out','contract_duration','reason_dropped_out',
                   'organic_status','organic_status_from_date','fairtrade']]

In [60]:
purchase['type']=''
ind_wc = purchase[purchase['og_nr'].isna()].index
ind_og = purchase[purchase['wc_nr'].isna()].index

purchase['type'].iloc[ind_wc] = 'wc'
purchase['type'].iloc[ind_og] = 'og'

purchase['year_of_purchase'] = purchase['date_of_purchase'].dt.year.astype('Int64')

purchase = purchase[['year_of_purchase','date_of_purchase','type','og_nr','wc_nr','product','amount_in_kg', 'price_per_kg', 'total_payment_usd']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  purchase['type'].iloc[ind_wc] = 'wc'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  purchase['type'].iloc[ind_og] = 'og'


In [61]:
field['type']=''
ind_wc = field[field['og_nr'].isna()].index
ind_og = field[field['wc_nr'].isna()].index

field['type'].iloc[ind_wc] = 'wc'
field['type'].iloc[ind_og] = 'og'

field = field[['date_updated','type','og_nr','wc_nr','area_(hectare)','total_area_(hectare)','field_organic_status']]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field['type'].iloc[ind_wc] = 'wc'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field['type'].iloc[ind_og] = 'og'


#### Drop unnecessary rows with too many NA

In [62]:
field.drop(field[(field['date_updated'].isna()) & (field['area_(hectare)'].isna())].index, inplace=True)

#### Check og and wc duplicates

In [63]:

def og(df):
    og = df[~df['og_nr'].isna()]['og_nr']

    total = og.shape[0]

    unique = og.unique().shape[0]
    duplicated = og[og.duplicated()].shape[0]

    print('Number of OGs: {} TOTAL, {} UNIQUES, {} DUPS '.format(total,unique,duplicated))

    dup_og_df = df[df['og_nr'].isin(og[og.duplicated()])].sort_values('og_nr')
    
    return dup_og_df

def wc(df):
    wc = df[~df['wc_nr'].isna()]['wc_nr']

    total = wc.shape[0]

    unique = wc.unique().shape[0]
    duplicated = wc[wc.duplicated()].shape[0]

    print('Number of WCs: {} TOTAL, {} UNIQUES, {} DUPS '.format(total,unique,duplicated))

    dup_wc_df = df[df['wc_nr'].isin(wc[wc.duplicated()])].sort_values('wc_nr')
    
    return dup_wc_df

In [64]:
field_og = og(field)[['date_updated','type','og_nr','wc_nr','total_area_(hectare)','field_organic_status']]
field_wc = wc(field)[['date_updated','type','og_nr','wc_nr','total_area_(hectare)','field_organic_status']]

Number of OGs: 11215 TOTAL, 2629 UNIQUES, 8586 DUPS 
Number of WCs: 36 TOTAL, 8 UNIQUES, 28 DUPS 


In [65]:
# field1[field1['og_nr']==13172.0].groupby([field['date_updated'].dt.year.astype('Int64')])['area_(hectare)'].sum()

In [66]:
field_og.drop_duplicates(subset=['date_updated', 'og_nr'], keep='first', inplace=True)
field_wc.drop_duplicates(subset=['date_updated', 'wc_nr'], keep='first', inplace=True)

In [67]:
field = pd.concat([field_og,field_wc])
field

Unnamed: 0,date_updated,type,og_nr,wc_nr,total_area_(hectare),field_organic_status
6823,2017-06-09,og,2768.0,,4.05,Con
12471,2022-08-06,og,2770.0,,3.60,org uncert
6826,2022-01-21,og,2770.0,,3.60,org uncert
6829,2022-01-30,og,2771.0,,2.40,org uncert
12454,2022-08-15,og,2771.0,,2.40,org uncert
...,...,...,...,...,...,...
13953,2019-11-28,wc,,51269.0,0.98,Con
13959,2019-11-28,wc,,51270.0,1.60,Con
13972,2019-11-28,wc,,51271.0,1.09,Con
13975,2019-11-28,wc,,51272.0,3.00,Con


In [68]:
field.count()

date_updated            2781
type                    2783
og_nr                   2775
wc_nr                      8
total_area_(hectare)    2781
field_organic_status    2741
dtype: int64

In [69]:
## Growth of size of farmland growing for OA partners /year

In [70]:
field_og.groupby([field_og['date_updated'].dt.year.astype('Int64')])['total_area_(hectare)'].sum()

date_updated
2012       0.4000
2014       1.0000
2015       5.7000
2016      44.3100
2017    1532.6345
2018    1193.7440
2019      11.6000
2020      33.9000
2021    1967.3120
2022    2350.4450
2023       0.5000
2031       1.0000
Name: total_area_(hectare), dtype: float64

In [71]:
field_wc.groupby([field_wc['date_updated'].dt.year.astype('Int64')])['total_area_(hectare)'].sum()

date_updated
2019    14.64
Name: total_area_(hectare), dtype: float64

In [72]:
field.groupby([field['date_updated'].dt.year.astype('Int64'), 'field_organic_status'])['total_area_(hectare)'].mean()

date_updated  field_organic_status
2012          Con                     0.400000
2014          Con                     1.000000
2015          Con                     5.700000
2016          Con                     3.348571
              Org                     2.318889
2017          Con                     3.775536
              Org                     3.069363
2018          Con                     2.828143
              Org                     2.608201
2019          Con                     1.956000
              Org                     1.670000
2020          Con                     1.200000
              Org                     1.990909
2021          Con                     2.110356
              Mabagrown               1.976289
2022          C3                      1.500000
              Org                     2.721840
              org                     2.332857
              org uncert              2.734067
2023          Org                     0.500000
2031          Org        

In [73]:
field.groupby([field['date_updated'].dt.year.astype('Int64')])['field_organic_status'].value_counts()

date_updated  field_organic_status
2012          Con                       1
2014          Con                       1
2015          Con                       1
2016          Org                       9
              Con                       7
2017          Con                     253
              Org                     182
2018          Con                     265
              Org                     169
2019          Con                      10
              Org                       4
2020          Org                      11
              Con                      10
2021          Con                     646
              Mabagrown               304
2022          Org                     470
              org uncert              359
              org                      35
              C3                        1
2023          Org                       1
2031          Org                       1
Name: field_organic_status, dtype: int64

In [74]:
field.to_csv('field.csv')

----------------------

In [75]:
def og2(df):
    og = df[~df['og_nr'].isna()]['og_nr']

    total = og.shape[0]

    unique = og.unique().shape[0]
    duplicated = og[og.duplicated()].shape[0]

    print('Number of OGs: {} TOTAL, {} UNIQUES, {} DUPS '.format(total,unique,duplicated))

    dup_og_df = df[df['og_nr'].isin(og[og.duplicated()])].sort_values(['og_nr','date_of_purchase'])
    
    return dup_og_df

def wc2(df):
    wc = df[~df['wc_nr'].isna()]['wc_nr']

    total = wc.shape[0]

    unique = wc.unique().shape[0]
    duplicated = wc[wc.duplicated()].shape[0]

    print('Number of WCs: {} TOTAL, {} UNIQUES, {} DUPS '.format(total,unique,duplicated))

    dup_wc_df = df[df['wc_nr'].isin(wc[wc.duplicated()])].sort_values(['wc_nr','date_of_purchase'])
    
    return dup_wc_df

## Getting income from purchase table

In [76]:
purchase_og = og2(purchase)
purchase_wc = wc2(purchase)

Number of OGs: 10731 TOTAL, 3366 UNIQUES, 7365 DUPS 
Number of WCs: 14037 TOTAL, 5988 UNIQUES, 8049 DUPS 


In [77]:
purchase_og[purchase_og['og_nr']==2770.0]

Unnamed: 0,year_of_purchase,date_of_purchase,type,og_nr,wc_nr,product,amount_in_kg,price_per_kg,total_payment_usd
6648,2016,2016-07-20,og,2770.0,,Rosella (subdariffa) petals (org),103.4,1.1,114.0
7737,2017,2017-09-08,og,2770.0,,Rosella (subdariffa) petals (org),11.98,1.1,13.0
7818,2017,2017-11-26,og,2770.0,,Strophanthus (kombe) seed (con),93.0,10.0,930.0
8267,2018,2018-07-16,og,2770.0,,Rosella (subdariffa) petals (org),102.0,1.1,112.2
8353,2018,2018-08-13,og,2770.0,,Rosella (subdariffa) seed (org),61.0,0.5,30.5
9032,2019,2019-09-14,og,2770.0,,Rosella (subdariffa) petals (org),19.7,8.0,173.36
9039,2019,2019-09-14,og,2770.0,,Rosella (subdariffa) petals (org),19.3,8.0,137.84
9044,2019,2019-10-30,og,2770.0,,Rosella (subdariffa) petals (org),19.7,15.0,122.0
9051,2019,2019-10-30,og,2770.0,,Rosella (subdariffa) petals (org),19.3,15.0,152.0
9337,2020,2020-07-27,og,2770.0,,Rosella (subdariffa) petals (org),72.4,1.1,79.64


In [78]:
purchase_og['product'].value_counts()

Rosella (subdariffa) petals (org)      6776
Bird's Eye Chilli fruit whole (org)    1154
Rosella (subdariffa) seed (org)         984
Paprika fruit whole (org)               378
Strophanthus (kombe) seed (con)         163
Safflower petals (org)                    7
Name: product, dtype: int64

In [79]:
purchase_wc['product'].value_counts()

Baobab fruit whole (org)             6724
Gotu kola leaves (org)               2018
Devil’s Claw root (org)               499
Baobab whole fruit (con)              488
Devil’s Claw root (con)               311
Ximenia Americana seed (org)          152
Trichillia emetica                    134
Gotu kola leaves (con)                105
Baobab pulp and seed (org)            105
Ximenia Caffra Seed (org)              78
Devil’s Claw plant part (org)          75
Trichillia emetica (org)               68
Kalahari melon seed (org)              47
Devil’s Claw plant part (con)          32
Strophanthus (kombe) seed (org)        20
Marula seed (org)                      18
Marula Kennels                         15
Rosella petals (con)                   11
Marula oil (org)                        3
Strophanthus (gratus) seed (org)        3
Devil’s Claw seed (con)                 2
Rosella (subdariffa) petals (org)       2
Mongongo seed (org)                     2
Contact Farmer Bonus              

In [80]:
purchase = pd.concat([purchase_og,purchase_wc])
purchase

Unnamed: 0,year_of_purchase,date_of_purchase,type,og_nr,wc_nr,product,amount_in_kg,price_per_kg,total_payment_usd
6648,2016,2016-07-20,og,2770.0,,Rosella (subdariffa) petals (org),103.40,1.10,114.00
7737,2017,2017-09-08,og,2770.0,,Rosella (subdariffa) petals (org),11.98,1.10,13.00
7818,2017,2017-11-26,og,2770.0,,Strophanthus (kombe) seed (con),93.00,10.00,930.00
8267,2018,2018-07-16,og,2770.0,,Rosella (subdariffa) petals (org),102.00,1.10,112.20
8353,2018,2018-08-13,og,2770.0,,Rosella (subdariffa) seed (org),61.00,0.50,30.50
...,...,...,...,...,...,...,...,...,...
21691,2021,2021-06-22,wc,,52993.0,Baobab fruit whole (org),53.00,0.12,6.35
21701,2021,2021-06-22,wc,,52993.0,Baobab fruit whole (org),220.00,0.12,26.40
23933,2022,2022-06-14,wc,,52993.0,Baobab fruit whole (org),932.00,0.14,130.48
21694,2021,2021-06-22,wc,,52996.0,Baobab fruit whole (org),284.00,0.12,34.08


In [81]:
purchase.to_csv('purchase.csv')

------------------------------------

In [115]:
over_time = farmer.copy()
over_time.set_index(over_time['date_contracted'].dt.year.astype('Int64'))

Unnamed: 0_level_0,type,og_nr,wc_nr,sex,age_at_date_contracted,area,species,date_contracted,date_dropped_out,contract_duration,reason_dropped_out,organic_status,organic_status_from_date,fairtrade
date_contracted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2023,og,20183.0,,female,,Binga,rosella,2023-02-24,NaT,,,new,2023-02-24,True
2023,og,20290.0,,female,66.0,Binga,rosella,2023-02-24,NaT,,,new,2023-02-24,True
2023,og,20254.0,,female,43.0,Binga,,2023-02-24,NaT,,,new,2023-02-24,True
2023,og,20255.0,,female,,Binga,,2023-02-24,NaT,,,new,2023-02-24,True
2023,og,20256.0,,female,,Binga,,2023-02-24,NaT,,,new,2023-02-24,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,wc,,1057.0,female,,Chimanimani,wild gotu kola,NaT,2021-01-29,,,reinstated,2019-03-09,False
,wc,,21040.0,female,,Chimanimani,wild gotu kola,NaT,2021-01-29,,,reinstated,2019-03-09,False
,wc,,2555.0,female,,Chimanimani,wild gotu kola,NaT,2021-01-29,,,reinstated,NaT,False
,wc,,2066.0,female,,Chimanimani,wild gotu kola,NaT,2021-01-29,,,reinstated,2019-03-09,False


In [116]:
farmer['type'].value_counts()

wc    11517
og     9093
Name: type, dtype: int64

In [117]:
type_sex_pivot = farmer.pivot_table(index=['type','sex'], columns='area')

  type_sex_pivot = farmers.pivot_table(index=['type','sex'], columns='area')


In [248]:
#by_type = farmers.set_index(['type'])
#trend_over_years = by_type.loc['og'].groupby([by_type['date_contracted'].dt.year.astype('Int64')])['sex'].value_counts()

In [None]:
farmer[(farmer['type']=='wc') & (farmer['sex']=='female')][['area']].value_counts()

area                
Chimanimani             1452
Rushinga                1207
Mudzi                   1110
Binga                    918
Buhera                   885
Chipinge                 784
Mt Darwin                521
Beitbridge               509
Kwekwe                   241
Hwange                   190
Uzumbamarambapfungwe     179
Mwenezi                  156
Mbire                    149
Mberengwa                115
Mutoko                    93
Chivi                     88
Matobo                    44
dtype: int64

In [None]:
farmer[farmer['type']=='wc'][['area']].value_counts()

area                
Rushinga                1750
Chimanimani             1660
Mudzi                   1590
Buhera                  1152
Binga                   1122
Chipinge                 977
Mt Darwin                968
Beitbridge               613
Mbire                    317
Hwange                   289
Kwekwe                   270
Uzumbamarambapfungwe     257
Mwenezi                  173
Mutoko                   126
Mberengwa                115
Chivi                     94
Matobo                    44
dtype: int64

In [None]:
#contracts_over_year = contracted.groupby([contracted['date_contracted'].dt.year.astype('Int64')])['date_contracted', 'og_nr', 'wc_nr', 'date_dropped_out'].count()