## In this notebook, the datasets are loaded into dataframes, cleaned, and written to the database (Postgres)

## Data Cleaning for fields dataset

Columns in fields dataset

id                        
og_nr                   
date_updated             
plot_name                
area_(hectare)          
field_organic_status     
total_area_(hectare)    
wc_nr    

## Import necessary libraries


In [None]:
import pandas as pd
import sql_functions as sf

## Get data

In [19]:
# Postgres schema that contains the source tables queried in the next cell.
schema = 'organic_africa' # change this value if the tables live in a different schema
# Engine object returned by the project-local sql_functions helper module.
engine = sf.get_engine()   

In [20]:
# Load each raw field-history table in full and keep the frames in a dict
# keyed by the table name, so later cells can address them by source.
source_tables = ('OGFH_OGMB', 'OG_Field_History__OGOrg', 'organic_fh_bayoba')
field_tables = {
    name: sf.get_dataframe(f'SELECT * FROM {schema}."{name}"')
    for name in source_tables
}

## Data cleaning

Select the relevant columns and convert the column names to lowercase

In [21]:
# Columns kept from each source table. The raw names differ between sources
# ('OG_Nr' vs 'WCNr', 'Area_(Hectare)' vs 'Area_(acre)'); they are harmonised
# in the cells that follow.
columns_to_keep = {
    'OGFH_OGMB': ['ID', 'OG_Nr', 'Date_updated', 'Plot_name',
                  'Area_(Hectare)', 'Field_Organic_Status'],
    'OG_Field_History__OGOrg': ['ID', 'OG_Nr', 'Date_updated', 'Plot_name',
                                'Area_(acre)', 'Field_Organic_Status'],
    'organic_fh_bayoba': ['ID', 'WCNr', 'Date_updated', 'Plot_name',
                          'Area_(acre)', 'Field_Organic_Status'],
}

for source_name, kept_columns in columns_to_keep.items():
    # .copy() turns the column subset into an independent frame, so the column
    # assignments made in later cells do not raise SettingWithCopyWarning.
    subset = field_tables[source_name][kept_columns].copy()
    # Lower-case every column label (e.g. 'OG_Nr' -> 'og_nr').
    subset.columns = subset.columns.str.lower()
    field_tables[source_name] = subset

Renaming columns, replacing values, changing data type


In [22]:
# Give the bayoba table the column labels used for the combined dataset.
# NOTE(review): 'area_(acre)' is only relabelled as hectares here; the values
# are not converted. Confirm the source values are already in hectares.
field_tables['organic_fh_bayoba'] = field_tables['organic_fh_bayoba'].rename(
    columns={"wcnr": "wc_nr", "area_(acre)": "area_(hectare)"}
)

In [23]:
# Cells whose whole value is the letter 'o' are swapped for the string '0'
# (presumably a typo for zero - confirm against the source data).
ogorg_raw = field_tables['OG_Field_History__OGOrg']
ogorg_raw['area_(acre)'] = ogorg_raw['area_(acre)'].replace('o', '0')

In [24]:
# Cast the area column to float so it can be summed in later cells.
ogorg_typed = field_tables['OG_Field_History__OGOrg']
area_as_float = ogorg_typed['area_(acre)'].astype(float)
ogorg_typed['area_(acre)'] = area_as_float

In [25]:
# Relabel the area column so it matches the name used for the combined dataset.
# NOTE(review): this only changes the label from acres to hectares; the values
# are not converted. Confirm the source values are already in hectares.
field_tables['OG_Field_History__OGOrg'] = field_tables['OG_Field_History__OGOrg'].rename(
    columns={"area_(acre)": "area_(hectare)"}
)

Creating new column

In [26]:
# Sum the plot areas per (og_nr, date_updated) group and write that total
# onto every row of the group.
ogmb_fields = field_tables['OGFH_OGMB']
ogmb_plot_area = ogmb_fields.groupby(['og_nr', 'date_updated'])['area_(hectare)']
ogmb_fields['total_area_(hectare)'] = ogmb_plot_area.transform('sum')

In [27]:
# Sum the plot areas per (og_nr, date_updated) group and write that total
# onto every row of the group.
ogorg_fields = field_tables['OG_Field_History__OGOrg']
ogorg_plot_area = ogorg_fields.groupby(['og_nr', 'date_updated'])['area_(hectare)']
ogorg_fields['total_area_(hectare)'] = ogorg_plot_area.transform('sum')

In [28]:
# Sum the plot areas per (wc_nr, date_updated) group and write that total
# onto every row of the group. This table is keyed by wc_nr, not og_nr.
bayoba_fields = field_tables['organic_fh_bayoba']
bayoba_plot_area = bayoba_fields.groupby(['wc_nr', 'date_updated'])['area_(hectare)']
bayoba_fields['total_area_(hectare)'] = bayoba_plot_area.transform('sum')

## Creating one joined dataframe

In [None]:
# Stack the cleaned tables into one frame; the dict keys (source table names)
# become the outer level of the resulting row index.
concated_fields = pd.concat(
    list(field_tables.values()),
    keys=list(field_tables.keys()),
)

Know your data

Data cleaning

In [29]:
# Show the dtype of every column in the combined frame.
concated_fields.dtypes

id                        int64
og_nr                   float64
date_updated             object
plot_name                object
area_(hectare)          float64
field_organic_status     object
total_area_(hectare)    float64
wc_nr                   float64
dtype: object

In [30]:
# Preview the first ten rows of the combined frame.
concated_fields.head(10)

Unnamed: 0,Unnamed: 1,id,og_nr,date_updated,plot_name,area_(hectare),field_organic_status,total_area_(hectare),wc_nr
OGFH_OGMB,0,2867,13172.0,2017-09-18,1.0,0.4,Con,4.4,
OGFH_OGMB,1,2868,13172.0,2017-09-18,2.0,0.5,Con,4.4,
OGFH_OGMB,2,2869,13172.0,2017-09-18,3.0,0.5,Org,4.4,
OGFH_OGMB,3,2870,13172.0,2017-09-18,4.0,0.4,Con,4.4,
OGFH_OGMB,4,2871,13172.0,2017-09-18,5.0,0.8,Con,4.4,
OGFH_OGMB,5,2872,13172.0,2017-09-18,6.0,0.4,Con,4.4,
OGFH_OGMB,6,2873,13172.0,2017-09-18,7.0,0.4,Org,4.4,
OGFH_OGMB,7,2874,13172.0,2017-09-18,8.0,1.0,Con,4.4,
OGFH_OGMB,8,2875,13172.0,,,,,,
OGFH_OGMB,9,2876,13173.0,2021-05-06,1.0,0.2,Mabagrown,0.4,


In [31]:
# Count rows flagged as repeats of an earlier row (True) versus rows that are not (False).
concated_fields.duplicated().value_counts()

False    13987
True      2207
dtype: int64

In [32]:
# Drop rows that repeat an earlier row in every column, keeping the first occurrence.
concated_fields = concated_fields.drop_duplicates()

Changing data type

In [39]:
# Parse the YYYY-MM-DD strings in date_updated into datetime values.
parsed_dates = pd.to_datetime(concated_fields['date_updated'], format='%Y-%m-%d')
concated_fields['date_updated'] = parsed_dates

In [41]:
# Display the column dtypes again after the date conversion.
concated_fields.dtypes

id                               int64
og_nr                          float64
date_updated            datetime64[ns]
plot_name                       object
area_(hectare)                 float64
field_organic_status            object
total_area_(hectare)           float64
wc_nr                          float64
dtype: object

## Push the datasets to Postgres

In [43]:
import psycopg2
import sqlalchemy 
from sql_functions import sqlalchemy 
from sql_functions import get_engine 

In [None]:
# Write the cleaned, de-duplicated field records to the database.
engine = get_engine()
table_name = 'all_fields'
if engine is not None:
    try:
        concated_fields.to_sql(
            name=table_name,      # target table; the same variable feeds the success message
            con=engine,           # engine or connection
            schema=schema,        # schema defined in the "Get data" cell
            if_exists='replace',  # drop and recreate the table on every run
            index=False,          # do NOT write the (source table, row number) index as columns
            chunksize=5000,       # number of rows written per batch
            method='multi',       # pass multiple rows in a single INSERT statement
        )
        print(f"The {table_name} table was imported successfully.")
    # Best-effort error handling: report the problem and invalidate the engine.
    # Exception already covers psycopg2.DatabaseError, so one clause suffices.
    except Exception as error:
        print(error)
        engine = None
else:
    print('No engine')

The all_fields table was imported successfully.
