In [None]:
# !pip install numpy
# !pip install pandas
# !pip install pandas_profiling
# !pip install psycopg2-binary
# !pip install sqlalchemy
# !pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [None]:
# Get dataframes from PostgreSQL
import os

import psycopg2
from sqlalchemy import create_engine

# Connection URL. It can be overridden through the SPECIES_DB_URL environment
# variable so that credentials do not have to live in the notebook; when the
# variable is not set, the original hard-coded URL is used unchanged.
db_url = os.environ.get(
    'SPECIES_DB_URL',
    'postgresql+psycopg2://postgres:postgres@10.4.41.39/species_rasters',
)

# Create an engine instance
alchemyEngine = create_engine(db_url, pool_recycle=3600)

# Connect to the PostgreSQL server and load both tables into DataFrames.
# The `with` block closes the connection even when one of the queries raises,
# which the previous explicit close() at the end of the cell did not guarantee.
with alchemyEngine.connect() as dbConnection:
    samples = pd.read_sql("select * from samples", dbConnection)
    background = pd.read_sql("select * from background", dbConnection)

In [None]:
# Render a minimal profiling report for each freshly loaded table, samples first.
initial_reports = (
    (samples, 'Exploratory Data Analysis Report (samples)'),
    (background, 'Exploratory Data Analysis Report (background)'),
)
for report_frame, report_title in initial_reports:
    display(ProfileReport(report_frame, title=report_title, minimal=True))

It is immediately clear from the reports that the data must be cleaned before any meaningful analysis can be carried out.

In [None]:
# This is the value used for no-data in the NEO rasters
neo_nan = 99999.0

# Columns in which the no-data sentinel is replaced by NaN
neo_nan_columns = ['SRTM_RAMP2_TOPO', 'MOD_LSTD_M']

# Every column of `background` except the 'species' label is cast to float
numeric_columns = [column for column in background.columns if column != 'species']


def mark_neo_nodata(frame, numeric_columns, nodata_columns, nodata_value):
    """Return a copy of `frame` with the no-data sentinel turned into NaN.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to clean; it is not modified.
    numeric_columns : list of str
        Columns cast to float before the sentinel is looked up.
    nodata_columns : list of str
        Columns in which `nodata_value` is replaced by NaN.
    nodata_value : float
        Sentinel value that marks a missing measurement.

    Returns
    -------
    pandas.DataFrame
        New frame; calling the function again on its result changes nothing.
    """
    # astype() with a mapping returns a new frame, so the input stays untouched
    cleaned = frame.astype({column: float for column in numeric_columns})
    cleaned[nodata_columns] = cleaned[nodata_columns].replace(nodata_value, np.nan)
    return cleaned


# Both tables go through the same conversion. Previously only `background` was
# cast to float, so `samples` was compared against the sentinel in whatever
# dtype the database driver happened to return.
samples = mark_neo_nodata(samples, numeric_columns, neo_nan_columns, neo_nan)
display(samples)

background = mark_neo_nodata(background, numeric_columns, neo_nan_columns, neo_nan)
display(background)

Deal with missing values

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Impute the data
# Work on copies: a plain assignment only creates a second name for the same
# DataFrame, so the imputation below would also overwrite `background` and
# `samples` (and everything else bound to them).
background_imp = background.copy()
samples_imp = samples.copy()

imp = IterativeImputer(max_iter=100, random_state=42)
cols = [col for col in background.columns if col != 'species']

# The imputer is fitted on, and applied to, each table separately
imp.fit(background_imp[cols])
background_imp[cols] = imp.transform(background_imp[cols])

imp.fit(samples_imp[cols])
samples_imp[cols] = imp.transform(samples_imp[cols])

# Replace by constant
# NOTE(review): no constant replacement is performed here yet. These are
# independent copies of the cleaned frames and still contain their NaN values;
# the fill value to use has to be decided before this step can be implemented.
background_const = background.copy()
samples_const = samples.copy()



In [None]:
# Profile the tables after the missing-value handling.
# The imputed frames are named explicitly so that this cell reports the imputed
# data regardless of whether `samples_imp` / `background_imp` share storage with
# `samples` / `background`.
display(ProfileReport(samples_imp, title='Exploratory Data Analysis Report (samples)'))
display(ProfileReport(background_imp, title='Exploratory Data Analysis Report (background)'))

In [None]:
# Write each frame to its CSV file (row index omitted), in the order listed.
csv_exports = (
    ('background.csv', background),
    ('samples.csv', samples),
    ('background_imp.csv', background_imp),
    ('samples_imp.csv', samples_imp),
    ('background_const.csv', background_const),
    ('samples_const.csv', samples_const),
)
for csv_path, export_frame in csv_exports:
    export_frame.to_csv(csv_path, index=False)