# Data exploration

In [4]:
import pandas as pd
import numpy as np

In [5]:
def analyze_missing_values(df):
    """Summarize per-column completeness of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected for NaN / missing entries.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with three columns: the count of
        missing entries, the count of non-missing entries, and the
        percentage of non-missing entries rounded to two decimals.
    """
    row_count = len(df)
    nan_counts = df.isna().sum()
    # rsub computes ``row_count - nan_counts`` element-wise.
    present_counts = nan_counts.rsub(row_count)
    present_pct = present_counts.div(row_count).mul(100).round(2)

    summary = {
        'NaN and Missing values': nan_counts,
        'Usable data': present_counts,
        '(%) Usable Data': present_pct,
    }
    return pd.DataFrame(summary)


In [33]:
def get_low_data_cols(df, threshold=10.0):
    """Return the columns whose share of non-missing values is below a cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : float, default 10.0
        Cutoff expressed in percent of rows. A column is reported when its
        percentage of non-NaN entries is strictly less than this value.

    Returns
    -------
    list
        Labels of the sparse columns, in the frame's column order. A frame
        with zero rows yields an empty list.
    """
    total_rows = len(df)
    if total_rows == 0:
        # No rows means no meaningful percentage; report nothing rather than
        # relying on NaN comparisons from a division by zero.
        return []
    percent_usable_data = (total_rows - df.isna().sum()) / total_rows * 100
    return [
        column
        for column, percent in percent_usable_data.items()
        if percent < threshold
    ]

## Corn

In [14]:
# Read the corn CSV and display the per-column missing-value summary.
corn = pd.read_csv("datasets/1982-2020_corn.csv")
analyze_missing_values(corn)

Unnamed: 0,NaN and Missing values,Usable data,(%) Usable Data
YEAR,0,28854,100.0
COUNTY,0,28854,100.0
WATER_REGIME,0,28854,100.0
PCODE,0,28854,100.0
BRAND,7841,21013,72.83
NAME,6533,22321,77.36
YIELD,93,28761,99.68
TW,946,27908,96.72
MOIST,94,28760,99.67
DAYS,2291,26563,92.06


In [12]:
# Preview the first rows of the corn frame.
corn.head()

Unnamed: 0,YEAR,COUNTY,WATER_REGIME,PCODE,BRAND,NAME,YIELD,TW,MOIST,DAYS,HT,START_DATE,END_DATE
0,1982.0,Doniphan,Dryland,1538.0,,,134.63,58.5,20.95,,,,
1,1982.0,Doniphan,Dryland,1680.0,,,132.25,58.15,17.25,,,,
2,1982.0,Doniphan,Dryland,1370.0,,,146.87,56.7,17.25,,,,
3,1982.0,Doniphan,Dryland,1539.0,,,136.69,56.85,20.9,,,,
4,1982.0,Doniphan,Dryland,1190.0,,,132.52,55.45,16.95,,,,


In [13]:
# Remove the HT, START_DATE and END_DATE columns from the corn frame.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: once the columns are gone, a second execution is
# a no-op rather than a KeyError.
corn = corn.drop(columns=['HT', 'START_DATE', 'END_DATE'], errors='ignore')
corn.head()

Unnamed: 0,YEAR,COUNTY,WATER_REGIME,PCODE,BRAND,NAME,YIELD,TW,MOIST,DAYS
0,1982.0,Doniphan,Dryland,1538.0,,,134.63,58.5,20.95,
1,1982.0,Doniphan,Dryland,1680.0,,,132.25,58.15,17.25,
2,1982.0,Doniphan,Dryland,1370.0,,,146.87,56.7,17.25,
3,1982.0,Doniphan,Dryland,1539.0,,,136.69,56.85,20.9,
4,1982.0,Doniphan,Dryland,1190.0,,,132.52,55.45,16.95,


## Soybean

In [29]:
# Read the soybean CSV and display the per-column missing-value summary.
soybean = pd.read_csv("datasets/1991-2022_soybean.csv")
analyze_missing_values(soybean)

Unnamed: 0,NaN and Missing values,Usable data,(%) Usable Data
YEAR,0,19381,100.0
LOC,0,19381,100.0
CITY,18055,1326,6.84
WATER_REGIME,18055,1326,6.84
PCODE,0,19381,100.0
BRAND,1360,18021,92.98
NAME,605,18776,96.88
YIELD,0,19381,100.0


## Sunflower

In [36]:
# Read the sunflower CSV and display the per-column missing-value summary.
sunflower = pd.read_csv("datasets/1998-2019_sunflower.csv")
analyze_missing_values(sunflower)

Unnamed: 0,NaN and Missing values,Usable data,(%) Usable Data
Year,0,3092,100.0
Loc,0,3092,100.0
Brand,895,2197,71.05
Name,200,2892,93.53
Yield,0,3092,100.0
Days,57,3035,98.16


## Wheat

In [41]:
# Read the wheat CSV and display the per-column missing-value summary.
wheat = pd.read_csv("datasets/1982-2022_wheat.csv")
analyze_missing_values(wheat)

Unnamed: 0,NaN and Missing values,Usable data,(%) Usable Data
YEAR,0,24299,100.0
LOC,0,24299,100.0
BRAND,6800,17499,72.02
NAME,6219,18080,74.41
YIELD,165,24134,99.32
TW,689,23610,97.16
MOIST,7261,17038,70.12
HEAD,4932,19367,79.7
HT,3674,20625,84.88


## General

In [45]:
# Print, one list per line, the sparse columns of each crop frame
# in the order corn, soybean, sunflower, wheat.
for crop_frame in (corn, soybean, sunflower, wheat):
    print(get_low_data_cols(crop_frame))

['HT', 'START_DATE', 'END_DATE']
['CITY', 'WATER_REGIME']
[]
[]
