# Data exploration


In [1]:
import pandas as pd
import numpy as np

In [2]:
def analyze_missing_values(df):
    """Summarise, per column, how many entries are missing versus usable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with three columns: the number of
        NaN/missing entries, the number of remaining (usable) entries, and
        the usable entries as a percentage of all rows, rounded to two
        decimals.
    """
    n_rows = len(df)
    na_counts = df.isna().sum()
    present_counts = n_rows - na_counts
    present_pct = present_counts.div(n_rows).mul(100)

    report_columns = {}
    report_columns["NaN and Missing values"] = na_counts
    report_columns["Usable data"] = present_counts
    report_columns["(%) Usable Data"] = present_pct.round(2)
    return pd.DataFrame(report_columns)

In [3]:
def get_low_data_cols(df, threshold=10.0):
    """Return the columns whose share of non-missing values is below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float, optional
        Cut-off in percent (0-100). A column is reported when its percentage
        of non-missing rows is strictly less than this value. Defaults to
        10.0, the cut-off this function originally hard-coded.

    Returns
    -------
    list
        Column labels, in the column order of ``df``. An empty frame (no
        rows) yields an empty list.
    """
    total_rows = len(df)
    if total_rows == 0:
        # No rows means no meaningful percentage; report nothing rather than
        # relying on NaN comparisons from a 0/0 division.
        return []
    percent_usable_data = df.notna().sum() / total_rows * 100
    return [column for column, percent in percent_usable_data.items() if percent < threshold]

In [4]:
def show_loc(df):
    """Print every distinct ``LOC`` value with the number of rows carrying it.

    Counting is done on the full ``LOC`` value; only the printed label is
    shortened to its first two characters. Distinct values that share a
    two-character prefix therefore appear as separate lines with the same
    label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing a ``LOC`` column of strings.
    """
    loc_values, loc_counts = np.unique(df.LOC, return_counts=True)
    for position in range(len(loc_values)):
        label = loc_values[position][:2]
        print(f"Value: {label}, Frequency: {loc_counts[position]}")

In [5]:
def check_dict_match(df, county_dict_file='raw/county_dict.csv'):
    """Print the ``LOC`` prefixes that have no entry in the county acronym dictionary.

    The first two characters of each ``LOC`` value are upper-cased and looked
    up in the ``ACRONYM`` column of the dictionary CSV. Rows without a match
    are grouped by their prefix *as written in the data* (case is preserved,
    so ``GR`` and ``gr`` are listed separately), and for each prefix the
    ``YEAR`` of its first unmatched row and its row count are printed,
    followed by the overall total and its share of ``len(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``LOC`` column and a ``YEAR`` column.
    county_dict_file : str, optional
        Path to a CSV containing an ``ACRONYM`` column. Defaults to the
        location this function originally hard-coded.

    Returns
    -------
    None
        The report is written to standard output.
    """
    county_acronyms = pd.read_csv(county_dict_file, index_col='ACRONYM').index.to_list()

    # Compute the two-character prefix once instead of once per use.
    prefixes = df['LOC'].str[:2]
    is_unmatched = ~prefixes.str.upper().isin(county_acronyms)
    unmatched_cases = df[is_unmatched]
    unmatched_prefixes = prefixes[is_unmatched]
    unmatched_counts = unmatched_prefixes.value_counts()

    print("Cases with no dictionary match:")
    total = 0
    for acronym, count in unmatched_counts.items():
        # YEAR of the first unmatched row carrying this prefix.
        year = unmatched_cases['YEAR'][unmatched_prefixes == acronym].iloc[0]
        print(f"Acronym: {acronym}, Year: {year}, Frequency: {count}")
        total += count

    # An empty frame would otherwise divide by zero here.
    row_count = len(df)
    percent_unknown = (total * 100 / row_count) if row_count else 0.0
    print(f"Total unknown: {total}, %{percent_unknown:.2f}")

## Corn


In [6]:
# Load the corn records from CSV and preview the first rows.
corn = pd.read_csv("raw/1982-2020_corn.csv")
corn.head()

Unnamed: 0,YEAR,COUNTY,WATER_REGIME,PCODE,BRAND,NAME,YIELD,TW,MOIST,DAYS,HT,START_DATE,END_DATE
0,1982.0,Doniphan,Dryland,1538.0,,,134.63,58.5,20.95,,,,
1,1982.0,Doniphan,Dryland,1680.0,,,132.25,58.15,17.25,,,,
2,1982.0,Doniphan,Dryland,1370.0,,,146.87,56.7,17.25,,,,
3,1982.0,Doniphan,Dryland,1539.0,,,136.69,56.85,20.9,,,,
4,1982.0,Doniphan,Dryland,1190.0,,,132.52,55.45,16.95,,,,


In [7]:
# Per-column counts of missing vs. usable entries in the corn frame.
analyze_missing_values(corn)

Unnamed: 0,NaN and Missing values,Usable data,(%) Usable Data
YEAR,0,28816,100.0
COUNTY,0,28816,100.0
WATER_REGIME,0,28816,100.0
PCODE,0,28816,100.0
BRAND,7839,20977,72.8
NAME,6533,22283,77.33
YIELD,93,28723,99.68
TW,946,27870,96.72
MOIST,94,28722,99.67
DAYS,2266,26550,92.14


In [8]:
# Count rows per distinct COUNTY value. Values are compared as raw strings,
# so spellings that differ only by whitespace are counted separately (the
# recorded output lists both "Thomas" and "Thomas ").
unique_values, frequencies = np.unique(corn.COUNTY, return_counts=True)
for value, frequency in zip(unique_values, frequencies):
    print(f"Value: {value}, Frequency: {frequency}")

Value: Brown, Frequency: 778
Value: Clay, Frequency: 617
Value: Colby, Frequency: 235
Value: Doniphan, Frequency: 2713
Value: Ellis, Frequency: 455
Value: Finney, Frequency: 2731
Value: Franklin, Frequency: 2179
Value: Greeley, Frequency: 1818
Value: Harvey, Frequency: 480
Value: Labette, Frequency: 790
Value: Marshall, Frequency: 96
Value: McPherson, Frequency: 799
Value: Nemaha, Frequency: 316
Value: Neosho, Frequency: 807
Value: Ottawa, Frequency: 197
Value: Pottawatomie, Frequency: 433
Value: Reno, Frequency: 673
Value: Republic, Frequency: 3085
Value: Riley, Frequency: 1671
Value: Saline, Frequency: 242
Value: Shawnee, Frequency: 3159
Value: Stafford, Frequency: 2066
Value: Thomas, Frequency: 2386
Value: Thomas , Frequency: 57
Value: Wichita, Frequency: 33


## Soybean


In [105]:
# Load the soybean records from CSV and preview the first rows.
soybean = pd.read_csv("raw/1991-2022_soybean.csv")
soybean.head()

Unnamed: 0,YEAR,LOC,CITY,WATER_REGIME,PCODE,BRAND,NAME,YIELD
0,1991,BRD,Powhattan,Dryland,306,,,17.8
1,1991,BRD,Powhattan,Dryland,307,,,19.4
2,1991,BRD,Powhattan,Dryland,496,,,22.7
3,1991,BRD,Powhattan,Dryland,552,,,20.5
4,1991,BRD,Powhattan,Dryland,614,,,19.4


In [106]:
# Per-column counts of missing vs. usable entries in the soybean frame.
analyze_missing_values(soybean)

Unnamed: 0,NaN and Missing values,Usable data,(%) Usable Data
YEAR,0,19381,100.0
LOC,0,19381,100.0
CITY,18055,1326,6.84
WATER_REGIME,18055,1326,6.84
PCODE,0,19381,100.0
BRAND,1360,18021,92.98
NAME,605,18776,96.88
YIELD,0,19381,100.0


In [107]:
# List soybean LOC prefixes (first two characters) missing from the county acronym dictionary.
check_dict_match(soybean)

Cases with no dictionary match:
Acronym: FN, Year: 1991, Frequency: 892
Acronym: GR, Year: 1999, Frequency: 38
Total unknown: 930, %4.80


## Sunflower


In [108]:
# Load the sunflower records from CSV and preview the first rows.
sunflower = pd.read_csv("raw/1998-2019_sunflower.csv")
sunflower.head()

Unnamed: 0,YEAR,LOC,BRAND,NAME,YIELD,DAYS
0,1998,GRDO,MYCOGEN SEEDS,CAVALRY,2580.0,63.0
1,1998,GRDO,DEKALB,DK 3790,1929.0,58.0
2,1998,GRDO,AGRIPRO SEEDS INC.,AP 3470,1972.0,59.0
3,1998,GRDO,DEKALB,DK 3868,2141.0,61.0
4,1998,GRDO,DEKALB,DK 3875,2290.0,62.0


In [109]:
# Per-column counts of missing vs. usable entries in the sunflower frame.
analyze_missing_values(sunflower)


Unnamed: 0,NaN and Missing values,Usable data,(%) Usable Data
YEAR,0,3092,100.0
LOC,0,3092,100.0
BRAND,895,2197,71.05
NAME,200,2892,93.53
YIELD,0,3092,100.0
DAYS,57,3035,98.16


In [110]:
# List sunflower LOC prefixes (first two characters) missing from the county acronym dictionary.
# Prefixes are reported with their original casing, so "GR" and "gr" appear as separate entries.
check_dict_match(sunflower)

Cases with no dictionary match:
Acronym: GR, Year: 1998, Frequency: 393
Acronym: gr, Year: 2005, Frequency: 308
Acronym: NU, Year: 2001, Frequency: 83
Acronym: SR, Year: 2001, Frequency: 63
Acronym: sr, Year: 2005, Frequency: 18
Total unknown: 865, %27.98


## Wheat


In [111]:
# Load the wheat records from CSV and preview the first rows.
wheat = pd.read_csv("raw/1982-2022_wheat.csv")
wheat.head()

Unnamed: 0,YEAR,LOC,BRAND,NAME,YIELD,TW,MOIST,HEAD,HT
0,1982,BRD,,,41.97,60.0,,144.0,42.0
1,1982,BRD,,,44.71,55.75,,143.75,40.0
2,1982,BRD,,,40.32,55.0,,142.25,36.0
3,1982,BRD,,,40.66,58.75,,140.25,41.75
4,1982,BRD,,,35.93,56.0,,142.75,42.75


In [112]:
# Per-column counts of missing vs. usable entries in the wheat frame.
analyze_missing_values(wheat)

Unnamed: 0,NaN and Missing values,Usable data,(%) Usable Data
YEAR,0,24299,100.0
LOC,0,24299,100.0
BRAND,6800,17499,72.02
NAME,6219,18080,74.41
YIELD,165,24134,99.32
TW,689,23610,97.16
MOIST,7261,17038,70.12
HEAD,4932,19367,79.7
HT,3674,20625,84.88


In [113]:
# List wheat LOC prefixes (first two characters) missing from the county acronym dictionary.
check_dict_match(wheat)

Cases with no dictionary match:
Acronym: FN, Year: 1982, Frequency: 2310
Acronym: GR, Year: 1982, Frequency: 1910
Acronym: FD, Year: 2003, Frequency: 330
Acronym: NW, Year: 2003, Frequency: 269
Acronym: NC, Year: 2002, Frequency: 253
Acronym: IR, Year: 2003, Frequency: 191
Acronym: NE, Year: 2003, Frequency: 174
Acronym: SE, Year: 2003, Frequency: 137
Acronym: KI, Year: 2018, Frequency: 122
Acronym: PH, Year: 2002, Frequency: 103
Acronym: SO, Year: 2003, Frequency: 93
Total unknown: 5892, %24.25


## General


In [59]:
# Total number of rows in the wheat frame.
len(wheat)

24299

In [45]:
# Print, for each crop frame in turn, the columns that get_low_data_cols
# flags as having too little usable data.
for crop_frame in (corn, soybean, sunflower, wheat):
    print(get_low_data_cols(crop_frame))

['HT', 'START_DATE', 'END_DATE']
['CITY', 'WATER_REGIME']
[]
[]


## Location Dict
