Check the data types of the columns in ReFED_US_State_Food_Surplus_Detail.csv.


In [None]:
import pandas as pd

In [None]:
# Sample the state-level detail file: the first line of the file is skipped
# and at most 1000 data rows are read.
df = pd.read_csv(
    "../data/raw/ReFED_US_State_Food_Surplus_Detail.csv",
    skiprows=1,
    nrows=1000,
)

In [None]:
# Print the frame summary (column names, non-null counts, dtypes) for the
# sample currently held in `df`.
df.info()

In [None]:
# Rebind `df` to a sample of the state-level summary file: the first line of
# the file is skipped and at most 1000 data rows are read.
df = pd.read_csv(
    "../data/raw/ReFED_US_State_Food_Surplus_Summary.csv",
    skiprows=1,
    nrows=1000,
)

In [None]:
# Show the column names of the frame currently held in `df` as a plain list.
print(df.columns.tolist())

In [None]:
# Print the frame summary (column names, non-null counts, dtypes) for the
# sample currently held in `df`.
df.info()

In [None]:
import great_expectations as gx
import pandas as pd
from pathlib import Path

# Read the US-level surplus summary, skipping the first line of the file.
csv_path = Path("../data/raw/ReFED_US_Food_Surplus_Summary.csv").resolve()
df = pd.read_csv(csv_path, skiprows=1)

print("Raw DataFrame columns:", df.columns.tolist())

# An ephemeral context keeps all Great Expectations state in memory.
context = gx.get_context(mode="ephemeral")

# Wrap the DataFrame in a Validator via the default pandas datasource.
# NOTE(review): the original author reports this call working on
# great_expectations 0.18.11 — confirm before upgrading the library.
validator = context.sources.pandas_default.read_dataframe(df)

# Sanity check: the columns Great Expectations sees for the batch.
print("GE batch columns:", validator.head().columns.tolist())

# Expectations on `tons_surplus`: never null, and never negative.
validator.expect_column_values_to_not_be_null(column="tons_surplus")
validator.expect_column_values_to_be_between(
    column="tons_surplus",
    min_value=0,
)

# Evaluate every registered expectation and report the overall outcome.
results = validator.validate()
print("Success?", results["success"])


In [None]:
import pandas as pd
import sys
from pathlib import Path

# Adds the parent of "scripts" (project root) to the Python path
sys.path.append(str(Path().resolve().parent))

from scripts.cleaning.constants import get_column_diff, get_column_overlap

# Absolute paths to the five raw ReFED surplus CSV exports.
surplus_summary_path       = Path("../data/raw/ReFED_US_Food_Surplus_Summary.csv").resolve()
surplus_cause_summary_path = Path("../data/raw/ReFED_US_Food_Surplus_Cause_Summary.csv").resolve()
surplus_detail_path        = Path("../data/raw/ReFED_US_Food_Surplus_Detail.csv").resolve()
surplus_state_summary_path = Path("../data/raw/ReFED_US_State_Food_Surplus_Summary.csv").resolve()
surplus_state_detail_path  = Path("../data/raw/ReFED_US_State_Food_Surplus_Detail.csv").resolve()

# Load each file in full, skipping the first line of every file.
df_surplus_summary         = pd.read_csv(surplus_summary_path, skiprows=1)
df_surplus_cause_summary   = pd.read_csv(surplus_cause_summary_path, skiprows=1)
# The US-level detail frame must come from its own file; it was previously
# read from the state detail path, which made it a copy of
# df_surplus_state_detail and left surplus_detail_path unused.
df_surplus_detail          = pd.read_csv(surplus_detail_path, skiprows=1)
df_surplus_state_summary   = pd.read_csv(surplus_state_summary_path, skiprows=1)
df_surplus_state_detail    = pd.read_csv(surplus_state_detail_path, skiprows=1)



In [None]:
# Compare column sets across the surplus tables.
# NOTE(review): get_column_diff(a, b) is presumed to return the columns of
# `a` that are missing from `b` — this matches how every "only in X" label
# below is paired with X as the first argument; confirm against
# scripts.cleaning.constants.

# 1) Overlap across all five tables, and US summary vs. cause summary.
shared_cols_total = get_column_overlap(df_surplus_summary, df_surplus_cause_summary, df_surplus_detail, df_surplus_state_detail, df_surplus_state_summary)
diff_surplus_summary = get_column_diff(df_surplus_summary, df_surplus_cause_summary)
diff_surplus_cause_summary = get_column_diff(df_surplus_cause_summary, df_surplus_summary)

print("Shared columns:\n", sorted(shared_cols_total))
print("\nColumns only in surplus_summary:\n", sorted(diff_surplus_summary))
print("\nColumns only in surplus_cause_summary:\n", sorted(diff_surplus_cause_summary))

# 2) Hypothesis: the two state tables share columns, and the cause summary
#    is the outlier.
shared_cols_state = get_column_overlap(df_surplus_state_detail, df_surplus_state_summary)
diff_surplus_state_summary = get_column_diff(df_surplus_state_summary, df_surplus_state_detail)
diff_surplus_state_detail = get_column_diff(df_surplus_state_detail, df_surplus_state_summary)
print("Shared state columns:\n", sorted(shared_cols_state))
print("\nColumns only in surplus_states_summary:\n", sorted(diff_surplus_state_summary))
print("\nColumns only in surplus_states_detail:\n", sorted(diff_surplus_state_detail))

# 3) Hypothesis: the state and US detail tables have the same schema.
shared_cols_detail = get_column_overlap(df_surplus_detail, df_surplus_state_detail)
diff_cols_detail = get_column_diff(df_surplus_detail, df_surplus_state_detail)
# State-only columns need the state frame first; the arguments used to be in
# the same order as the line above, so both printouts showed the same diff.
diff_cols_state_detail = get_column_diff(df_surplus_state_detail, df_surplus_detail)
print("\nShared columns in *_detail:\n", sorted(shared_cols_detail))
print("\nColumns only in surplus_detail:\n", sorted(diff_cols_detail))
print("\nColumns only in surplus_states_detail:\n", sorted(diff_cols_state_detail))




In [None]:
# US vs. state summary tables. Author's expectation: the schemas match,
# except that the state summary additionally carries the state as a column.

shared_cols_summary = get_column_overlap(df_surplus_summary, df_surplus_state_summary)
diff_cols_summary, diff_cols_state_summary = (
    get_column_diff(df_surplus_summary, df_surplus_state_summary),
    get_column_diff(df_surplus_state_summary, df_surplus_summary),
)

# Print each column set in sorted order under its heading.
for heading, columns in (
    ("\nShared columns in *_summary:\n", shared_cols_summary),
    ("\nColumns only in surplus_summary:\n", diff_cols_summary),
    ("\nColumns only in surplus_states_summary:\n", diff_cols_state_summary),
):
    print(heading, sorted(columns))