In [4]:
from pathlib import Path
import pandas as pd
import json
from collections import Counter

ROOT = Path().resolve()
RAW = ROOT / "D:/project=2/Agrigpt/data_csv"
REPORTS = ROOT / "reports"
REPORTS.mkdir(parents=True, exist_ok=True)

SOURCES = {
    "bangladesh_agri": ["*.csv"],
    "spas_bd": ["SPAS-Dataset-BD*.csv"],
    "uddipok": ["RC_Dataset_v2*.csv"],
}

print("Root:", ROOT)
print("Raw dir:", RAW)
list(RAW.glob("*"))


Root: D:\project=2\Agrigpt\notebook
Raw dir: D:\project=2\Agrigpt\data_csv


[WindowsPath('D:/project=2/Agrigpt/data_csv/raw')]

In [5]:
import json
from pathlib import Path

RAW = Path("D:/project=2/Agrigpt/data_csv/raw")

def load_metadata(source: str):
    meta = RAW / source / "metadata.jsonl"
    if not meta.exists():
        return []
    return [json.loads(l) for l in meta.read_text(encoding="utf-8").splitlines()]

for src in ["bangladesh_agri", "spas_bd", "uddipok"]:
    rows = load_metadata(src)
    print(f"{src}: {len(rows)} records")


bangladesh_agri: 5 records
spas_bd: 4 records
uddipok: 3 records


In [6]:
records = []
for src, patterns in SOURCES.items():
    for pat in patterns:
        for f in (RAW / src).glob(pat):
            try:
                df = pd.read_csv(f, nrows=5)
                shape = pd.read_csv(f).shape
                records.append({
                    "source": src,
                    "file": f.name,
                    "rows": shape[0],
                    "cols": shape[1],
                    "columns": list(df.columns),
                })
            except Exception as e:
                records.append({
                    "source": src,
                    "file": f.name,
                    "error": str(e),
                })

audit_df = pd.DataFrame(records)
audit_df


Unnamed: 0,source,file,rows,cols,columns
0,bangladesh_agri,00._Bangladesh_Agricultural_Raw_Data.csv,103,11,"[Season, Transplant, Growth, Harvest, Products..."
1,bangladesh_agri,00._Bangladesh_Agricultural_Raw_Data_.csv,103,11,"[Season, Transplant, Growth, Harvest, Products..."
2,spas_bd,SPAS-Dataset-BD.csv,4608,15,"[Area, AP Ratio, District, Season, Avg Temp, A..."
3,uddipok,RC_Dataset_v2.csv,3799,4,"[Unnamed: 0, Passage, Question, AnsText]"


In [7]:
for src, patterns in SOURCES.items():
    for pat in patterns:
        for f in (RAW / src).glob(pat):
            print(f"\n=== {src}: {f.name} ===")
            try:
                df = pd.read_csv(f)
                print("Shape:", df.shape)
                print("Columns:", df.columns.tolist())
                print(df.dtypes)
                print("Null counts:\n", df.isnull().sum().sort_values(ascending=False).head())
            except Exception as e:
                print("Error loading:", e)



=== bangladesh_agri: 00._Bangladesh_Agricultural_Raw_Data.csv ===
Shape: (103, 11)
Columns: ['Season', 'Transplant', 'Growth', 'Harvest', 'Products name', 'Crops Type', 'Max Temp', 'Min Temp', 'Max Relative Humidity', 'Min Relative Humidity', 'Country']
Season                    object
Transplant                object
Growth                    object
Harvest                   object
Products name             object
Crops Type                object
Max Temp                 float64
Min Temp                 float64
Max Relative Humidity      int64
Min Relative Humidity      int64
Country                   object
dtype: object
Null counts:
 Season           0
Transplant       0
Growth           0
Harvest          0
Products name    0
dtype: int64

=== bangladesh_agri: 00._Bangladesh_Agricultural_Raw_Data_.csv ===
Shape: (103, 11)
Columns: ['Season', 'Transplant', 'Growth', 'Harvest', 'Products name', 'Crops Type', 'Max Temp', 'Min Temp', 'Max Relative Humidity', 'Min Relative Humidity', '

In [8]:
summary = audit_df[["source","file","rows","cols"]]
out_path = REPORTS / "data_audit_summary.csv"
summary.to_csv(out_path, index=False)
print("Saved summary →", out_path)
summary


Saved summary → D:\project=2\Agrigpt\notebook\reports\data_audit_summary.csv


Unnamed: 0,source,file,rows,cols
0,bangladesh_agri,00._Bangladesh_Agricultural_Raw_Data.csv,103,11
1,bangladesh_agri,00._Bangladesh_Agricultural_Raw_Data_.csv,103,11
2,spas_bd,SPAS-Dataset-BD.csv,4608,15
3,uddipok,RC_Dataset_v2.csv,3799,4


In [9]:
from pathlib import Path
import json, pandas as pd

ROOT = Path().resolve()
RAW = ROOT / "D:/project=2/Agrigpt/data_csv/raw"
INTERIM = ROOT / "data" / "interim"
INTERIM.mkdir(parents=True, exist_ok=True)

SRC = {
    "bangladesh_agri": next((RAW/"bangladesh_agri").glob("*.csv")),
    "spas_bd": next((RAW/"spas_bd").glob("SPAS-Dataset-BD*.csv")),
    "uddipok": next((RAW/"uddipok").glob("RC_Dataset_v2*.csv")),
}

def load_meta(source):
    p = RAW/source/"metadata.jsonl"
    return [] if not p.exists() else [json.loads(l) for l in p.read_text(encoding="utf-8").splitlines()]


In [10]:
def dedupe_metadata(source):
    p = RAW/source/"metadata.jsonl"
    if not p.exists(): 
        return 0
    lines = [json.loads(l) for l in p.read_text(encoding="utf-8").splitlines()]
    seen, out = set(), []
    for rec in lines:
        key = (rec.get("file_name"), rec.get("sha256"))
        if key not in seen:
            seen.add(key); out.append(rec)
    p.write_text("\n".join(json.dumps(r, ensure_ascii=False) for r in out) + "\n", encoding="utf-8")
    return len(lines) - len(out)

for s in ["bangladesh_agri","spas_bd","uddipok"]:
    removed = dedupe_metadata(s)
    print(f"{s}: removed {removed} duplicate metadata lines")


bangladesh_agri: removed 3 duplicate metadata lines
spas_bd: removed 3 duplicate metadata lines
uddipok: removed 2 duplicate metadata lines


In [11]:
import hashlib, os

def sha256(path, chunk=1<<20):
    h=hashlib.sha256()
    with open(path,"rb") as f:
        while True:
            b=f.read(chunk)
            if not b: break
            h.update(b)
    return h.hexdigest()

bagri_dir = RAW/"bangladesh_agri"
files = list(bagri_dir.glob("*.csv"))
hashes = {}
for f in files:
    h = sha256(f)
    print(f.name, h)
    if h in hashes:
        # keep the first, remove the duplicate
        print("  -> duplicate; removing", f.name)
        os.remove(f)
    else:
        hashes[h] = f


00._Bangladesh_Agricultural_Raw_Data.csv 66b68e2b6f3e92ae61872378b106e156805ebffc356ee324e55a741da91006b3
00._Bangladesh_Agricultural_Raw_Data_.csv 66b68e2b6f3e92ae61872378b106e156805ebffc356ee324e55a741da91006b3
  -> duplicate; removing 00._Bangladesh_Agricultural_Raw_Data_.csv


In [12]:
issues = []

# Bangladesh Agri
ba = pd.read_csv(SRC["bangladesh_agri"])
issues.append({"source":"bangladesh_agri","file":SRC["bangladesh_agri"].name,
               "rows":len(ba), "null_Season":ba["Season"].isna().sum(),
               "notes":"OK; check duplicate file removed"})

# SPAS
sp = pd.read_csv(SRC["spas_bd"])
issues.append({"source":"spas_bd","file":SRC["spas_bd"].name,
               "rows":len(sp), "null_Season":sp["Season"].isna().sum(),
               "notes":"AP Ratio is object; 1 missing Season"})

# UDDIPOK
ud = pd.read_csv(SRC["uddipok"])
issues.append({"source":"uddipok","file":SRC["uddipok"].name,
               "rows":len(ud), "null_Question":ud["Question"].isna().sum(),
               "notes":"Drop Unnamed: 0; remove rows with null Question"})

issues_df = pd.DataFrame(issues)
issues_df


Unnamed: 0,source,file,rows,null_Season,notes,null_Question
0,bangladesh_agri,00._Bangladesh_Agricultural_Raw_Data.csv,103,0.0,OK; check duplicate file removed,
1,spas_bd,SPAS-Dataset-BD.csv,4608,1.0,AP Ratio is object; 1 missing Season,
2,uddipok,RC_Dataset_v2.csv,3799,,Drop Unnamed: 0; remove rows with null Question,2.0


In [13]:
# ---------- Bangladesh Agri: clean ----------
ba = (ba
      .rename(columns={
          "Products name":"crop_name",
          "Crops Type":"crop_type",
          "Max Temp":"max_temp_c",
          "Min Temp":"min_temp_c",
          "Max Relative Humidity":"max_rh",
          "Min Relative Humidity":"min_rh",
          "Season":"season",
          "Transplant":"transplant",
          "Growth":"growth",
          "Harvest":"harvest",
          "Country":"country",
      }))
# types
for col in ["max_temp_c","min_temp_c","max_rh","min_rh"]:
    ba[col] = pd.to_numeric(ba[col], errors="coerce")
for col in ["season","transplant","growth","harvest","crop_name","crop_type","country"]:
    ba[col] = ba[col].astype(str).str.strip()

ba_out = INTERIM/"bangladesh_agri_clean.csv"
ba.to_csv(ba_out, index=False)
print("Wrote", ba_out)

# ---------- SPAS: clean ----------
sp = (sp
      .rename(columns={
          "Area":"area_km2",
          "AP Ratio":"ap_ratio",
          "District":"district",
          "Season":"season",
          "Avg Temp":"avg_temp_c",
          "Avg Humidity":"avg_humidity",
          "Crop Name":"crop_name",
          "Transplant":"transplant",
          "Growth":"growth",
          "Harvest":"harvest",
          "Production":"production_tons",
          "Max Temp":"max_temp_c",
          "Min Temp":"min_temp_c",
          "Max Relative Humidity":"max_rh",
          "Min Relative Humidity":"min_rh",
      }))

# trim strings
for c in ["district","season","crop_name","transplant","growth","harvest","ap_ratio"]:
    sp[c] = sp[c].astype(str).str.strip()

# coerce numerics
for c in ["avg_temp_c","avg_humidity","production_tons","max_temp_c","min_temp_c","max_rh","min_rh","area_km2"]:
    sp[c] = pd.to_numeric(sp[c], errors="coerce")

# handle missing Season
sp["season"] = sp["season"].replace(["", "nan", "None"], pd.NA)
# choose: fill or drop
sp_missing = sp["season"].isna().sum()
print("SPAS missing season:", sp_missing)
# example: fill with "Unknown"
sp["season"] = sp["season"].fillna("Unknown")

sp_out = INTERIM/"spas_bd_clean.csv"
sp.to_csv(sp_out, index=False)
print("Wrote", sp_out)

# ---------- UDDIPOK: clean ----------
ud = ud.drop(columns=[c for c in ud.columns if c.lower().startswith("unnamed")], errors="ignore")
ud["Question"] = ud["Question"].astype(str).str.strip()
ud = ud[~ud["Question"].isna() & (ud["Question"].str.len() > 0)]  # drop empties

ud_out = INTERIM/"uddipok_clean.csv"
ud.to_csv(ud_out, index=False)
print("Wrote", ud_out)


Wrote D:\project=2\Agrigpt\notebook\data\interim\bangladesh_agri_clean.csv
SPAS missing season: 1
Wrote D:\project=2\Agrigpt\notebook\data\interim\spas_bd_clean.csv
Wrote D:\project=2\Agrigpt\notebook\data\interim\uddipok_clean.csv


In [15]:
summary = pd.DataFrame([
    {"source":"bangladesh_agri","file":SRC["bangladesh_agri"].name,"rows":len(ba),"cols":ba.shape[1]},
    {"source":"spas_bd","file":SRC["spas_bd"].name,"rows":len(sp),"cols":sp.shape[1]},
    {"source":"uddipok","file":SRC["uddipok"].name,"rows":len(ud),"cols":ud.shape[1]},
])
out = ROOT/"reports"/"data_audit_summary.csv"
summary.to_csv(out, index=False)
print("Saved →", out)
summary


Saved → D:\project=2\Agrigpt\notebook\reports\data_audit_summary.csv


Unnamed: 0,source,file,rows,cols
0,bangladesh_agri,00._Bangladesh_Agricultural_Raw_Data.csv,103,11
1,spas_bd,SPAS-Dataset-BD.csv,4608,15
2,uddipok,RC_Dataset_v2.csv,3799,3
