In [1]:
import os
import pandas as pd
import numpy as np


In [2]:
def load_and_concat(paths):
    """Read every CSV in ``paths`` and stack them into a single DataFrame.

    Each loaded frame gets a ``source_file`` column holding the base name of
    the file it came from, so rows stay traceable after concatenation.

    Parameters
    ----------
    paths : iterable of str
        CSV file paths, read in the given order.

    Returns
    -------
    pandas.DataFrame
        All rows from all files with a fresh 0..n-1 index. Columns are the
        union of the inputs' columns in order of first appearance.

    Raises
    ------
    FileNotFoundError
        As soon as a path that does not exist is reached.
    """
    def _read_tagged(p):
        # Check before reading so the error names the offending path.
        if not os.path.exists(p):
            raise FileNotFoundError(f"Missing file: {p}")
        frame = pd.read_csv(p)
        frame['source_file'] = os.path.basename(p)
        return frame

    frames = [_read_tagged(p) for p in paths]
    return pd.concat(frames, ignore_index=True, sort=False)


In [3]:
def find_first(cols, keywords):
    """Return the first column name that contains one of ``keywords``.

    Matching is a substring test against the lower-cased column name.
    Keywords are tried in the order given, and for each keyword the columns
    are scanned in order, so an earlier keyword wins over an earlier column.

    Returns ``None`` when no column matches any keyword.
    """
    hits = (
        name
        for word in keywords
        for name in cols
        if word in name.lower()
    )
    return next(hits, None)


In [4]:
def build_composite_and_label(
    df,
    safety_keywords=["safety","crime","security","police","24x7"],
    infra_keywords=["infra","infrastructure","road","area","connect","transport"],
    env_keywords=["env","pollution","air","green","water","rain","rainwater","waste"],
    weights=(0.4, 0.35, 0.25)
):
    """Score each row on safety, infrastructure and environment, then bucket it.

    One source column is picked per dimension. A column whose lower-cased name
    contains one of the dimension's keywords is preferred (see ``find_first``).
    If no name matches, a numeric column is used instead: the dimension's
    positional slot among the numeric candidates (0 = safety, 1 = infra,
    2 = env), skipping forward past any column already used by another
    dimension. A dimension with no usable column contributes 0.

    The chosen columns are coerced to numeric (unparseable values become NaN),
    min-max scaled to [0, 1] and combined as a weighted sum. If the
    environment column's name looks like a pollution measure, its scaled value
    is inverted so that higher always means better.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. The caller's frame is not modified; a copy is annotated.
    safety_keywords, infra_keywords, env_keywords : sequence of str
        Lower-case substrings in priority order. The default lists are only
        read, never mutated.
    weights : tuple of three numbers
        ``(safety, infra, env)`` weights for the composite score.

    Returns
    -------
    (pandas.DataFrame, tuple)
        The annotated copy and ``(safety_col, infra_col, env_col)``, where an
        entry is ``None`` if no column could be found. Added columns are
        ``_safety_n``, ``_infra_n``, ``_env_n``, ``_composite_score`` and
        ``recommendation_label`` (0 for scores up to 0.33, 1 up to 0.67,
        2 above). In the returned frame the chosen source columns hold their
        numeric-coerced values.
    """
    # Columns this function writes. They are excluded from source selection so
    # that re-running on an already annotated frame does not, for example,
    # pick "_safety_n" as the safety column.
    generated_cols = (
        '_safety_n', '_infra_n', '_env_n',
        '_composite_score', 'recommendation_label',
    )
    id_like_names = ("id", "index", "serial", "sr", "sno")

    # Annotate a copy so the caller's frame keeps its original values.
    df = df.copy()

    colnames = [
        c for c in df.columns
        if isinstance(c, str) and c not in generated_cols
    ]

    safety_col = find_first(colnames, safety_keywords)
    infra_col = find_first(colnames, infra_keywords)
    env_col = find_first(colnames, env_keywords)

    def is_fallback_candidate(col):
        # str() keeps non-string column labels (e.g. integers) from crashing.
        name = str(col).lower()
        return (
            col not in generated_cols
            and name not in id_like_names
            # "Unnamed: N" is the name pandas gives a header-less CSV column,
            # typically a saved row index, which is not a meaningful score.
            and not name.startswith("unnamed:")
        )

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_candidates = [c for c in numeric_cols if is_fallback_candidate(c)]

    # Fill dimensions that had no keyword match. Each keeps its positional
    # slot when that slot is free, but never reuses a column that already
    # feeds another dimension (that would double-count it in the score).
    chosen = [safety_col, infra_col, env_col]
    for position, current in enumerate(chosen):
        if current is not None:
            continue
        in_use = [c for c in chosen if c is not None]
        chosen[position] = next(
            (c for c in numeric_candidates[position:] if c not in in_use),
            None,
        )
    safety_col, infra_col, env_col = chosen

    for col in chosen:
        if col is not None:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    def minmax(series):
        mn = series.min()
        mx = series.max()
        # All-missing or constant columns carry no ranking information, and
        # scaling them would divide by zero. Return real zeros here, not the
        # raw values, so the composite score stays within its expected range.
        if pd.isna(mn) or pd.isna(mx) or mx == mn:
            return pd.Series(0.0, index=series.index)
        return (series - mn) / (mx - mn)

    def normalised(col):
        return minmax(df[col]) if col is not None else 0.0

    df['_safety_n'] = normalised(safety_col)
    df['_infra_n'] = normalised(infra_col)
    df['_env_n'] = normalised(env_col)

    # For pollution-like measures a high raw value is bad, so flip the scale.
    env_name = str(env_col).lower() if env_col is not None else ""
    if any(k in env_name for k in ["pollut", "air", "pm2", "pm10", "noise", "contamin"]):
        df['_env_n'] = 1 - df['_env_n']

    w_s, w_i, w_e = weights

    # Missing normalised values count as 0 for that dimension.
    df['_composite_score'] = (
        w_s * df['_safety_n'].fillna(0) +
        w_i * df['_infra_n'].fillna(0) +
        w_e * df['_env_n'].fillna(0)
    )

    # Open-ended outer bins: identical labels for scores in [0, 1], while
    # scores pushed outside that range by custom weights still get a label
    # instead of NaN (which would make astype(int) raise).
    df['recommendation_label'] = pd.cut(
        df['_composite_score'],
        bins=[-np.inf, 0.33, 0.67, np.inf],
        labels=[0, 1, 2]
    ).astype(int)

    return df, (safety_col, infra_col, env_col)


In [8]:
# Raw strings keep the Windows backslashes literal. In an ordinary string
# literal, sequences such as "\P", "\d" or "\D" are invalid escape sequences:
# Python currently keeps them as-is but warns, and they are slated to become
# a syntax error. The resulting path values are unchanged.
# NOTE(review): these are absolute paths on one machine; consider a
# configurable data directory so the notebook runs elsewhere.
paths = [
    r"E:\Project_land_Recommender\data\Delhi1.csv",
    r"E:\Project_land_Recommender\data\Mumbai1.csv",
    r"E:\Project_land_Recommender\data\Kolkata1.csv",
    r"E:\Project_land_Recommender\data\Chennai1.csv"
]

# Load all city files into one frame, then add the composite score and label.
df = load_and_concat(paths)
df, used_columns = build_composite_and_label(df)

print("Detected Columns:", used_columns)
df.head()


Detected Columns: ('24X7Security', 'Area', 'RainWaterHarvesting')


Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,StaffQuarter,Cafeteria,MultipurposeRoom,Hospital,source_file,_safety_n,_infra_n,_env_n,_composite_score,recommendation_label
0,,,,,,,,,,,...,,,,,Delhi1.csv,,,,0.0,0
1,10500000.0,1200.0,Sector 10 Dwarka,2.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,Delhi1.csv,0.111111,0.063291,0.0,0.066596,0
2,6000000.0,1000.0,Uttam Nagar,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,Delhi1.csv,0.0,0.050633,0.0,0.017722,0
3,15000000.0,1350.0,Sarita Vihar,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,Delhi1.csv,0.0,0.072785,0.0,0.025475,0
4,2500000.0,435.0,Uttam Nagar,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,Delhi1.csv,0.0,0.014873,0.0,0.005206,0
