In [5]:
import argparse
import os
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

from preprocess import load_and_concat, build_composite_and_label


In [6]:
# Raw CSV inputs, one file per city.
# Raw string literals (r"...") keep the Windows backslashes literal. In an
# ordinary string, sequences such as "\P" and "\d" are invalid escapes that
# newer Python versions warn about and plan to reject.
# NOTE(review): these are absolute paths on one machine — consider a
# configurable data directory so the notebook runs elsewhere.
data_paths = [
    r"E:\Project_land_Recommender\data\Delhi1.csv",
    r"E:\Project_land_Recommender\data\Mumbai1.csv",
    r"E:\Project_land_Recommender\data\Kolkata1.csv",
    r"E:\Project_land_Recommender\data\Chennai1.csv",
]


In [7]:
# Hand every input path to the project helper `load_and_concat` and keep the
# result as `df`; the number of loaded records is reported afterwards.
print("\nLoading datasets...")
df = load_and_concat(data_paths)

total_records = len(df)
print("Total records:", total_records)



Loading datasets...
Total records: 24242


In [8]:
print("\nGenerating features and labels...")

# The project helper returns the updated frame plus a 3-item collection that
# is unpacked below as the safety, infrastructure and environment columns.
# NOTE: `df` is rebound here, so re-running this cell feeds the already
# processed frame back into the helper.
df, used_columns = build_composite_and_label(df)
safety_col, infra_col, env_col = used_columns

print("Columns used:")
for heading, column_name in (
    ("Safety:", safety_col),
    ("Infrastructure:", infra_col),
    ("Environment:", env_col),
):
    print(heading, column_name)



Generating features and labels...
Columns used:
Safety: 24X7Security
Infrastructure: Area
Environment: RainWaterHarvesting


In [9]:
# Write the processed frame to CSV (without the index column), creating the
# output directory first if it does not exist yet.
processed_path = "data/combined_processed.csv"

os.makedirs("data", exist_ok=True)
df.to_csv(processed_path, index=False)

print(f"\n✅ Processed data saved at: {processed_path}")



✅ Processed data saved at: data/combined_processed.csv


In [10]:
# Column names for the model inputs and for the prediction target.
features = ["_safety_n", "_infra_n", "_env_n", "_composite_score"]
target = "recommendation_label"

# Missing feature values are filled with 0; the target column is taken as-is.
# NOTE(review): if `recommendation_label` is derived from `_composite_score`
# inside `preprocess`, using that score as a feature leaks the label and would
# explain perfect test accuracy — confirm against the helper.
X = df.loc[:, features].fillna(0)
y = df.loc[:, target]

print("Feature shape:", X.shape)
print("Target shape:", y.shape)


Feature shape: (24242, 4)
Target shape: (24242,)


In [11]:
# Standardise the feature matrix and persist the fitted scaler with joblib.
# NOTE(review): this fit uses every row of X; fitting on the training split
# only would keep held-out statistics out of the transform.
os.makedirs("models", exist_ok=True)

scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

joblib.dump(scaler, "models/scaler.joblib")
print("✅ Scaler saved at models/scaler.joblib")


✅ Scaler saved at models/scaler.joblib


In [12]:
# Split the *unscaled* features first, then fit the scaler on the training
# rows only. Fitting the scaler on all rows before splitting lets statistics
# from the held-out rows leak into the features the models are trained on.
# The split uses the same test_size and random_state as before, so the same
# rows end up in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/scale from train only
X_test = scaler.transform(X_test_raw)        # reuse the train statistics

# Overwrite the saved scaler so the artifact on disk matches the transform
# the models are actually trained with.
os.makedirs("models", exist_ok=True)
joblib.dump(scaler, "models/scaler.joblib")

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 19393
Testing samples: 4849


In [13]:
# Candidate classifiers, keyed by the name used in log lines and file names.
models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

# Start below any attainable accuracy so a model is always selected. With a
# starting value of 0 and the strict ">" below, a run in which every model
# scored 0.0 would leave `best_model` as None.
best_score = float("-inf")
best_model_name = None
best_model = None

print("\nTraining models...")

# NOTE(review): the winner is chosen on the same test split whose accuracy is
# reported as the final score, so that score is optimistically biased; a
# separate validation split or cross-validation would avoid this.
for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # Save each model
    model_path = f"models/{name}_model.joblib"
    joblib.dump(model, model_path)
    print(f"✅ Saved model at: {model_path}")

    # Track best model; strict ">" means the first model wins a tie.
    if acc > best_score:
        best_score = acc
        best_model_name = name
        best_model = model



Training models...

Training RandomForest...
RandomForest Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1086
           1       1.00      1.00      1.00      2154
           2       1.00      1.00      1.00      1609

    accuracy                           1.00      4849
   macro avg       1.00      1.00      1.00      4849
weighted avg       1.00      1.00      1.00      4849

✅ Saved model at: models/RandomForest_model.joblib

Training GradientBoosting...
GradientBoosting Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1086
           1       1.00      1.00      1.00      2154
           2       1.00      1.00      1.00      1609

    accuracy                           1.00      4849
   macro avg       1.00      1.00      1.00      4849
weighted avg       1.00      1.00      1.00      4849

✅ Saved model at: models/GradientBoosting_model.joblib

In [14]:
# Persist the highest-scoring model under a fixed file name and print a short
# summary of the run.
best_model_path = "models/best_model.joblib"
joblib.dump(best_model, best_model_path)

print("\n✅ TRAINING COMPLETE")
print("Best Model:", best_model_name)
print("Best Accuracy:", best_score)
print("Saved as: models/best_model.joblib")



✅ TRAINING COMPLETE
Best Model: RandomForest
Best Accuracy: 1.0
Saved as: models/best_model.joblib
