In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np
import joblib

In [9]:
# Read the raw land-price listings from disk and preview the first five rows
df = pd.read_csv(filepath_or_buffer="Land_price_estimation.csv")
df.head(n=5)

Unnamed: 0,Plot_Number,Location,Size (sqm),Price (RWF),Zoning,Nearby Amenities,Distance to City Center (km),Land Type,Price per sqm (RWF)
0,Plot_1,Huye,351,6823440,Commercial,"School, Market",5,Residential,19440.0
1,Plot_2,Ngoma,478,6711120,Commercial,"Hospital, University",7,Residential,14040.0
2,Plot_3,Tumba,1334,27373680,Commercial,"Market, University",1,Residential,20520.0
3,Plot_4,Rukira,869,5474700,Residential,"School, Hospital",13,Agricultural,6300.0
4,Plot_5,Huye,740,11988000,Agricultural,"School, Hospital",2,Residential,16200.0


In [10]:
# Drop columns that must not reach the model:
#   - "Plot_Number" looks like a per-row identifier (Plot_1, Plot_2, ...).
#   - "Price per sqm (RWF)" multiplied by "Size (sqm)" reproduces "Price (RWF)"
#     in the preview rows, so keeping it would leak the target.
# NOTE: this cell reassigns `df`. Re-run the load cell before re-running this one,
# otherwise the drops raise KeyError because the columns are already gone.
df = df.drop(columns=["Plot_Number", "Price per sqm (RWF)"])

# Drop rows with missing values (or you can impute).
# This must happen BEFORE the merge below: astype(str) can turn a missing value
# into the literal text "nan", which dropna() would no longer recognise, and a
# bogus "nan - ..." category would then be learned by the encoder.
df = df.dropna()

# Merge 'Zoning' and 'Land Type' into one column (whitespace-trimmed, joined by " - ")
df['Zoning_LandType'] = df['Zoning'].astype(str).str.strip() + " - " + df['Land Type'].astype(str).str.strip()
df = df.drop(columns=["Zoning", "Land Type"])

print(f"✅ Cleaned dataset shape: {df.shape}")

✅ Cleaned dataset shape: (1000, 6)


In [11]:
# Separate the prediction target (price) from the predictor columns
y = df["Price (RWF)"]
X = df.drop("Price (RWF)", axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Identify columns
# Numeric columns are selected positively with include='number'; everything else
# is treated as categorical. Selecting numerics as "not object" would also sweep
# in category / string-dtype / datetime columns and send them to the mean imputer.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = [col for col in X.columns if col not in numerical_cols]

# Preprocessors
# Numeric features: fill gaps with the column mean (no scaling is applied).
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])
# Categorical features: fill gaps with the most frequent label, then one-hot
# encode. handle_unknown="ignore" makes labels unseen during fit encode as all
# zeros at predict time instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
# Route each column group through its own transformer; the "num"/"cat" names
# become the prefixes of the output feature names.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

# Models
# Candidate regressors keyed by display name; seeds are fixed where the
# estimator accepts one so that runs are repeatable.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}


In [13]:
# One summary row per candidate model is collected here
results = []

for name, model in models.items():
    print(f"\n🔧 Training {name}...")

    # Preprocessing and estimator are chained so both are fitted together
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])

    # Fit on the training split, then score on the held-out test split
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # 5-fold cross-validated R² computed on the training split only
    cv_scores = cross_val_score(pipeline, X_train, y_train, scoring='r2', cv=5)

    row = {"Model": name}
    row["R2_Score"] = r2_score(y_test, y_pred)
    row["MAE"] = mean_absolute_error(y_test, y_pred)
    row["RMSE"] = np.sqrt(mean_squared_error(y_test, y_pred))
    row["CV_Mean"] = cv_scores.mean()
    row["CV_Std"] = cv_scores.std()
    results.append(row)

# Rank the candidates by test-set R², best first, and print the table as plain text
results_df = pd.DataFrame(results).sort_values(by="R2_Score", ascending=False)
print("\n📊 Model Comparison:")
print(results_df.to_string(index=False))


🔧 Training Linear Regression...

🔧 Training Random Forest...

🔧 Training XGBoost...

📊 Model Comparison:
            Model  R2_Score          MAE         RMSE  CV_Mean   CV_Std
          XGBoost  0.925026 1.575433e+06 2.243948e+06 0.909966 0.011043
    Random Forest  0.907022 1.802510e+06 2.498892e+06 0.900794 0.009958
Linear Regression  0.875425 2.192825e+06 2.892488e+06 0.850834 0.012129


In [15]:
# Save best model
# Pick the winner by mean cross-validated R² rather than by the test-set score:
# choosing on the test set would make the reported test metrics optimistic.
# idxmax also removes any dependence on how results_df happens to be sorted.
best_model_name = results_df.loc[results_df["CV_Mean"].idxmax(), "Model"]
print(f"\n🏆 Best Model: {best_model_name}")

# Rebuild the full preprocessing + estimator pipeline for the winner and fit it
# on the training split, so the saved artifact accepts raw feature columns.
best_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', models[best_model_name])
])
best_pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to disk with joblib
joblib.dump(best_pipeline, "best_model.pkl")
print("✅ Model saved as 'best_model.pkl'")



🏆 Best Model: XGBoost
✅ Model saved as 'best_model.pkl'


In [16]:
# Re-read the raw CSV and rebuild the merged zoning feature the model expects
df_orig = pd.read_csv("Land_price_estimation.csv")
zoning_part = df_orig['Zoning'].astype(str).str.strip()
land_type_part = df_orig['Land Type'].astype(str).str.strip()
df_orig['Zoning_LandType'] = zoning_part + " - " + land_type_part

# Keep only the predictor columns (identifier, target and source columns removed)
X_full = df_orig.drop(
    ["Plot_Number", "Price per sqm (RWF)", "Price (RWF)", "Zoning", "Land Type"],
    axis=1,
)

# Score every row with the persisted pipeline.
# NOTE: this is the same file the model was trained on, so most rows were seen
# during fitting; the comparison below is not a hold-out estimate.
model = joblib.load("best_model.pkl")
predictions = model.predict(X_full)

# Attach the predictions next to the original columns
df_orig['Predicted Price (RWF)'] = predictions

# Print the first ten actual/predicted pairs
print("\n📄 Sample Predictions vs Actual:")
print(df_orig.head(10)[["Price (RWF)", "Predicted Price (RWF)"]])

# Write the augmented table to CSV without the index column
df_orig.to_csv("land_price_with_predictions.csv", index=False)
print("✅ Predictions saved to 'land_price_with_predictions.csv'")



📄 Sample Predictions vs Actual:
   Price (RWF)  Predicted Price (RWF)
0      6823440             6881351.50
1      6711120             6698110.50
2     27373680            27322616.00
3      5474700             5465725.50
4     11988000            11880898.00
5      7741440             7739233.00
6      4978800             5295048.50
7     15630300            15315454.00
8      3910500             4135653.75
9      9630720             9506923.00
✅ Predictions saved to 'land_price_with_predictions.csv'


In [17]:
# Load your trained model (joblib is already imported in the notebook's import cell)
model = joblib.load("best_model.pkl")

# Grab the fitted ColumnTransformer from inside the pipeline.
# A distinct name is used so this cell does not overwrite the `preprocessor`
# object defined in the preprocessing cell, which the training cells reuse.
fitted_preprocessor = model.named_steps['preprocessor']

# Get all final feature names (after preprocessing)
try:
    feature_names = fitted_preprocessor.get_feature_names_out()
    print("✅ Model was trained with these features:\n", feature_names)
except AttributeError:
    # Raised when the transformer has no get_feature_names_out method
    print("⚠️ Could not extract feature names (you may be using older sklearn version).")


✅ Model was trained with these features:
 ['num__Size (sqm)' 'num__Distance to City Center (km)'
 'cat__Location_Huye' 'cat__Location_Ngoma' 'cat__Location_Rukira'
 'cat__Location_Tumba' 'cat__Nearby Amenities_Hospital, University'
 'cat__Nearby Amenities_Market, University'
 'cat__Nearby Amenities_School, Hospital'
 'cat__Nearby Amenities_School, Market'
 'cat__Zoning_LandType_Agricultural - Agricultural'
 'cat__Zoning_LandType_Agricultural - Commercial'
 'cat__Zoning_LandType_Agricultural - Residential'
 'cat__Zoning_LandType_Commercial - Agricultural'
 'cat__Zoning_LandType_Commercial - Commercial'
 'cat__Zoning_LandType_Commercial - Residential'
 'cat__Zoning_LandType_Residential - Agricultural'
 'cat__Zoning_LandType_Residential - Commercial'
 'cat__Zoning_LandType_Residential - Residential']
