In [None]:
import pandas as pd

# Load the house-price dataset from the current working directory
df = pd.read_csv('house_prices_srilanka.csv')

# Preview the first few rows
print(df.head())

# Check the data types and whether any data is missing.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Count how many rows belong to each location category
print(df['Location'].value_counts())

In [None]:
# 1. Pick the model inputs (X) and the value to predict (y)
X_raw = df.loc[:, ["SquareFootage", "Bedrooms", "Location"]]
y = df["Price"]

# 2. One-hot encode the 'Location' column; the other columns pass through
X_processed = pd.get_dummies(data=X_raw, columns=["Location"])

# 3. Show the features before and after encoding for comparison
print("--- Data Before Encoding ---", X_raw.head(), sep="\n")
print("\n--- Data After One-Hot Encoding ---", X_processed.head(), sep="\n")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# 1. Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. Build a 100-tree random forest and fit it on the training rows
#    (fit() returns the estimator itself, so construction and training chain)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# 3. Score the predictions on the held-out rows
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
# Expect a very high score (e.g., 0.90+) because this is synthetic data
print(f"Model R-squared (R²): {r2:.3f}")

In [None]:
import joblib
import json

# 1. Persist the trained model to disk
joblib.dump(model, 'sri_lanka_model.pkl')

# 2. Capture the encoded feature column names, in order
MODEL_COLUMNS = X_processed.columns.to_list()
print("Model columns:", MODEL_COLUMNS)

# 3. Write the column list next to the model as JSON
with open('model_columns.json', 'w') as columns_file:
    columns_file.write(json.dumps(MODEL_COLUMNS))

print("Model and columns saved successfully!")