In [1]:
import os

# Setup: make sure the output folder for the saved model exists.
# exist_ok=True makes this cell safe to re-run.
output_dir = "ml"
os.makedirs(output_dir, exist_ok=True)


In [2]:
# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib
import mlflow

In [3]:
# Step 2: Load dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists. Consider a path relative to the
# notebook or a configurable data directory.
csv_path = "D:/Semester6/ML/Housing.csv"
df = pd.read_csv(csv_path)

# Quick sanity check of the first rows and column names.
preview = df.head()
print(preview)

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


In [4]:
# Step 3: Encode categorical columns
# Each listed column holds string categories (see the df.head() output above)
# and is replaced in place by integer codes.
cat_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating',
            'airconditioning', 'prefarea', 'furnishingstatus']

# Keep one fitted encoder per column. Re-fitting a single shared encoder
# overwrites its learned classes on every iteration, so only the last column's
# string -> integer mapping would survive, and new data could not be encoded
# consistently (or predictions' inputs decoded) later on.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# `le` is left bound to the encoder of the last column, as before.

In [5]:
# Step 4: Separate the regression target from the input features
y = df["price"]                # target column
X = df.drop(columns="price")   # every remaining column is a feature

In [6]:
# Step 5: Train-test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = split_parts


In [7]:
# Step 6: MLflow Tracking
# Train a random-forest regressor inside an MLflow run and record its
# hyperparameters and hold-out metrics under the "HousePricePrediction"
# experiment.
mlflow.set_experiment("HousePricePrediction")

# Single source of truth for the hyperparameters: the model is built from the
# same dict that gets logged, so the logged values cannot drift from the ones
# actually used.
rf_params = {"n_estimators": 100, "random_state": 42}

with mlflow.start_run():
    # Log parameters first so they are recorded even if training fails.
    mlflow.log_params(rf_params)

    model = RandomForestRegressor(**rf_params)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2_score", r2)

In [8]:
import joblib
import mlflow

# Step 7: Save the model
# Serialize the trained estimator to disk (the "ml" folder is created in the
# first cell).
model_path = "ml/model.pkl"
joblib.dump(model, model_path)

# Step 8: Log with MLflow (optional)
# The training run was closed when its `with mlflow.start_run()` block exited.
# A bare mlflow.log_artifact() here would implicitly open a brand-new run that
# is never ended, so the artifact would not be attached to the run holding the
# metrics. Re-open the most recent run instead and let the `with` block close
# it again.
# NOTE(review): assumes the most recent run in this kernel is the training run
# above, i.e. the cells were executed in order.
last_run = mlflow.last_active_run()
with mlflow.start_run(run_id=last_run.info.run_id if last_run else None):
    mlflow.log_artifact(model_path)

