### Step-1. Importing Libraries:

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

### Step-2. Load and Preprocess the DataSet

In [3]:
# Load the raw dataset from the CSV that sits next to the notebook.
df = pd.read_csv('dataSet.csv')

# Report the number of missing values per column *before* dropping anything.
# A bare expression in the middle of a cell is never displayed, so the counts
# are printed explicitly.
print(df.isnull().sum())

# Drop every row that contains at least one null. Reassigning (instead of
# inplace=True) keeps the cell a plain "read -> clean" pipeline that yields the
# same `df` each time it is re-run.
df = df.dropna(axis=0)

### Step-3. Encoding Categorical Values

In [14]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Columns still stored as object (text) dtype are the ones that need encoding.
cat_cols = df.select_dtypes(include='object').columns

# Fit one LabelEncoder per text column and replace the column in `df` with its
# integer codes. The fitted encoders are collected so the same mapping can be
# reused outside this notebook.
fitted_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    fitted_encoders[col] = le

# Re-running this cell on an already-encoded `df` finds no object columns.
# Previously that reset `label_encoders` to {} and overwrote the saved file
# with an empty dict; now the encoders from the earlier run are kept.
if fitted_encoders or "label_encoders" not in globals():
    label_encoders = fitted_encoders
else:
    print("No object-dtype columns left to encode; keeping the previously fitted label_encoders.")

# Save the label encoders for Flask to use
joblib.dump(label_encoders, "label_encoders.joblib")

['label_encoders.joblib']

### Step-4. Select Features and Target

In [15]:
# Target: the price column. Features: every remaining column of `df`.
y = df.loc[:, "Expected_Price_Lakhs"]
X = df.drop(columns=["Expected_Price_Lakhs"])

### Step-5. Split into Train and Test

In [16]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [17]:
# Baseline random forest: 100 trees, seeded so repeated runs give the same fit.
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)
model.fit(X=X_train, y=y_train)

In [18]:
# Predict the target for the held-out test rows with the baseline model.
y_pred = model.predict(X=X_test)

### Step-6. Evaluate Model

In [9]:
# Score the baseline predictions against the held-out targets.
mse, r2 = (
    mean_squared_error(y_true=y_test, y_pred=y_pred),
    r2_score(y_true=y_test, y_pred=y_pred),
)

print(f"MSE: {mse:.2f}, R2 Score: {r2:.3f}")

MSE: 4225.61, R2 Score: 0.849


In [10]:
# GridSearchCV, RandomForestRegressor, mean_squared_error and r2_score are
# already imported in the Step-1 cell, so they are not re-imported here.

# Deliberately small grid (1 x 2 x 2 = 4 candidates) to keep the search fast.
param_grid = {
    'n_estimators': [100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

# 3-fold cross-validated search, ranked by R2, using all available CPU cores.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=0),
    param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1
)

# Tune on the training split only, so the test split stays unseen until the
# final scoring below.
grid_search.fit(X_train, y_train)

# Use the best model found
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("Tuned R2 Score:", r2_score(y_test, y_pred_rf))
# This is the plain mean squared error (no square root is taken), so it is
# directly comparable with the MSE reported for the baseline model in Step-6.
print("Tuned MSE:", mean_squared_error(y_test, y_pred_rf))

Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Tuned R2 Score: 0.8561115521486888
Tuned MSE: 4034.801221969335


### Step-7 Cross Validation

In [11]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the baseline model on the full X / y, scored by R2.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=5,
    scoring='r2',
)
print("Avg R2 score:", scores.mean())

Avg R2 score: 0.8645880449906984


### Step-8 Saving model

In [12]:
import joblib

# NOTE(review): the model dump below is commented out, so this cell does not
# write any model file; the only artifact this notebook saves is
# "label_encoders.joblib". Confirm whether the trained model should also be
# saved, and if so which one (`model` or the tuned `best_rf`).
# file = "mdl.joblib"
# joblib.dump(model, file)

In [13]:
# Write the label-encoder dict to "label_encoders.joblib" (the same file the
# encoding step writes to).
joblib.dump(value=label_encoders, filename="label_encoders.joblib")


['label_encoders.joblib']