In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Location of the input table, resolved relative to the working directory.
# NOTE(review): the file name suggests categorical columns were already
# encoded upstream -- confirm before feeding new raw data through this cell.
file_path = 'Encoded.csv'

# Read the whole CSV into a DataFrame; one column must be 'price_in_lakhs',
# which is used as the regression target further down.
data = pd.read_csv(filepath_or_buffer=file_path)








# Regression target: the 'price_in_lakhs' column.
y = data['price_in_lakhs']

# Feature matrix: every remaining column of the loaded table.
X = data.drop(columns='price_in_lakhs')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Seed shared by every estimator that accepts `random_state`, so repeated
# runs train the same models.
_seed = 42

# Candidate regressors keyed by the display name used in the printed report.
# Insertion order is the order in which they are trained and evaluated.
models = {
    # Plain least-squares baseline.
    'Linear Regression': LinearRegression(),
    # Tree-based learners.
    'Decision Tree': DecisionTreeRegressor(random_state=_seed),
    'Random Forest': RandomForestRegressor(random_state=_seed),
    'Gradient Boosting': GradientBoostingRegressor(random_state=_seed),
    'XGBoost': XGBRegressor(random_state=_seed),
    # Penalised linear models; their `alpha` is tuned via `param_grids`.
    'Lasso': Lasso(),
    'Ridge': Ridge(),
}


# Regularisation strengths to try for the penalised linear models.
_alpha_candidates = (0.001, 0.01, 0.1, 1, 10)

# Hyperparameter grids keyed by model name. Only names listed here are tuned
# with a grid search; each model gets its own independent list of alphas.
param_grids = {
    estimator_name: {'alpha': list(_alpha_candidates)}
    for estimator_name in ('Lasso', 'Ridge')
}

# Running record of the best candidate seen so far (lowest hold-out MSE).
best_model, best_model_name, best_mse = None, None, float('inf')

# Train every candidate, score it on the held-out split and keep the winner.
# NOTE(review): the winner is picked using the same test split whose MSE is
# reported, so the final "best" score is optimistically biased -- consider
# selecting on a validation split or cross-validation score instead.
for name, model in models.items():
    if name not in param_grids:
        # No grid registered: fit the estimator exactly as configured.
        model.fit(X_train, y_train)
        best_model_ = model
    else:
        # Tune with 5-fold cross-validation on the training data only;
        # GridSearchCV maximises its score, hence the negated MSE.
        grid_search = GridSearchCV(
            model,
            param_grids[name],
            cv=5,
            scoring='neg_mean_squared_error',
        )
        grid_search.fit(X_train, y_train)
        best_model_ = grid_search.best_estimator_

    # Score this candidate on the hold-out rows and report it.
    predictions = best_model_.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"{name} - Mean Squared Error: {mse:.4f}")

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if mse < best_mse:
        best_mse, best_model, best_model_name = mse, best_model_, name


# Persist the winning estimator to disk in binary pickle format.
# NOTE: pickle files can execute arbitrary code when loaded, so
# 'best_model.pkl' should only ever be loaded from a trusted location.
with open('best_model.pkl', mode='wb') as file:
    pickle.dump(best_model, file)

# Final summary: which model was saved and its hold-out MSE.
print(
    f"Best model ({best_model_name}) saved with MSE: {best_mse:.4f}"
)

Linear Regression - Mean Squared Error: 149.0106
Decision Tree - Mean Squared Error: 154.3180
Random Forest - Mean Squared Error: 107.8171
Gradient Boosting - Mean Squared Error: 109.7345
XGBoost - Mean Squared Error: 90.1803
Lasso - Mean Squared Error: 149.0128
Ridge - Mean Squared Error: 149.0106
Best model (XGBoost) saved with MSE: 90.1803
