## Importing Libraries

In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Location of the input workbook. The default is the path this notebook was
# originally written against; set the ALL_CITY_XLSX environment variable to
# point at the file on another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'ALL_CITY_XLSX',
    'D:\\Programming\\GUVI PROJECTS CODE\\PROJECT - 3\\Data Preprocessing & Cleaning\\all_city.xlsx',
)

# reading the data from the file
df = pd.read_excel(DATA_PATH)

## Model Selection

In [2]:
# Separate the target column from the predictors.
y = df['price']                     # target
X = df.drop(columns='price')        # features: every column except the target

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# dictionary of models to be used for training (display name -> unfitted estimator)
# The tree-based estimators are randomised internally, so they are seeded with
# the same value as the train/test split; without a seed their scores change
# on every kernel restart and the comparison is not reproducible.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

## Model Training

In [None]:
# Cross-validate every candidate model on the training split only.
# Each entry is the mean R^2 over 5 folds, keyed by the model's display name.
results = {
    name: cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring='r2', cv=5
    ).mean()
    for name, model in models.items()
}

# Report the averaged score for each model, one line per model.
print("Cross-Validation Results:")
for model_name, cv_score in results.items():
    print(f"{model_name}: {cv_score:.4f} (R^2)")

## Model Comparison

In [None]:
# Comparing the models based on evaluation metrics to select the best performing model.
# The display labels live in `model_names` rather than `models`, so the
# `models` dict of estimators defined earlier is not overwritten by a list
# (which would make the cross-validation cell fail on `models.items()` if it
# were re-run after this one).
model_names = ['Linear Regression', 'Decision Tree Regressor', 'Random Forest Regressor', 'Gradient Boosting Regressor']

# Fit a fresh instance of each model on the full training split.
# The tree-based models are seeded (same value as the train/test split) so the
# table below, and the `rf` model reused later, are reproducible.
lr = LinearRegression().fit(X_train, y_train)
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Predict once per model and reuse the result for all three metrics.
# The order here must match `model_names`.
test_predictions = [fitted.predict(X_test) for fitted in (lr, dt, rf, gb)]

mae = [mean_absolute_error(y_test, y_hat) for y_hat in test_predictions]
mse = [mean_squared_error(y_test, y_hat) for y_hat in test_predictions]
r2 = [r2_score(y_test, y_hat) for y_hat in test_predictions]

# One row per model: held-out MAE, MSE and R^2.
comparison_df = pd.DataFrame({'Model': model_names, 'MAE': mae, 'MSE': mse, 'R2 Score': r2})
comparison_df


## Optimization

### Hyperparameter Tuning

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Define the parameter distribution.
# Candidate values for the split/leaf sizes are listed explicitly:
# np.arange takes (start, stop, step), so np.arange(2, 5, 10) is just [2] and
# np.arange(1, 2, 4) is just [1], which would leave those hyperparameters
# fixed instead of searched.
param_dist = {
    'n_estimators': np.arange(50, 301, 50),  # Number of trees: 50, 100, ..., 300
    'max_depth': [None] + list(np.arange(10, 31, 5)),  # Maximum depth of trees: unlimited or 10, 15, ..., 30
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    # 'auto' was removed from scikit-learn (deprecated 1.1, removed 1.3); for a
    # random-forest regressor it meant "all features", which is spelled 1.0.
    'max_features': [1.0, 'sqrt'],  # Number of features to consider for best split
    'bootstrap': [True, False]  # Whether to use bootstrap samples
}

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    # Seed the forest as well as the sampler; otherwise the CV scores (and the
    # selected parameters) differ from run to run despite random_state below.
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=20,  # Number of parameter settings that are sampled
    cv=3,  # 3-fold cross-validation
    verbose=2,  # Control the verbosity
    random_state=42,  # For reproducibility
    n_jobs=-1  # Use all available cores
)

# Fit RandomizedSearchCV.
# Errors are deliberately not caught here: a failed search should stop the
# notebook with a full traceback rather than print one line and leave
# `best_rf_model` undefined for later cells.
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

# Evaluate the best model (refit on the whole training split by the search)
best_rf_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_rf_model.predict(X_test)

# Evaluate performance.
# NOTE: `mse` and `r2` are scalars here and rebind the per-model lists of the
# same name created in the model-comparison cell.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE) of the best model: {mse}")
print(f"R-squared (R²) of the best model: {r2}")

### Feature Engineering

In [None]:
# As we can see Random Forest performs better than others so we choose that model

# Reduced feature matrix: the columns judged low-importance (domain knowledge
# plus the importance scores printed below) are removed from X.
low_importance_columns = ['Fuel type', 'Ownership details', 'ownerNo', 'Insurance Validity', 'Body type']
X_optimized = X.drop(low_importance_columns, axis=1)

# Importance of every feature according to the fitted random forest `rf`,
# labelled with the column names of X and ordered from most to least important.
importances = rf.feature_importances_
feature_importance = (
    pd.Series(data=importances, index=X.columns)
    .sort_values(ascending=False)
)
print(feature_importance)


In [6]:
# TODO: Regularization - this step has not been implemented yet.

In [7]:
# # Saving the model into a pickle file
# import pickle
# with open('D:\\Programming\\GUVI PROJECTS CODE\\PROJECT - 3\\Model\\model.pkl', 'wb') as f:
#     pickle.dump(rf, f)