In [None]:
# Importing necessary libraries for data analysis and modeling
import os  # For reading the optional dataset-path override from the environment
from pathlib import Path  # For portable filesystem paths

import matplotlib.pyplot as plt  # For data visualization
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
import seaborn as sns  # For advanced data visualization
from sklearn import datasets, linear_model, metrics  # For machine learning models and metrics
from sklearn.linear_model import LinearRegression  # For building a linear regression model
from sklearn.model_selection import GridSearchCV, cross_val_predict  # For hyper-parameter search and out-of-fold predictions
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.preprocessing import OneHotEncoder, StandardScaler  # For data preprocessing

In [None]:
# Show every column when a DataFrame is displayed (None removes the column cap)
pd.set_option("display.max_columns", None)

In [None]:
# Location of the modelling dataset. The path can be overridden with the
# PRACTO_MODEL_DF_PATH environment variable so the notebook is not tied to one
# machine; the default keeps the original location.
DATA_PATH = Path(
    os.environ.get(
        "PRACTO_MODEL_DF_PATH",
        "C:/Users/sampa/OneDrive/Desktop/sannidhi/Masai/Projects/Practo/data/model_df.csv",
    )
)

# Read the CSV file and load it into the DataFrame 'df'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of 'df' (pandas shows five by default)
df.head()

In [None]:
# Show the (rows, columns) tuple of 'df'
df.shape

In [None]:
# List the column labels of 'df' as loaded from the CSV
df.columns

In [None]:
# Map each existing one-hot city column (key, 'City_<name>') to its shorter
# replacement label (value, '<name>').
new_column_names = {
    f'City_{city}': city
    for city in ('Bangalore', 'Delhi', 'Mumbai')
}

In [None]:
# Apply the 'new_column_names' mapping to the column labels and rebind 'df'
# to the renamed frame.
df = df.rename(columns=new_column_names)

In [None]:
# Count the missing values (NaN) in each column of 'df'
df.isnull().sum()

In [None]:
# Names of the columns that are standardized with the StandardScaler below
num = ['Year_of_experience', 'dp_score', 'npv']

In [None]:
# Create the StandardScaler used to standardize the 'num' columns; the same
# object is pickled at the end of the notebook.
sc = StandardScaler()

In [None]:
# Fit 'sc' on the numerical columns listed in 'num' and overwrite those columns
# in 'df' with their standardized values.
# NOTE(review): the scaler is fitted on the full DataFrame, before the
# train/test split further down, so test rows contribute to the fitted
# statistics — consider fitting on the training rows only.
# NOTE(review): this cell overwrites its own input; re-running it refits 'sc'
# on already-scaled values, which changes the scaler that is pickled later.
df[num] = sc.fit_transform(df[num])

In [None]:
# Display the standardized numerical columns of 'df' to inspect the scaling result
df[num]

In [None]:
# Separate the value being predicted from the predictors
y = df['consultation_fee']  # Target variable ('consultation_fee')
X = df.drop(columns="consultation_fee")  # Features (every remaining column)

In [None]:
# Hold out 30% of the rows as a test set; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15)

In [None]:
# Import necessary libraries for machine learning and regression modeling
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
from sklearn.model_selection import train_test_split  # For splitting data into train and test sets
from sklearn.linear_model import LinearRegression, Ridge, Lasso  # For linear regression models
from sklearn.tree import DecisionTreeRegressor  # For decision tree regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor  # For ensemble regression models
from sklearn.svm import SVR  # For support vector regression
from sklearn.metrics import mean_squared_error, r2_score  # For regression performance evaluation
from xgboost import XGBRFRegressor  # For XGBoost-based regression
# from tensorflow.keras.models import Sequential  # For deep learning models (commented out)
# from tensorflow.keras.layers import LSTM, Dense  # For deep learning models (commented out)

In [None]:
# Candidate regressors keyed by the label printed in the comparison loop below.
# Only the XGBoost and AdaBoost entries are seeded; the tree, forest and
# gradient-boosting entries are created without a random_state.
# NOTE(review): 'XGBoost model' uses `XGBRFRegressor`, xgboost's random-forest
# variant, not the gradient-boosted `XGBRegressor` — confirm this is intended.
mod = {
    'Linear Regression' : LinearRegression(),
    'Decision Tree Regressor' : DecisionTreeRegressor(),
    'Random Forest Regressor' : RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'Support Vector Regression' : SVR(),
    'Ridge Regression' : Ridge(alpha=0.5),
    'Lasso Regression' : Lasso(alpha=0.1),
    'XGBoost model' : XGBRFRegressor(n_estimators=100, random_state=42),
    'AdaBoost Regressor' : AdaBoostRegressor(n_estimators=100, random_state=42)
}

In [None]:
# Fit every candidate in 'mod' on the training split and report its test-set
# MSE, RMSE and R^2.
for name, model in mod.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)  # computed once, reused for the RMSE
    r2 = r2_score(y_test, y_pred)
    print(name)
    print(f'mse: {mse} rmse: {mse**0.5}: r2_score: {r2}')
    print('***************************************')

## CV on AdaBoost 

In [None]:
# Hyper-parameter grid explored for the AdaBoost ensemble
param_grid = {
    'n_estimators': [200, 300, 400, 500],
    'learning_rate': [0.01, 0.1, 1.0],
    'loss': ['linear', 'square', 'exponential']
}
base_estimator = DecisionTreeRegressor()

# Create the AdaBoost regressor.
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old keyword was
# later removed; the positional form works on both sides of the rename.
# random_state matches the other AdaBoost models in this notebook so the
# search is repeatable.
adaboost = AdaBoostRegressor(base_estimator, random_state=42)

# Perform grid search with 5-fold cross-validation; error_score='raise' makes
# a failing fit stop the search instead of recording NaN.
grid_search = GridSearchCV(
    estimator=adaboost,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    error_score='raise',
)
grid_search.fit(X_train, y_train)


In [None]:
# Take the winning configuration and the refitted estimator from the search
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the refitted estimator on the held-out test split
# (the value printed as "Best MSE" is this test-set MSE, not the CV score)
y_pred = best_model.predict(X_test)
mse, r2S = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Best parameters:", best_params)
print("Best MSE:", mse)
print("R2 Score", r2S)

In [None]:
# Base models reused by the ensemble cells below (averaging, stacking, weighted
# voting). They are created unfitted here; each ensemble cell fits them itself.
# NOTE(review): the name `linear_model` rebinds the `sklearn.linear_model`
# module imported at the top of the notebook.
linear_model = LinearRegression()
tree_model = DecisionTreeRegressor()
forest_model = RandomForestRegressor()
gradient_Boosting_model = GradientBoostingRegressor()
svr_model = SVR()
ridge_model = Ridge(alpha=0.5)
lasso_model = Lasso(alpha=0.1)
xgboost_model = XGBRFRegressor(n_estimators=100, random_state=42)
adaboost_model = AdaBoostRegressor(n_estimators=100, random_state=42)

## Ensemble techniques before hyperparameter tuning

In [None]:
# Model Averaging: unweighted mean of the test predictions of all nine base models
models = [
    linear_model, tree_model, forest_model, gradient_Boosting_model,
    svr_model, ridge_model, lasso_model, xgboost_model, adaboost_model,
]

# Float accumulator with one slot per test row
averaged_predictions = np.zeros(len(y_test), dtype='float64')
for model in models:
    model.fit(X_train, y_train)
    averaged_predictions += model.predict(X_test)
averaged_predictions = averaged_predictions / len(models)

averaged_mse = mean_squared_error(y_test, averaged_predictions)
averaged_r2 = r2_score(y_test, averaged_predictions)

print("Model Averaging MSE:", averaged_mse)
print("Model Averaging RMSE:", averaged_mse**0.5)
print("r2_score of Model Averaging", averaged_r2)

In [None]:
# Stacking: a linear meta-model learns how to combine the base models.
meta_model = LinearRegression()

# Training features for the meta-model are out-of-fold predictions on the
# training split, so the meta-model never sees a base model's prediction for a
# row that base model was trained on, and the test split stays unseen.
train_meta_features = np.column_stack(
    [cross_val_predict(model, X_train, y_train, cv=5) for model in models]
)
meta_model.fit(train_meta_features, y_train)

# Refit every base model on the full training split and collect its test
# predictions as the meta-model's inputs.
test_meta_features = np.column_stack(
    [model.fit(X_train, y_train).predict(X_test) for model in models]
)

# The stacked prediction is the meta-model's output, not a plain mean
stacked_predictions = meta_model.predict(test_meta_features)
stacked_mse = mean_squared_error(y_test, stacked_predictions)
stacked_r2 = r2_score(y_test, stacked_predictions)

print("Stacking MSE:", stacked_mse)
print("Stacking RMSE:", stacked_mse**0.5)
print("r2_score of Stacking:", stacked_r2)

In [None]:
# Weighted Voting: blend three base models with fixed weights
weights = [0.2, 0.35, 0.45]  # Adjust the weights as desired
model1 = [linear_model, gradient_Boosting_model, xgboost_model]

# Float accumulator with one slot per test row
weighted_predictions = np.zeros(len(y_test), dtype='float64')
for model, weight in zip(model1, weights):
    model.fit(X_train, y_train)
    weighted_predictions += weight * model.predict(X_test)

weighted_mse = mean_squared_error(y_test, weighted_predictions)
weighted_r2 = r2_score(y_test, weighted_predictions)

print("Weighted Voting MSE:", weighted_mse)
print("Weighted Voting RMSE:", weighted_mse**0.5)
print("r2_score of Weighted Voting:", weighted_r2)

In [None]:
# Model Averaging over eight base models (every base model except AdaBoost).
# NOTE(review): the original heading read "only 3", but the list below holds
# eight estimators — confirm which subset was intended.
models = [
    linear_model, tree_model, forest_model, gradient_Boosting_model,
    svr_model, ridge_model, lasso_model, xgboost_model,
]

# Float accumulator with one slot per test row
averaged_predictions = np.zeros(len(y_test), dtype='float64')
for model in models:
    model.fit(X_train, y_train)
    averaged_predictions += model.predict(X_test)
averaged_predictions = averaged_predictions / len(models)

averaged_mse = mean_squared_error(y_test, averaged_predictions)
averaged_r2 = r2_score(y_test, averaged_predictions)

print("Model Averaging MSE:", averaged_mse)
print("Model Averaging RMSE:", averaged_mse**0.5)
print("r2_score of Model Averaging", averaged_r2)

## GridSearchCV

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Hyperparameter Tuning for Each Model

## Lasso Model

In [None]:
# Grid of regularisation strengths for Lasso, searched with 10-fold CV and
# scored by negative mean squared error.
# NOTE(review): the grid includes alpha=0 — confirm Lasso handles this as intended.
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 0, 1, 5, 10, 20, 30, 35, 40, 45, 50, 54, 55, 56, 57, 58, 59, 60, 70, 75, 77, 80, 90, 95, 100]
}
lasso = Lasso()
lasso_regression = GridSearchCV(
    estimator=lasso,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=10,
)

In [None]:
# Run the Lasso grid search on the training split
lasso_regression.fit(X_train, y_train)

In [None]:
# Report the winning alpha, its mean cross-validated score (negative MSE, so
# closer to zero is better) and the estimator refitted with that alpha.
print("Best Hyperparameters:", lasso_regression.best_params_)
print("Best Negative Mean Squared Error:", lasso_regression.best_score_)
print("Best Estimator:", lasso_regression.best_estimator_)

In [None]:
# Same Lasso alpha grid as before, searched with 10-fold CV but scored by R^2.
# NOTE(review): the grid includes alpha=0 — confirm Lasso handles this as intended.
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 0, 1, 5, 10, 20, 30, 35, 40, 45, 50, 54, 55, 56, 57, 58, 59, 60, 70, 75, 77, 80, 90, 95, 100]
}
lasso = Lasso()
lasso_regression = GridSearchCV(
    estimator=lasso,
    param_grid=parameters,
    scoring='r2',
    cv=10,
)

In [None]:
# Run the R^2-scored grid search on the training split
lasso_regression.fit(X_train, y_train)

# Report the winning alpha, its mean cross-validated R^2 and the refitted estimator
print("Best Hyperparameters:", lasso_regression.best_params_)
print("Best R^2 Score:", lasso_regression.best_score_)
print("Best Estimator:", lasso_regression.best_estimator_)

# Train a Lasso model with a fixed alpha and evaluate it on the test split.
# NOTE(review): alpha=0.1 is hard-coded rather than taken from the search
# result printed above — confirm this is intentional.
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)
y_pred_l = lasso_model.predict(X_test)

# Test-set mean squared error and R^2 of the fixed-alpha model
mse, r2 = mean_squared_error(y_test, y_pred_l), r2_score(y_test, y_pred_l)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

In [None]:
# Create a Lasso regression model and specify hyperparameter tuning using GridSearchCV.
# The alpha grid is log-spaced from 1e-3 to 1e3. The previous
# np.linspace(-3, 3, 20) grid contained negative values, which are not valid
# regularisation strengths for Lasso; -3 and 3 are meant as exponents.
lasso = Lasso()
parameters = {'alpha': np.logspace(-3, 3, 20)}
lasso_regression = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv=10)

# Fit the Lasso regression model with hyperparameter tuning to the training data
lasso_regression.fit(X_train, y_train)

In [None]:
# Report the winning alpha, its mean cross-validated score (negative MSE) and
# the refitted estimator from the most recent Lasso grid search
print("Best Hyperparameters:", lasso_regression.best_params_)
print("Best Negative Mean Squared Error:", lasso_regression.best_score_)
print("Best Estimator:", lasso_regression.best_estimator_)

# Train a Lasso model with a fixed alpha and evaluate it on the test split.
# NOTE(review): alpha=0.1 is hard-coded rather than taken from the search
# result printed above — confirm this is intentional.
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)
y_pred_LaT = lasso_model.predict(X_test)

# Test-set mean squared error and R^2 of the fixed-alpha model
mse, r2 = mean_squared_error(y_test, y_pred_LaT), r2_score(y_test, y_pred_LaT)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

In [None]:
# Create empty lists to store training and testing scores
train_scores = []
test_scores = []

# 20 log-spaced alpha values from 1e-3 to 1e3. A linear range starting at -3
# would include negative values, which are not valid Lasso regularisation
# strengths.
alphas = np.logspace(-3, 3, 20)

# Display the alpha values
alphas

In [None]:

# Define a log-spaced range of alpha values to explore (1e-3 ... 1e3)
alphas = np.logspace(-3, 3, num=20)

# R2 scores for the training and test sets, one entry per alpha
train_scores = []
test_scores = []

# Fit one Lasso model per alpha and record its R2 on both splits
for alpha in alphas:
    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)

    # Calculate R2 scores for training and test sets
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_score = r2_score(y_train, train_pred)
    test_score = r2_score(y_test, test_pred)

    # Append scores to the lists
    train_scores.append(train_score)
    test_scores.append(test_score)

# Plot the R2 scores versus alpha
plt.figure(figsize=(10, 6))
plt.plot(alphas, train_scores, label='Training Set')
plt.plot(alphas, test_scores, label='Test Set')
plt.xlabel('Alpha')
plt.ylabel('R2 Score')
# The alphas span six orders of magnitude; a log axis spreads them evenly
# instead of crowding most points against the left edge.
plt.xscale('log')
plt.legend()
plt.title('R2 Score vs. Alpha')
plt.show()


## CV on Random Forest Model

In [None]:
# Grid search over forest size and tree depth for a Random Forest,
# using 5-fold cross-validation scored by R^2.
parameters = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
}
random_forest = RandomForestRegressor()
random_forest_regression = GridSearchCV(
    estimator=random_forest,
    param_grid=parameters,
    scoring='r2',
    cv=5,
)

In [None]:
# Run the Random Forest grid search on the training split
random_forest_regression.fit(X_train,y_train)

In [None]:
# Report the winning parameter combination, its mean cross-validated R^2 and
# the estimator refitted with those parameters
print(random_forest_regression.best_params_)
print(random_forest_regression.best_score_)
print(random_forest_regression.best_estimator_)

In [None]:
# Random Forest with fixed hyperparameters.
# NOTE(review): presumably these are the values reported by the grid search
# above — confirm against its printed output.
random_forest_model = RandomForestRegressor(n_estimators=200, max_depth=7)

In [None]:
# Train the fixed-parameter Random Forest on the training split
random_forest_model.fit(X_train,y_train)

In [None]:
# Predict the test split with the fixed-parameter Random Forest
y_pred_r = random_forest_model.predict(X_test)

In [None]:
# Test-set mean squared error of the fixed-parameter Random Forest
mean_squared_error(y_test,y_pred_r )

In [None]:
# Test-set R^2 of the fixed-parameter Random Forest
r2_score(y_test,y_pred_r)

In [None]:
# Baseline for comparison: Random Forest with default hyperparameters (no arguments passed)

In [None]:
# Random Forest created with library defaults, used as a baseline
random_forest_model2 = RandomForestRegressor()

In [None]:
# Train the default Random Forest on the training split
random_forest_model2.fit(X_train,y_train)

In [None]:
# Predict the test split with the default Random Forest.
# Despite the name, 'y_pred_r2' holds predictions, not an R^2 value.
y_pred_r2 = random_forest_model2.predict(X_test)

In [None]:
# Test-set mean squared error of the default Random Forest
mean_squared_error(y_test,y_pred_r2)

In [None]:
# Test-set R^2 of the default Random Forest
r2_score(y_test,y_pred_r2)

In [None]:
# Random Forest hyperparameters and the values tried for each in the loop below.
# This rebinds 'param_grid', which earlier held the AdaBoost search grid.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [ 5, 10],
    'min_samples_split': [2, 5, 10]
}


In [None]:
# Start with empty score lists for the Random Forest sweep below
# (this discards the scores collected for the Lasso alpha curve)
train_scores = []
test_scores = []


In [None]:
# One-at-a-time sweep: for each hyperparameter in 'param_grid', try each of its
# values while every other hyperparameter keeps its library default, and record
# the R2 scores on the training and test sets.

# Reset the score lists so re-running this cell does not append duplicates
train_scores = []
test_scores = []

for param_name, candidate_values in param_grid.items():
    for param_value in candidate_values:
        # Set the hyperparameter this value belongs to. Indexing the first key
        # of the grid would pass every value as 'n_estimators'.
        param_dict = {param_name: param_value}

        # Create the Random Forest Regressor with the specified hyperparameter
        model = RandomForestRegressor(**param_dict)
        model.fit(X_train, y_train)

        # Predict on the training and test sets
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        # Calculate the R2 scores for training and test sets
        train_score = r2_score(y_train, train_pred)
        test_score = r2_score(y_test, test_pred)

        # Append the scores to the respective lists
        train_scores.append(train_score)
        test_scores.append(test_score)


In [None]:

# Plot the recorded R2 scores; the x position is the index of each score in
# the order it was appended.
param_values = range(len(train_scores))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(param_values, train_scores, label='Training Set')
ax.plot(param_values, test_scores, label='Test Set')
ax.set_xlabel('Hyperparameter Combination')
ax.set_ylabel('R2 Score')
ax.legend()
ax.set_title('R2 Score vs. Hyperparameter Combination')

# One tick per recorded score, with vertical labels
ax.set_xticks(param_values)
plt.setp(ax.get_xticklabels(), rotation='vertical')
plt.show()

## CV on Ridge

In [None]:
# Grid of regularisation strengths for Ridge, searched with 10-fold CV and
# scored by negative mean squared error.
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 0, 1, 5, 10, 20, 30, 35, 40, 45, 50, 54, 55, 56, 57, 58, 59, 60, 70, 75, 77, 80, 90, 95, 100]
}
ridge = Ridge()
ridge_regression = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=10,
)

In [None]:
# Run the Ridge grid search on the training split
ridge_regression.fit(X_train,y_train)

In [None]:
# Report the winning alpha, its mean cross-validated score (negative MSE) and
# the estimator refitted with that alpha
print(ridge_regression.best_params_)
print(ridge_regression.best_score_)
print(ridge_regression.best_estimator_)

In [None]:
# Ridge model with a fixed alpha.
# NOTE(review): presumably 5 is the value reported by the grid search above —
# confirm against its printed output.
ridge_model =Ridge(alpha = 5)

In [None]:
# Train the fixed-alpha Ridge model on the training split
ridge_model.fit(X_train, y_train)

In [None]:
# Predict the test split with the fixed-alpha Ridge model
y_pred_ri = ridge_model.predict(X_test)

In [None]:
# Test-set mean squared error of the fixed-alpha Ridge model
mean_squared_error(y_test, y_pred_ri)

In [None]:
# Test-set R^2 of the fixed-alpha Ridge model
r2_score(y_test, y_pred_ri)

In [None]:
# Base models for the second round of ensembles: a default Random Forest,
# Ridge with alpha=5 and Lasso with alpha=0.1. They are created unfitted;
# each ensemble cell below fits them itself.
forest_model = RandomForestRegressor()
ridge_model = Ridge(alpha=5)
lasso_model = Lasso(alpha=0.1)

In [None]:
# Model Averaging: unweighted mean of the forest, Ridge and Lasso test predictions
models = [forest_model, ridge_model, lasso_model]

# Float accumulator with one slot per test row
averaged_predictions = np.zeros(len(y_test), dtype='float64')
for model in models:
    model.fit(X_train, y_train)
    averaged_predictions += model.predict(X_test)
averaged_predictions = averaged_predictions / len(models)

averaged_mse = mean_squared_error(y_test, averaged_predictions)
averaged_r2 = r2_score(y_test, averaged_predictions)

print("Model Averaging MSE:", averaged_mse)
print("Model Averaging RMSE:", averaged_mse**0.5)
print("r2_score of Model Averaging", averaged_r2)

In [None]:
# Stacking: a linear meta-model learns how to combine the forest, Ridge and
# Lasso predictions.
meta_model = LinearRegression()
models = [forest_model, ridge_model, lasso_model]

# Training features for the meta-model are out-of-fold predictions on the
# training split, so the meta-model never sees a base model's prediction for a
# row that base model was trained on, and the test split stays unseen.
train_meta_features = np.column_stack(
    [cross_val_predict(model, X_train, y_train, cv=5) for model in models]
)
meta_model.fit(train_meta_features, y_train)

# Refit every base model on the full training split and collect its test
# predictions as the meta-model's inputs.
test_meta_features = np.column_stack(
    [model.fit(X_train, y_train).predict(X_test) for model in models]
)

# The stacked prediction is the meta-model's output, not a plain mean
stacked_predictions = meta_model.predict(test_meta_features)
stacked_mse = mean_squared_error(y_test, stacked_predictions)
stacked_r2 = r2_score(y_test, stacked_predictions)

print("Stacking MSE:", stacked_mse)
print("Stacking RMSE:", stacked_mse**0.5)
print("r2_score of Stacking:", stacked_r2)

In [None]:
# Weighted Voting: blend forest, Lasso and Ridge with fixed weights.
# The weights pair with 'models' by position: forest 0.7, Lasso 0.2, Ridge 0.1.
weights = [0.7, 0.2, 0.1]  # Adjust the weights as desired
models = [forest_model, lasso_model, ridge_model]

# Float accumulator with one slot per test row
weighted_predictions = np.zeros(len(y_test), dtype='float64')
for model, weight in zip(models, weights):
    model.fit(X_train, y_train)
    weighted_predictions += weight * model.predict(X_test)

weighted_mse = mean_squared_error(y_test, weighted_predictions)
weighted_r2 = r2_score(y_test, weighted_predictions)

print("Weighted Voting MSE:", weighted_mse)
print("Weighted Voting RMSE:", weighted_mse**0.5)
print("r2_score of Weighted Voting:", weighted_r2)

In [None]:
# Final models that are pickled below. Each one is fitted on the training
# split here: re-creating the estimators without fitting would leave the
# pickle files holding untrained objects that cannot predict.
forest_model = RandomForestRegressor().fit(X_train, y_train)
ridge_model = Ridge(alpha=5).fit(X_train, y_train)
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)

In [None]:
import pickle
from sklearn.linear_model import LogisticRegression

In [None]:
# Serialize the object currently bound to 'forest_model' into
# 'trained_RandomForest.pickle' (binary write, relative to the working directory).
# NOTE(review): the file only holds a trained model if 'forest_model' was
# fitted after it was last re-created — verify before relying on it.
with open('trained_RandomForest.pickle', 'wb') as file:
    pickle.dump(forest_model, file)

In [None]:
# Serialize the object currently bound to 'ridge_model' into
# 'trained_Ridge_model.pickle' (binary write, relative to the working directory).
# NOTE(review): the file only holds a trained model if 'ridge_model' was
# fitted after it was last re-created — verify before relying on it.
with open('trained_Ridge_model.pickle', 'wb') as file:
    pickle.dump(ridge_model, file)

In [None]:
# Serialize the object currently bound to 'lasso_model' into
# 'trained_Lasso_model.pickle' (binary write, relative to the working directory).
# NOTE(review): the file only holds a trained model if 'lasso_model' was
# fitted after it was last re-created — verify before relying on it.
with open('trained_Lasso_model.pickle', 'wb') as file:
    pickle.dump(lasso_model, file)

In [None]:
# Serialize the StandardScaler 'sc' (not a model) into 'trained_Sc.pickle',
# presumably so the same scaling can be applied when the models are reused.
with open('trained_Sc.pickle', 'wb') as file:
    pickle.dump(sc, file)