In [None]:
# Q1. What is Gradient Boosting Regression?
# Answer :-
# Gradient Boosting Regression is a machine learning technique used for regression problems, which involve predicting a continuous outcome variable. It is an ensemble learning method that combines the predictions of multiple weak learners, typically decision trees, to create a strong predictive model.
# Here's a breakdown of how Gradient Boosting Regression works:
# Weak Learners (Base Models): Gradient Boosting builds an additive model by sequentially adding weak learners to the ensemble. A weak learner is a model that performs slightly better than random chance. Decision trees are often used as weak learners in gradient boosting.
# Sequential Training: The algorithm starts from a constant initial prediction (for squared-error loss, the mean of the target variable). It then fits each additional weak learner to the residuals (the differences between the true values and the current ensemble's predictions). Each new model focuses on correcting the errors made by the combined ensemble of the existing models.
# Gradient Descent Optimization: The term "gradient" in Gradient Boosting refers to the gradient of the loss function with respect to the model's predictions. In each iteration, the algorithm calculates the gradient of the loss function for the current ensemble of models and adjusts the new model to move in the direction that minimizes the loss.
# Shrinkage (Learning Rate): To prevent overfitting and control the contribution of each weak learner, a shrinkage parameter (also known as the learning rate) is introduced. It scales the contribution of each weak learner to the ensemble. A lower learning rate requires more weak learners to fit the data but often leads to a more robust model.
# Stopping Criteria: The process of adding weak learners is repeated until a specified number of models are added or until a certain level of performance is achieved. Stopping criteria help prevent overfitting.
# Gradient Boosting Regression is known for its high predictive accuracy and is widely used in various applications, including finance, healthcare, and natural language processing. Popular implementations include XGBoost, LightGBM, and scikit-learn's GradientBoostingRegressor.


In [None]:
# Q2. Implement a simple gradient boosting algorithm from scratch using Python and NumPy. Use a
# simple regression problem as an example and train the model on a small dataset. Evaluate the model's
# performance using metrics such as mean squared error and R-squared.
# Answer :-
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

class GradientBoostingRegressor:
    """Minimal gradient boosting regressor for squared-error loss.

    The model starts from a constant prediction (the mean of the training
    targets) and then repeatedly fits a ``DecisionTreeRegressor`` to the
    current residuals, adding each tree's output scaled by ``learning_rate``.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds (one tree is fitted per round).
    learning_rate : float, default=0.1
        Shrinkage factor applied to every tree's contribution.
    max_depth : int, default=3
        ``max_depth`` passed to each ``DecisionTreeRegressor``.

    Attributes
    ----------
    models : list
        Fitted trees, in the order they were added.
    initial_prediction : float or None
        Mean of the training targets; ``None`` until ``fit`` has been called.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []
        # Constant baseline learned in fit(); None marks an unfitted model.
        self.initial_prediction = None

    def fit(self, X, y):
        """Fit the ensemble on training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training inputs, passed unchanged to each tree.
        y : array-like of shape (n_samples,)
            Training targets; converted to a flat float array.

        Returns
        -------
        self
            The fitted regressor.
        """
        # Work in float: with integer targets the mean would be truncated and
        # the in-place update below would fail on the float -> int cast.
        y = np.asarray(y, dtype=float).ravel()

        # Start from a clean slate so that refitting does not stack new trees
        # on top of the ones from a previous fit.
        self.models = []

        # The baseline must be remembered: predict() has to add it back.
        self.initial_prediction = float(np.mean(y))
        prediction = np.full(y.shape, self.initial_prediction, dtype=float)

        for _ in range(self.n_estimators):
            # For squared error, the residuals are the negative gradient.
            residuals = y - prediction

            # Fit a weak learner (decision tree) to the residuals.
            model = DecisionTreeRegressor(max_depth=self.max_depth)
            model.fit(X, residuals)

            # Take a shrunken step towards the targets.
            prediction += self.learning_rate * model.predict(X)

            self.models.append(model)

        return self

    def predict(self, X):
        """Predict targets for ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Inputs to predict for.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Baseline plus the learning-rate-scaled sum of all tree outputs.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.initial_prediction is None:
            raise RuntimeError(
                "This GradientBoostingRegressor is not fitted yet; call fit() first."
            )

        # Begin with the constant baseline, then add every tree's contribution,
        # mirroring exactly how the training-time prediction was built.
        n_samples = np.shape(X)[0]
        prediction = np.full(n_samples, self.initial_prediction, dtype=float)
        for model in self.models:
            prediction += self.learning_rate * model.predict(X)
        return prediction

# Generate a small, reproducible dataset: y = 3x + standard normal noise
np.random.seed(42)
X = np.random.rand(100, 1)
y = 3 * X.squeeze() + np.random.randn(100)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the gradient boosting regressor
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
gb_regressor.fit(X_train, y_train)

# Predict the whole test set in a single vectorized call. predict() accepts a
# 2-D array, so looping over rows is unnecessary and would yield an (n, 1)
# array instead of a flat vector matching y_test.
y_pred = gb_regressor.predict(X_test)

# Evaluate the model on the held-out data
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


In [None]:
# Q3. Experiment with different hyperparameters such as learning rate, number of trees, and tree depth to
# optimise the performance of the model. Use grid search or random search to find the best
# hyperparameters.
# Answer :-
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Generate a small, reproducible dataset: y = 3x + standard normal noise
np.random.seed(42)
X = np.random.rand(100, 1)
y = 3 * X.squeeze() + np.random.randn(100)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the gradient boosting regressor (seeded so repeated runs agree)
gb_regressor = GradientBoostingRegressor(random_state=42)

# Define the hyperparameter values to sample from
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}

# Define the scoring functions.
# The search always MAXIMISES the refit metric, so an error metric such as MSE
# must be declared with greater_is_better=False (the scorer then reports the
# negated MSE). Without it, refit='MSE' would select the WORST configuration.
scoring = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score),
}

# Perform random search; the best model is chosen by (negated) MSE
random_search = RandomizedSearchCV(
    gb_regressor,
    param_distributions=param_grid,
    n_iter=10,
    scoring=scoring,
    refit='MSE',
    cv=3,
    random_state=42,
)

# Fit the random search to the training data only
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:")
print(random_search.best_params_)

# Evaluate the refitted best model on the held-out test set
y_pred = random_search.best_estimator_.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


In [None]:
# Q4. What is a weak learner in Gradient Boosting?
# Answer :-

# In the context of Gradient Boosting, a weak learner refers to a model that performs slightly better than random chance on a given task. In the case of regression problems, weak learners are typically shallow decision trees. For classification problems, weak learners are often weak classifiers, which are only slightly better than random guessing.

# The term "weak" does not imply that the model is inherently poor; rather, it means that the model's performance is just above chance level. Weak learners are used in the context of ensemble learning, where the goal is to combine the predictions of multiple weak learners to create a strong, high-performance model.

# The strength of Gradient Boosting lies in its ability to sequentially train these weak learners and focus each new model on the mistakes (residuals) made by the existing ensemble. By combining the predictions of these weak learners in a weighted manner, the overall model becomes a powerful predictor, capable of capturing complex relationships in the data.

# The choice of weak learners, such as decision trees with limited depth, helps prevent overfitting to the training data and contributes to the generalization ability of the overall ensemble model. Each weak learner contributes a small piece of the solution, and their combination forms a robust and accurate predictive model.

In [None]:
# Q5. What is the intuition behind the Gradient Boosting algorithm?
# Answer :-

# The Gradient Boosting algorithm is based on the principle of building a strong predictive model by combining the predictions of multiple weak learners in a sequential manner. The intuition behind Gradient Boosting can be broken down into several key concepts:

# Sequential Training: Gradient Boosting builds an additive model in a sequential manner. It starts with an initial prediction, often the mean of the target variable for regression problems. Subsequent weak learners are then trained to correct the errors (residuals) made by the existing ensemble of models.

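# Gradient Descent Optimization: The "gradient" in Gradient Boosting refers to the gradient of the loss function with respect to the model's predictions. In each iteration, the algorithm calculates the negative gradient of the loss function (the "pseudo-residuals") and fits a weak learner to it. For squared-error loss the negative gradient is exactly the ordinary residual (true value minus current prediction), which is why the procedure is often described as fitting the residuals. This means the new model is trained to move the predictions of the ensemble in the direction that minimizes the loss.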

# Combining Weak Learners: Each weak learner is added to the ensemble with a certain weight, and its contribution is scaled by a learning rate. The learning rate controls the step size in the optimization process and helps prevent overfitting. The final prediction is the sum of the predictions from all the weak learners, each multiplied by its associated weight.

# Preventing Overfitting: The use of shallow decision trees as weak learners helps prevent overfitting. Shallow trees have limited depth, making them less prone to capturing noise in the training data and more focused on the underlying patterns.

# Robustness and Generalization: The ensemble nature of Gradient Boosting allows it to be robust and generalize well to new, unseen data. By iteratively refining the predictions and learning from the mistakes of the previous models, Gradient Boosting can capture complex relationships in the data.

# Hyperparameter Tuning: Important hyperparameters include the number of weak learners (trees), the learning rate, and the depth of the trees. Tuning these hyperparameters is crucial for achieving optimal performance and avoiding overfitting or underfitting.

In [None]:
# Q6. How does Gradient Boosting algorithm build an ensemble of weak learners?
# Answer :-

# The Gradient Boosting algorithm builds an ensemble of weak learners in a sequential manner. The process can be summarized in the following steps:

# Initialize the Ensemble:

# The algorithm starts with an initial prediction, often the mean of the target variable for regression problems.
# The initial prediction serves as the starting point for building the ensemble.
# Calculate Residuals:

# The residuals are the differences between the actual target values and the current prediction of the ensemble.
# In the first iteration, the residuals are simply the differences between the actual target values and the initial prediction.
# Train a Weak Learner:

# A weak learner, typically a shallow decision tree, is trained on the dataset. The goal of this weak learner is to capture the patterns in the residuals, i.e., the mistakes made by the current ensemble.
# The weak learner is fitted to the dataset using the current residuals as the target variable.
# Update the Ensemble:

# The predictions of the weak learner are added to the ensemble with a certain weight.
# The weight is determined by a factor, usually the learning rate, which controls the contribution of each weak learner to the overall ensemble.
# Update Predictions:

# The predictions of the ensemble are updated by adding the weighted predictions of the latest weak learner.
# The ensemble is now better aligned with the actual target values, as it has learned from the mistakes of the previous models.
# Repeat the Process:

# Steps 2-5 are repeated for a specified number of iterations or until a certain level of performance is achieved.
# In each iteration, a new weak learner is trained to correct the errors made by the existing ensemble.
# Final Prediction:

# The final prediction is the sum of the predictions from all weak learners in the ensemble, each multiplied by its associated weight.
# The process of sequentially adding weak learners continues until the algorithm reaches the specified number of iterations. The result is an ensemble of weak learners, where each learner contributes a small piece of the solution, and their combination forms a robust and accurate predictive model. The key idea is that each new weak learner corrects the errors made by the existing ensemble, gradually improving the overall model's predictive performance.

In [None]:
# Q7. What are the steps involved in constructing the mathematical intuition of Gradient Boosting
# algorithm?
# Answer :-
# Constructing the mathematical intuition behind the Gradient Boosting algorithm involves understanding the optimization process that aims to minimize a loss function. Below are the steps involved in developing the mathematical intuition for Gradient Boosting:

# Initialize with a Constant Prediction:

# Let's denote the initial prediction as $F_0(x)$, which is often the mean of the target variable.
# The goal is to iteratively improve this prediction by adding weak learners.
# Compute Residuals:

# Calculate the residuals, denoted as $r_i$, which are the differences between the true target values ($y_i$) and the current prediction ($F_{t-1}(x_i)$): $r_i = y_i - F_{t-1}(x_i)$.
# In the first iteration ($t = 1$), $r_i = y_i - F_0(x_i)$.
# Train a Weak Learner to Fit Residuals:

# Fit a weak learner (often a shallow decision tree) to the residuals. Denote the weak learner's prediction as $h_t(x_i)$, where $h_t$ is the weak learner at iteration $t$.
# The weak learner is trained to minimize a loss function, typically the mean squared error: $\sum_{i=1}^{N} L\big(y_i,\, F_{t-1}(x_i) + h_t(x_i)\big)$.

# Update the Model:

# Add the weak learner's prediction to the ensemble with a weight (learning rate) denoted as $\eta$: $F_t(x_i) = F_{t-1}(x_i) + \eta\, h_t(x_i)$.
# Repeat Steps 2-4:

# Repeat the process for a specified number of iterations or until a stopping criterion is met.
# In each iteration, a new weak learner is trained to fit the residuals of the current ensemble.
# Final Model:

# The final predictive model is the initial prediction plus the sum of all weak learners: $F(x) = F_0(x) + \sum_{t=1}^{T} \eta\, h_t(x)$, where $T$ is the total number of iterations.
# Optimization Objective:

# The overall objective is to minimize the loss function over the training data. The loss function is often a measure of the difference between the true values and the predictions: $\hat{F} = \arg\min_{F} \sum_{i=1}^{N} L\big(y_i, F(x_i)\big)$.
# Learning Rate and Regularization:

# The learning rate (η) controls the step size during optimization. Smaller values prevent overfitting but require more iterations.
# Regularization techniques, such as limiting the depth of weak learners, help prevent overfitting.
# Prediction:
# To make predictions on new data, the final model $F(x)$ is used.