In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

In [9]:
# Iris dataset: four measurements per flower (sepal length, sepal width,
# petal length, petal width) for three species -- setosa, versicolor and virginica.
# Task: predict the species (target) from those measurements (data).
data = load_iris()

x = data.data    # measurements used as model inputs
y = data.target  # species label for each sample

In [11]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Training the Gradient Boosting Classifier

In [12]:
# Seed the estimator so that re-running the notebook reproduces the same fitted
# model; the train/test split is already seeded with 42, so the same value is used.
model = GradientBoostingClassifier(random_state=42)
model.fit(x_train, y_train)

After training, we use the model to make predictions and evaluate its performance.

In [14]:
# Predict species for the held-out samples and compare against the true labels.
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# In the confusion matrix, rows are the true classes and columns the predicted classes.
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

print('Classification Report')
print(classification_report(y_test, y_pred))
# A perfect score is plausible here: the Iris species are well separated by these
# features and Gradient Boosting is a strong model. Keep in mind that the test set
# is small (20% of the data, i.e. 30 samples), so 100% accuracy is not a guarantee
# of error-free predictions on new flowers.

Accuracy: 1.0
Confusion Matrix
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Classificaiton Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



# Loading and Exploring the California Housing Dataset

We will use the California Housing dataset for a regression task. Each row describes a California census block group with features such as median income, house age, average number of rooms per household, and population. The target variable is the median house value, expressed in units of $100,000.

In [15]:
# Fetch the California Housing data and unpack the features and the target.
# Note: this rebinds x and y, replacing the Iris arrays assigned earlier in the notebook.
housing = fetch_california_housing()
x, y = housing.data, housing.target

Splitting the Dataset

In [18]:
# 80/20 train/test split of the housing data with a fixed seed for repeatability.
# These names replace the Iris split created earlier in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Training the Gradient Boosting Regressor

In [20]:
# Seed the regressor so that re-running the notebook reproduces the same fitted
# model; the train/test split is already seeded with 42, so the same value is used.
model_reg = GradientBoostingRegressor(random_state=42)
model_reg.fit(x_train, y_train)

In [23]:
# Predict on the held-out rows, then score the predictions with mean squared error.
y_pred_reg = model_reg.predict(x_test)
mse = mean_squared_error(y_test, y_pred_reg)

# Show the raw predictions first, followed by the error metric.
print(y_pred_reg)
print(f"Mean Squared Error: {mse}")

[0.50518761 1.09334601 4.24570956 ... 4.68181295 0.85329537 1.96275219]
Mean Squared Error: 0.29407876957513357
