# Fit the model/algorithm on the data and use it to make predictions

## Fit the model on the data

Different names for:

X = features, feature variables, data

y = labels, targets, target variables

In [5]:
import pandas as pd

# Read the heart-disease CSV (path is relative to the notebook's directory)
heart_disease = pd.read_csv(filepath_or_buffer="../data/heart-disease.csv")
# Preview the first five rows
heart_disease.head(n=5)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
from sklearn.model_selection import train_test_split

# The 'target' column holds the labels; every other column is a feature
y = heart_disease['target']
X = heart_disease.drop(columns='target')
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest (seeded for repeatability) and train it on the training split
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
# Evaluate on the held-out test split: the model predicts from the patterns it
# learned during training and the result is compared with the true labels (accuracy)
score = model.score(X_test, y_test)
print(f"Random Forest Classifier Model accuracy: {score:.2f}")

Random Forest Classifier Model accuracy: 0.84


## Make predictions using a machine learning model

There are two ways to predict:

1. predict()
2. predict_proba()

In [8]:
# Predict a class label for every row of the test set
predictions = model.predict(X_test)
print(f"Predictions on the test set:{predictions[:100]}")
# Put the true labels next to the predicted ones for a side-by-side look
comparison_df = pd.DataFrame(data={'Actual': y_test, 'Predicted': predictions})
print(comparison_df.head(n=5))

Predictions on the test set:[0 1 1 0 1 1 1 0 0 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0]
     Actual  Predicted
179       0          0
228       0          1
111       1          1
246       0          0
60        1          1


In [9]:
import numpy as np
from sklearn.metrics import accuracy_score

# Comparing labels with predictions element-wise gives True/False per sample;
# the mean of those booleans is the proportion of correct predictions (accuracy)
predictions_mean = np.mean(y_test == predictions)
print(f"Mean of predictions (accuracy as proportion): {predictions_mean}")
# Same quantity, computed with scikit-learn's dedicated helper
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy score using accuracy_score function: {accuracy}")

Mean of predictions (accuracy as proportion): 0.8360655737704918
Accuracy score using accuracy_score function: 0.8360655737704918


In [10]:
# Per-class probabilities (instead of hard labels) for the first five test rows
predictions_proba = model.predict_proba(X_test.iloc[:5])
print("Predicted probabilities for the first 5 instances:", predictions_proba, sep="\n")
# Hard class labels for the same five rows, for comparison with the probabilities
print("predictions:", model.predict(X_test.iloc[:5]))

Predicted probabilities for the first 5 instances:
[[0.95 0.05]
 [0.38 0.62]
 [0.45 0.55]
 [0.86 0.14]
 [0.21 0.79]]
predictions: [0 1 1 0 1]


`predict()` can also be used for regression models.

In [11]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the California housing dataset; the returned object exposes the
# feature matrix (.data), the target vector (.target) and .feature_names
housing = fetch_california_housing()
# Build the feature frame and attach the target column in a single expression,
# so re-running this cell always rebuilds housing_df from the raw arrays
# instead of mutating a frame created earlier.
housing_df = pd.DataFrame(housing.data, columns=housing.feature_names).assign(
    target=housing.target
)
# Display the first few rows of the DataFrame
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Separate the predictors from the value to be predicted
y_housing = housing_df['target']
X_housing = housing_df.drop(columns='target')
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train_housing, X_test_housing, y_train_housing, y_test_housing = train_test_split(
    X_housing,
    y_housing,
    test_size=0.2,
    random_state=42,
)
# Build the random forest regressor and train it on the training split
housing_model = RandomForestRegressor(random_state=42, n_estimators=100)
housing_model.fit(X_train_housing, y_train_housing)
# Predict the target variable for the held-out test set
housing_predictions = housing_model.predict(X_test_housing)

array([0.5095   , 0.74161  , 4.9232571, ..., 4.7582187, 0.71409  ,
       1.65083  ], shape=(4128,))

In [15]:
# Show the first ten predicted values for the housing test set
housing_predictions[:10]

array([0.5095   , 0.74161  , 4.9232571, 2.52961  , 2.27369  , 1.64692  ,
       2.37605  , 1.66932  , 2.7729706, 4.9134589])

In [16]:
# First ten true target values as a plain array, for comparison with the predictions
np.array(y_test_housing.iloc[:10])

array([0.477  , 0.458  , 5.00001, 2.186  , 2.78   , 1.587  , 1.982  ,
       1.575  , 3.4    , 4.466  ])

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the true test targets and the regressor's predictions
mse = mean_squared_error(y_true=y_test_housing, y_pred=housing_predictions)
print(f"Mean Squared Error of the Random Forest Regressor model: {mse}")

Mean Squared Error of the Random Forest Regressor model: 0.2553684927247781


In [18]:
# Line up the true targets against the model's predictions for the housing test set
housing_comparison_df = pd.DataFrame(
    data={'Actual': y_test_housing, 'Predicted': housing_predictions}
)
print(housing_comparison_df.head(n=5))

        Actual  Predicted
20046  0.47700   0.509500
3024   0.45800   0.741610
15663  5.00001   4.923257
20484  2.18600   2.529610
9814   2.78000   2.273690
