# Introduction to Scikit- Learn(sklearn)

This notebook demonstrates some of the most useful functions
of the beautiful Scikit-Learn library

What we're going to cover:

0. An end-to-end Scikit-Learn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problems
3. Fit the model/algorithm and use it to make predictions on our date
4. Evaluating a model
5. Improve a model
6. Save and load a trained model
7. Putting it all together!


## 0. An end-to-end Scikit-Learn workflow

In [3]:
# 1.Get the data ready
import pandas as pd
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
import numpy as np

In [5]:
# Create X (features matrix)
x = heart_disease.drop("target", axis=1)

# Create y(labels)
y = heart_disease["target"]


In [6]:
import warnings
warnings.filterwarnings("ignore")

In [7]:
#2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# We'll kep the default hyperparameters
clf.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [8]:
# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=9)

In [9]:
import sklearn
sklearn.show_versions()


System:
    python: 3.12.12 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 20:16:04) [GCC 11.2.0]
executable: /home/aishwarya/miniconda3/envs/myenv/bin/python
   machine: Linux-6.14.0-37-generic-x86_64-with-glibc2.39

Python dependencies:
      sklearn: 1.8.0
          pip: 25.3
   setuptools: 80.9.0
        numpy: 2.4.1
        scipy: 1.16.3
       Cython: None
       pandas: 2.3.3
   matplotlib: 3.10.8
       joblib: 1.5.3
threadpoolctl: 3.5.0

Built with OpenMP: True

threadpoolctl info:
       user_api: blas
   internal_api: mkl
    num_threads: 6
         prefix: libmkl_rt
       filepath: /home/aishwarya/miniconda3/envs/myenv/lib/libmkl_rt.so.2
        version: 2025.0-Product
threading_layer: intel

       user_api: openmp
   internal_api: openmp
    num_threads: 12
         prefix: libiomp
       filepath: /home/aishwarya/miniconda3/envs/myenv/lib/libiomp5.so
        version: None

       user_api: openmp
   internal_api: openmp
    num_threads: 12
         prefix: libgomp


In [10]:
clf.fit(x_train, y_train);


In [11]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
252,62,0,0,138,294,1,1,106,0,1.9,1,3,2
57,45,1,0,115,260,0,0,185,0,0.0,2,0,2
282,59,1,2,126,218,1,1,134,0,2.2,1,1,1
137,62,1,1,128,208,1,0,140,0,0.0,2,0,2
176,60,1,0,117,230,1,1,160,1,1.4,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2
182,61,0,0,130,330,0,0,169,0,0.0,2,0,2
180,55,1,0,132,353,0,1,132,1,1.2,1,1,3
261,52,1,0,112,230,0,1,160,0,0.0,2,1,2


In [12]:
x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
241,59,0,0,174,249,0,1,143,1,0.0,1,0,2
74,43,0,2,122,213,0,1,165,0,0.2,1,0,2
239,35,1,0,126,282,0,0,156,1,0.0,2,0,3
88,54,0,2,110,214,0,1,158,0,1.6,1,0,2
153,66,0,2,146,278,0,0,152,0,0.0,1,1,2
17,66,0,3,150,226,0,1,114,0,2.6,0,0,2
260,66,0,0,178,228,1,1,165,1,1.0,1,2,3
233,64,1,0,120,246,0,0,96,1,2.2,0,1,2
34,51,1,3,125,213,0,0,125,1,1.4,2,1,2


In [13]:
# make a prediction
y_label = clf.predict(np.array([0, 2, 3, 4])

SyntaxError: incomplete input (3386092011.py, line 2)

In [None]:
y_preds = clf.predict(x_test)
y_preds

In [None]:
y_test

In [None]:
# Evaluate the model on the training data and test data
clf.score(x_train, y_train)

In [None]:
clf.score(x_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))



In [None]:
confusion_matrix(y_test, y_preds)

In [None]:
accuracy_score(y_test, y_preds)

In [None]:
#5. Improve a model 
# Try different amount of n_estimators
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model accuracy on test set: {clf.score(x_test, y_test)  * 100:.2f}%")
    print("")

In [None]:
# Save a model and load it
import pickle

pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [None]:
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
loaded_model.score(x_test, y_test)

In [None]:
#Lets listify the contents
what_were_covering = [
    "1. An end-to-end Scikit-Learn workflow",
    "2. Getting the data ready",
    "3. Choose the right estimator/algorithm for our problems",
    "4. Fit the model/algorithm and use it to make predictions on our date",
    "5. Evaluating a model",
    "6. Improve a model",
    "7. Save and load a trained model",
    "8. Putting it all together!"]


In [None]:
what_were_covering

In [None]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Getting our data ready to be used with machine learning

Three main things we have to do:                                                                                             
     1. Split the data into features and labels(usually 'x' & 'y')                                                           
     2. Filling( also called imputing) or disregarding missing values                                                        
     3. Converting non-numerical values to numerical values (also called feature encoding).
     

In [None]:
heart_disease.head()

In [None]:
x = heart_disease.drop("target", axis=1)
x.head()

In [None]:
y = heart_disease["target"]
y.head()

In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)


In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
x.shape[0] * 0.8

In [None]:
len(heart_disease)

# 1.1Make sure its all numerical 

In [None]:
car_sales = pd.read_csv("car-sales.csv")
car_sales.head

In [None]:
len(car_sales)

In [None]:
car_sales.dtypes

In [None]:
# Split into x/y
x = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

#Split into training and test
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2)
    
                                                        

In [None]:
# Build machine learning model 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
model.score(X_test, y_test)


In [None]:
x.head()

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                  remainder= "passthrough")
transformed_x = transformer.fit_transform(x)
transformed_x

In [None]:
x.head()

In [None]:
pd.DataFrame(transformed_x)

In [None]:
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

In [None]:
# Let's refit the model 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(
    transformed_x, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)




In [None]:
x.head()

In [None]:
model.score(x_test, y_test)

In [None]:
model.score(x_test, y_test)

## 1.2What if there were misssing values ?

1. Fill them with some value(also kown as imputation).
2. Remove the samples with missing data altogether.

 

In [14]:
# Import car sales missing data
car_sales_missing = pd.read_csv("car-sales.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [17]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [None]:
# Create x and y 
x = car_sales_missing.drop("Price", axis=1)
y = car_sales

In [18]:
# Lets try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                  remainder= "passthrough")
transformed_x = transformer.fit_transform(x)
transformed_x

ValueError: A given column is not a column of the dataframe

In [19]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [20]:
car_sales_missing["Doors"].value_counts()

Doors
4    8
3    1
5    1
Name: count, dtype: int64

# Option 1: Fill missing data with Pandas

In [23]:
 #Fill the "Make" column
car_sales_missing["Make"].fillna("missing", inplace=True)

# Fill the "Colour" column 
car_sales_missing["Colour"].fillna("missing", inplace=True)

# Fill the "Odometer (KM)" column with mean
car_sales_missing["Odometer (KM)"].fillna(
    car_sales_missing["Odometer (KM)"].mean(),
    inplace=True
)
#Fill the "Doors" column
car_sales_missing["Doors"].fillna(4, inplace=True)

In [24]:
# Check our dataframe again
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [25]:
len(car_sales_missing)

10

In [26]:
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]


In [28]:
# Lets try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                  remainder= "passthrough")
transformed_x = transformer.fit_transform(car_sales_missing)
transformed_x

array([[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0,
        150043, '$4,000.00'],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0,
        87899, '$5,000.00'],
       [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
        32549, '$7,000.00'],
       [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
        11179, '$22,000.00'],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0,
        213095, '$3,500.00'],
       [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0,
        99213, '$4,500.00'],
       [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
        45698, '$7,500.00'],
       [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
        54738, '$7,000.00'],
       [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0,
        60000, '$6,250.00'],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0,
        31600, '$9,700.00']], dtype=object)

## Option 2: Fill missing values with Scikit-Learn

In [29]:
car_missing_missing = pd.read_csv("car-sales.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [30]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64