<a href="https://colab.research.google.com/github/Abhishek-7504/Scikit-learn-practice/blob/main/Scikit_Learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Introduction to Scikit-Learm

# This notebook is a collection of all the sklearn functions.
what_were_covering = [
"0. End-to-end sklearn workflow",
"1. Getting data ready",
"2. Choosing the right algorithm",
"3. Fitting the model",
"4. Evaluating the model",
"5. Improving the model",
"6. Save and load a trained model",
"7. Putting it all together"
]

In [5]:
# 1. Getting data ready
import pandas as pd
import numpy as np
heart_disease = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/refs/heads/master/data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


**The main takeaways:**


*   Split your data first (into train/test), always keep your training & test data separate

*   Fill/transform the training set and test sets separately (this goes for filling data with pandas as well)

  
*   Don't use data from the future (test set) to fill data from the past (training set)










In [6]:
# Create feature matrix
X = heart_disease.drop("target", axis=1)

# Create labels
y = heart_disease["target"]

In [7]:
# 2. choosing the right model
from sklearn.ensemble import RandomForestClassifier
# clf is Classifier
clf = RandomForestClassifier()

# we will keep default hyperparameters
clf.get_params()

# OOB : OUT-of-BAG is a way to test the model without needing a
# seperate validation set

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [8]:
# 3. fit the model
from sklearn.model_selection import train_test_split
# test size is the dataset used for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [9]:
clf.fit(X_train, y_train);

In [10]:
y_preds = clf.predict(X_test)
y_preds

array([0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0])

In [11]:
# 4. evaluate our model
clf.score(X_train, y_train)

1.0

In [12]:
clf.score(X_test, y_test)

0.819672131147541

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.96      0.69      0.80        32
           1       0.74      0.97      0.84        29

    accuracy                           0.82        61
   macro avg       0.85      0.83      0.82        61
weighted avg       0.85      0.82      0.82        61



In [14]:
print(confusion_matrix(y_test, y_preds))

[[22 10]
 [ 1 28]]


In [15]:
print(accuracy_score(y_test, y_preds))

0.819672131147541


In [16]:
# 5. improve a model_selection
np.random.seed(42)
for i in range(10, 100, 10):
  print(f"Trying model with {i} estimators...")

# set a default value for "n estimators"
# or "warnings.filterwarnings("ignore")"

  clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
  accuracy = clf.score(X_test, y_test) * 100
  print(f"Model accuracy on test set: {accuracy:.2f}%")
  print(" ")


Trying model with 10 estimators...
Model accuracy on test set: 83.61%
 
Trying model with 20 estimators...
Model accuracy on test set: 81.97%
 
Trying model with 30 estimators...
Model accuracy on test set: 81.97%
 
Trying model with 40 estimators...
Model accuracy on test set: 83.61%
 
Trying model with 50 estimators...
Model accuracy on test set: 78.69%
 
Trying model with 60 estimators...
Model accuracy on test set: 78.69%
 
Trying model with 70 estimators...
Model accuracy on test set: 81.97%
 
Trying model with 80 estimators...
Model accuracy on test set: 78.69%
 
Trying model with 90 estimators...
Model accuracy on test set: 81.97%
 


In [17]:
# 6. save the model and load
import pickle
pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [18]:
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
loaded_model.score(X_test, y_test)

0.819672131147541

In [19]:
# blackbox code to save, load and test the saved model
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Load dataset (replace with your own dataset)
data = load_iris()
X, y = data.data, data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
clf = RandomForestClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)

# Save the model
model_filename = "random_forest_model_1.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(clf, file)
print(f"Model saved to {model_filename}")

# Load the model
with open(model_filename, "rb") as file:
    loaded_model = pickle.load(file)
print("Model loaded successfully.")

# Test the loaded model
accuracy = loaded_model.score(X_test, y_test) * 100
print(f"Loaded model accuracy on test set: {accuracy:.2f}%")

Model saved to random_forest_model_1.pkl
Model loaded successfully.
Loaded model accuracy on test set: 100.00%


In [20]:
what_were_covering

['0. End-to-end sklearn workflow',
 '1. Getting data ready',
 '2. Choosing the right algorithm',
 '3. Fitting the model',
 '4. Evaluating the model',
 '5. Improving the model',
 '6. Save and load a trained model',
 '7. Putting it all together']

**1.Getting data ready**

Three main things in order to get our data ready
  1. Split the data into features i.e., 'X' & 'y'
  2. Disregard the null values or filling ( also called imputing )
  3. Converting non-numerical values into numericals ( feature encoding )

  Clean Data -> Transform Data -> Reduce Data

  For reducing data we can techniques dimensionality reduction .
  

In [21]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [22]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [23]:
len(heart_disease)

303

1.1 making sure all values non-null

In [24]:
car_sales = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/refs/heads/master/data/car-sales-extended.csv")
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [25]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [26]:
len(car_sales)

1000

In [27]:
car_sales.dtypes

Unnamed: 0,0
Make,object
Colour,object
Odometer (KM),int64
Doors,int64
Price,int64


In [28]:
# split the dataset
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)

In [29]:
# Build a ML Model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

ValueError: could not convert string to float: 'Honda'

In [30]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Specify the categorical features (excluding the target variable "Price")
categorical_features = ["Make", "Doors", "Colour"]  # Include all categorical features
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder='passthrough'
)

# Transform the feature set
transformed_X = transformer.fit_transform(X)

# Display the transformed features
print(transformed_X)

[[0.00000e+00 1.00000e+00 0.00000e+00 ... 0.00000e+00 1.00000e+00
  3.54310e+04]
 [1.00000e+00 0.00000e+00 0.00000e+00 ... 0.00000e+00 0.00000e+00
  1.92714e+05]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 0.00000e+00 1.00000e+00
  8.47140e+04]
 ...
 [0.00000e+00 0.00000e+00 1.00000e+00 ... 0.00000e+00 0.00000e+00
  6.66040e+04]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 0.00000e+00 1.00000e+00
  2.15883e+05]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 0.00000e+00 0.00000e+00
  2.48360e+05]]


In [31]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,154365.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,215883.0


In [32]:
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size =0.2)
model.fit(X_train, y_train)

In [33]:
model.score(X_test, y_test)

0.0

1.2 handling missing values
i)imputation
ii) remove the missing values


In [34]:
car_sales_missing = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/refs/heads/master/data/car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [35]:
car_sales_missing.isna().sum()


Unnamed: 0,0
Make,49
Colour,50
Odometer (KM),50
Doors,50
Price,50


In [36]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [37]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Specify the categorical features (excluding the target variable "Price")
categorical_features = ["Make", "Doors", "Colour"]  # Include all categorical features
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder='passthrough'
)

# Transform the feature set
transformed_X = transformer.fit_transform(X)

# Display the transformed features
print(transformed_X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4000 stored elements and shape (1000, 16)>
  Coords	Values
  (0, 1)	1.0
  (0, 6)	1.0
  (0, 13)	1.0
  (0, 15)	35431.0
  (1, 0)	1.0
  (1, 7)	1.0
  (1, 10)	1.0
  (1, 15)	192714.0
  (2, 1)	1.0
  (2, 6)	1.0
  (2, 13)	1.0
  (2, 15)	84714.0
  (3, 3)	1.0
  (3, 6)	1.0
  (3, 13)	1.0
  (3, 15)	154365.0
  (4, 2)	1.0
  (4, 5)	1.0
  (4, 10)	1.0
  (4, 15)	181577.0
  (5, 1)	1.0
  (5, 6)	1.0
  (5, 12)	1.0
  (5, 15)	42652.0
  (6, 3)	1.0
  :	:
  (993, 15)	162523.0
  (994, 0)	1.0
  (994, 5)	1.0
  (994, 10)	1.0
  (994, 15)	163322.0
  (995, 3)	1.0
  (995, 6)	1.0
  (995, 9)	1.0
  (995, 15)	35820.0
  (996, 4)	1.0
  (996, 5)	1.0
  (996, 13)	1.0
  (996, 15)	155144.0
  (997, 2)	1.0
  (997, 6)	1.0
  (997, 10)	1.0
  (997, 15)	66604.0
  (998, 1)	1.0
  (998, 6)	1.0
  (998, 13)	1.0
  (998, 15)	215883.0
  (999, 3)	1.0
  (999, 6)	1.0
  (999, 10)	1.0
  (999, 15)	248360.0


In [38]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [39]:
car_sales_missing.dropna(subset=["Price"], inplace = True)
car_sales_missing.isna().sum()

Unnamed: 0,0
Make,47
Colour,46
Odometer (KM),48
Doors,47
Price,0


In [40]:
from sklearn.model_selection import train_test_split
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)

In [41]:
X.isna().sum()

Unnamed: 0,0
Make,47
Colour,46
Odometer (KM),48
Doors,47


In [44]:
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features)
])

# Fill train and test values separately
filled_X_train = imputer.fit_transform(X_train) # fit_transform imputes the missing values from the training set and fills them simultaneously
filled_X_test = imputer.transform(X_test) # tranform takes the imputing missing values from the training set and fills the test set with them

# Check filled X_train
filled_X_train

array([['Honda', 'White', 4.0, 71934.0],
       ['Toyota', 'Red', 4.0, 162665.0],
       ['Honda', 'White', 4.0, 42844.0],
       ...,
       ['Toyota', 'White', 4.0, 196225.0],
       ['Honda', 'Blue', 4.0, 133117.0],
       ['Honda', 'missing', 4.0, 150582.0]], dtype=object)

In [45]:
# Get our transformed data array's back into DataFrame's
car_sales_filled_train = pd.DataFrame(filled_X_train,
                                      columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test,
                                     columns=["Make", "Colour", "Doors", "Odometer (KM)"])

# Check missing data in training set
car_sales_filled_train.isna().sum()

Unnamed: 0,0
Make,0
Colour,0
Doors,0
Odometer (KM),0


In [46]:
# Import OneHotEncoder class from sklearn
from sklearn.preprocessing import OneHotEncoder

# Now let's one hot encode the features with the same code as before
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                                 remainder="passthrough")

# Fill train and test values separately
transformed_X_train = transformer.fit_transform(car_sales_filled_train) # fit and transform the training data
transformed_X_test = transformer.transform(car_sales_filled_test) # transform the test data

# Check transformed and filled X_train
transformed_X_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 7.19340e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.62665e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.28440e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.96225e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.33117e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]])

In [47]:
# fitting a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)

0.21229043336119102