# SCIKIT LEARN

# STEPS FOR WORKFLOW
1. Getting the data ready
2. Choosing the algorithm
3. fitting the model and using it for predictions
4. evaluate the model
5. improve the model
6. save and load the model
7. putting it all together

In [1]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np

In [4]:
heart_disease = pd.read_csv("./data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [5]:
# 1. Getting the data ready
# features
x = heart_disease.drop("target", axis=1)

# target
y = heart_disease["target"]

x, y

(     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
 0     63    1   3       145   233    1        0      150      0      2.3   
 1     37    1   2       130   250    0        1      187      0      3.5   
 2     41    0   1       130   204    0        0      172      0      1.4   
 3     56    1   1       120   236    0        1      178      0      0.8   
 4     57    0   0       120   354    0        1      163      1      0.6   
 ..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
 298   57    0   0       140   241    0        1      123      1      0.2   
 299   45    1   3       110   264    0        1      132      0      1.2   
 300   68    1   0       144   193    1        1      141      0      3.4   
 301   57    1   0       130   131    0        1      115      1      1.2   
 302   57    0   1       130   236    0        0      174      0      0.0   
 
      slope  ca  thal  
 0        0   0     1  
 1        0   0     2  
 2

In [6]:
# 2. Choosing the algorithm
# !pip install scikit-learn
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

In [7]:
# 3. fitting the model into the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

y_train, len(x_test)

(162    1
 95     1
 42     1
 294    0
 27     1
       ..
 257    0
 17     1
 173    0
 106    1
 287    0
 Name: target, Length: 242, dtype: int64,
 61)

In [8]:
clf.fit(x_train, y_train);

In [9]:
# Predicting

y_preds = clf.predict(x_test);
y_preds

array([1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1])

In [10]:
# Evaluating model

clf.score(x_train, y_train)

1.0

In [11]:
clf.score(x_test, y_test)

0.6885245901639344

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.65      0.52      0.58        25
           1       0.71      0.81      0.75        36

    accuracy                           0.69        61
   macro avg       0.68      0.66      0.67        61
weighted avg       0.68      0.69      0.68        61



In [13]:
confusion_matrix(y_test, y_preds)

array([[13, 12],
       [ 7, 29]])

In [14]:
accuracy_score(y_test, y_preds)

0.6885245901639344

In [15]:
# 5. Improving the model
 # - chane the estimators size

In [16]:
# Trying with different estimates
np.random.seed(42)
for i in range(1,100, 10):
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(x_train, y_train)
    print(f" score at {i} = {clf.score(x_test, y_test)*100:.2f}% ")

 score at 1 = 68.85% 
 score at 11 = 75.41% 
 score at 21 = 73.77% 
 score at 31 = 72.13% 
 score at 41 = 77.05% 
 score at 51 = 72.13% 
 score at 61 = 70.49% 
 score at 71 = 70.49% 
 score at 81 = 75.41% 
 score at 91 = 73.77% 


In [17]:
np.random.seed(42)
clf = RandomForestClassifier(n_estimators=15)
clf.fit(x_train, y_train)
print(f" score at {15} = {clf.score(x_test, y_test)*100:.2f}% ")

 score at 15 = 77.05% 


In [18]:
# 6. Saving a model
import pickle

pickle.dump(clf, open("random_forest_classifer_heart_disease_1.pkl", "wb")) # wb = write binary

In [19]:
# loading it again

loaded_model = pickle.load(open("./random_forest_classifer_heart_disease_1.pkl", "rb"))

loaded_model.score(x_test, y_test)

0.7704918032786885

## 1 GETTING DATA READY

### Steps
1. Split data into features and labels ( e.g. x and y as in target )
2. clean, transform and Reduce
3. Convert non number values into number values

In [22]:
car_sales = pd.read_csv("./data/car-sales-extended.csv")
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [25]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [28]:
x = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

x.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [31]:
# convert categorical data ( whether string or number ) into numbers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

transformed_x = transformer.fit_transform(x)
# transformed_x
# to view beautifully
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [39]:
#  Fitting the model
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)
x_train, x_test, y_train, y_test, = train_test_split(transformed_x, y, test_size=0.2)

rfr_model = RandomForestRegressor()

rfr_model.fit(x_train, y_train)

rfr_model.score(x_test, y_test) # checking the score


0.3235867221569877

In [85]:
## Working with missing Data ( Manually )

car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")

# car_sales_missing.isna().sum() # check how many missing data

# Loading transformer and Encoder
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# Adjusting data

# print(f"Before : \n{car_sales_missing.isna().sum()}")

# filling na
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean())
# print(x["Doors"].value_counts())  # finding maximum doors occurence

car_sales_missing = car_sales_missing.dropna() # Drop rows with missing price

# print(f"\n\nAfter : \n{car_sales_missing.isna().sum()}")

# splitting data into x and y

x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# print(len(x)  , " = " , len(y))

transformed_x = transformer.fit_transform(x)

# to view beautifully ====
transformed_x_dense = transformed_x.toarray()

pd.DataFrame(transformed_x_dense).head() 

# Splitting the data into test and train
x_train, x_test , y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)


# fitting through model
mdl = RandomForestRegressor()

mdl.fit(x_train, y_train).score(x_test, y_test)

0.3306719327833385

In [None]:
# Working with Missing Data with scikit Learn