# Scikit Learn

In [1]:
#!pip install scikit-learn

* ## Classification
  1. data preparation
  2. model definition
  3. training the model
  4. testing the model
  5. evaluate the model
  6. saving the model and use the same model in real world


## Data Preparation

In [2]:
import pandas as pd

In [3]:
heart_df=pd.read_csv("C:/Users/ChiaLeilypour/Downloads/heart.csv")
heart_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
heart_df.isnull().count()

age         303
sex         303
cp          303
trestbps    303
chol        303
fbs         303
restecg     303
thalach     303
exang       303
oldpeak     303
slope       303
ca          303
thal        303
target      303
dtype: int64

In [5]:
heart_df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [6]:
X=heart_df.drop("target", axis=True)
y=heart_df['target']


In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
clf=RandomForestClassifier()

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
x_train, x_test, y_train, y_test = train_test_split (X, y, test_size=0.2)

In [11]:
clf.fit(x_train, y_train)

In [12]:
clf.score(x_test,y_test)

0.8032786885245902

In [13]:
clf._parameter_constraints

{'n_estimators': [<sklearn.utils._param_validation.Interval at 0x20f0cf13620>],
 'bootstrap': ['boolean'],
 'oob_score': ['boolean', <function callable(obj, /)>],
 'n_jobs': [numbers.Integral, None],
 'random_state': ['random_state'],
 'verbose': ['verbose'],
 'warm_start': ['boolean'],
 'max_samples': [None,
  <sklearn.utils._param_validation.Interval at 0x20f0cf13650>,
  <sklearn.utils._param_validation.Interval at 0x20f0cf13680>],
 'max_depth': [<sklearn.utils._param_validation.Interval at 0x20f0cbaaae0>,
  None],
 'min_samples_split': [<sklearn.utils._param_validation.Interval at 0x20f0cdc3980>,
  <sklearn.utils._param_validation.Interval at 0x20f0cee7860>],
 'min_samples_leaf': [<sklearn.utils._param_validation.Interval at 0x20f0cee7890>,
  <sklearn.utils._param_validation.Interval at 0x20f0cee78c0>],
 'min_weight_fraction_leaf': [<sklearn.utils._param_validation.Interval at 0x20f0cee78f0>],
 'max_features': [<sklearn.utils._param_validation.Interval at 0x20f0cee7920>,
  <sklearn.

In [14]:
y_preds=clf.predict(x_test)
y_preds

array([1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1], dtype=int64)

In [15]:
from sklearn.metrics import classification_report , confusion_matrix, accuracy_score

In [16]:
print(classification_report(y_true=y_test, y_pred = y_preds))

              precision    recall  f1-score   support

           0       0.82      0.69      0.75        26
           1       0.79      0.89      0.84        35

    accuracy                           0.80        61
   macro avg       0.81      0.79      0.79        61
weighted avg       0.80      0.80      0.80        61



In [17]:
print (confusion_matrix(y_test,y_preds))

[[18  8]
 [ 4 31]]


In [18]:
accuracy_score(y_test, y_preds)

0.8032786885245902

In [19]:
for i in range(10,100,10):
    print(f"the n estimator is {i}")
    clf1=RandomForestClassifier(n_estimators=i)
    clf1.fit(x_train, y_train)
    print(f"Model score is:{clf1.score(x_test,y_test) *100:.2f} %")
    print("----------------------------------")

the n estimator is 10
Model score is:81.97 %
----------------------------------
the n estimator is 20
Model score is:73.77 %
----------------------------------
the n estimator is 30
Model score is:77.05 %
----------------------------------
the n estimator is 40
Model score is:77.05 %
----------------------------------
the n estimator is 50
Model score is:80.33 %
----------------------------------
the n estimator is 60
Model score is:78.69 %
----------------------------------
the n estimator is 70
Model score is:83.61 %
----------------------------------
the n estimator is 80
Model score is:81.97 %
----------------------------------
the n estimator is 90
Model score is:77.05 %
----------------------------------


## Saving

In [20]:
import pickle

In [21]:
pickle.dump(clf,open("classifiaction_model_1.pkl","wb"))

In [22]:
loaded_model=pickle.load(open("classifiaction_model_1.pkl","rb"))

In [23]:
print(confusion_matrix(y_test,y_preds))

[[18  8]
 [ 4 31]]


In [24]:
print(classification_report(y_true=y_test, y_pred = y_preds))


              precision    recall  f1-score   support

           0       0.82      0.69      0.75        26
           1       0.79      0.89      0.84        35

    accuracy                           0.80        61
   macro avg       0.81      0.79      0.79        61
weighted avg       0.80      0.80      0.80        61



# changing non numeric data to numeric 
## (categorical to Numeric)
1. Using `map`  
2. Using `LabelEncoder`  
3. Using `pandas.get_dummies`  
4. Using `OneHotEncoder`

In [25]:
car_sales=pd.read_csv("car-sales.csv")

In [26]:
car_sales_temp1=car_sales
car_sales_temp1

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Black,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


## Map

In [27]:
color_map={"White":1,"Red":2, "Black":3, "Blue":4}
car_sales_temp1["Color_Map"]=car_sales_temp1["Color"].map(color_map)
car_sales_temp1.drop("Color", axis=True)
car_sales_temp1

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price,Color_Map
0,Toyota,White,150043,4,"$4,000.00",1
1,Honda,Red,87899,4,"$5,000.00",2
2,Toyota,Blue,32549,3,"$7,000.00",4
3,BMW,Black,11179,5,"$22,000.00",3
4,Nissan,White,213095,4,"$3,500.00",1
5,Toyota,Black,99213,4,"$4,500.00",3
6,Honda,Blue,45698,4,"$7,500.00",4
7,Honda,Blue,54738,4,"$7,000.00",4
8,Toyota,White,60000,4,"$6,250.00",1
9,Nissan,White,31600,4,"$9,700.00",1


## LabelEncoder

In [28]:
car_sales_temp2=car_sales
car_sales_temp2

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price,Color_Map
0,Toyota,White,150043,4,"$4,000.00",1
1,Honda,Red,87899,4,"$5,000.00",2
2,Toyota,Blue,32549,3,"$7,000.00",4
3,BMW,Black,11179,5,"$22,000.00",3
4,Nissan,White,213095,4,"$3,500.00",1
5,Toyota,Black,99213,4,"$4,500.00",3
6,Honda,Blue,45698,4,"$7,500.00",4
7,Honda,Blue,54738,4,"$7,000.00",4
8,Toyota,White,60000,4,"$6,250.00",1
9,Nissan,White,31600,4,"$9,700.00",1


In [29]:
from sklearn.preprocessing import LabelEncoder

In [30]:
label_en=LabelEncoder()
car_sales_temp2["Color_Encoded"]=label_en.fit_transform(car_sales_temp2["Color"])
car_sales_temp2

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price,Color_Map,Color_Encoded
0,Toyota,White,150043,4,"$4,000.00",1,3
1,Honda,Red,87899,4,"$5,000.00",2,2
2,Toyota,Blue,32549,3,"$7,000.00",4,1
3,BMW,Black,11179,5,"$22,000.00",3,0
4,Nissan,White,213095,4,"$3,500.00",1,3
5,Toyota,Black,99213,4,"$4,500.00",3,0
6,Honda,Blue,45698,4,"$7,500.00",4,1
7,Honda,Blue,54738,4,"$7,000.00",4,1
8,Toyota,White,60000,4,"$6,250.00",1,3
9,Nissan,White,31600,4,"$9,700.00",1,3


## pd.Get_Dummies

In [31]:
car_sales_temp3=car_sales
car_sales_temp3

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price,Color_Map,Color_Encoded
0,Toyota,White,150043,4,"$4,000.00",1,3
1,Honda,Red,87899,4,"$5,000.00",2,2
2,Toyota,Blue,32549,3,"$7,000.00",4,1
3,BMW,Black,11179,5,"$22,000.00",3,0
4,Nissan,White,213095,4,"$3,500.00",1,3
5,Toyota,Black,99213,4,"$4,500.00",3,0
6,Honda,Blue,45698,4,"$7,500.00",4,1
7,Honda,Blue,54738,4,"$7,000.00",4,1
8,Toyota,White,60000,4,"$6,250.00",1,3
9,Nissan,White,31600,4,"$9,700.00",1,3


In [32]:
car_sales_temp3_Encoded=pd.get_dummies(car_sales_temp3,columns=["Color", "Mark"],dtype=int)
car_sales_temp3_Encoded

Unnamed: 0,Odometer (KM),Doors,Price,Color_Map,Color_Encoded,Color_Black,Color_Blue,Color_Red,Color_White,Mark_BMW,Mark_Honda,Mark_Nissan,Mark_Toyota
0,150043,4,"$4,000.00",1,3,0,0,0,1,0,0,0,1
1,87899,4,"$5,000.00",2,2,0,0,1,0,0,1,0,0
2,32549,3,"$7,000.00",4,1,0,1,0,0,0,0,0,1
3,11179,5,"$22,000.00",3,0,1,0,0,0,1,0,0,0
4,213095,4,"$3,500.00",1,3,0,0,0,1,0,0,1,0
5,99213,4,"$4,500.00",3,0,1,0,0,0,0,0,0,1
6,45698,4,"$7,500.00",4,1,0,1,0,0,0,1,0,0
7,54738,4,"$7,000.00",4,1,0,1,0,0,0,1,0,0
8,60000,4,"$6,250.00",1,3,0,0,0,1,0,0,0,1
9,31600,4,"$9,700.00",1,3,0,0,0,1,0,0,1,0


## OneHotEncoder

In [33]:
from sklearn.preprocessing import OneHotEncoder

## Filling empty Data
1. Using `Pandas`
2. Using `Scikit learn`

In [34]:
car_sales=pd.read_csv("car-sales-missing-data.csv")
car_sales

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [35]:
car_sales.isna().sum()

Mark             49
Color            50
Odometer (KM)    51
Doors            50
Price            50
dtype: int64

In [36]:
car_sales["Mark"]=car_sales["Mark"].fillna("missing")
car_sales

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,missing,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [37]:
car_sales.isna().sum()

Mark              0
Color            50
Odometer (KM)    51
Doors            50
Price            50
dtype: int64

In [38]:
car_sales["Color"]=car_sales["Color"].fillna("missing")
car_sales

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,missing,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [39]:
car_sales.isna().sum()

Mark              0
Color             0
Odometer (KM)    51
Doors            50
Price            50
dtype: int64

In [40]:
print(f"The mean is {car_sales["Odometer (KM)"].mean()}")
car_sales["Odometer (KM)"]=car_sales["Odometer (KM)"].fillna(car_sales["Odometer (KM)"].mean())
car_sales.isna().sum()

The mean is 131302.27818756585


Mark              0
Color             0
Odometer (KM)     0
Doors            50
Price            50
dtype: int64

In [41]:
car_sales["Doors"].value_counts()

Doors
4.0    811
5.0     75
3.0     64
Name: count, dtype: int64

In [42]:
car_sales["Doors"]=car_sales["Doors"].fillna(4)

In [43]:
car_sales.isna().sum()

Mark              0
Color             0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [44]:
car_sales=car_sales.dropna()
car_sales

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price
0,Honda,White,35431.000000,4.0,15323.0
1,BMW,Blue,192714.000000,5.0,19943.0
2,Honda,White,131302.278188,4.0,28343.0
3,Toyota,White,154365.000000,4.0,13434.0
4,Nissan,Blue,181577.000000,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.000000,4.0,32042.0
996,missing,White,155144.000000,3.0,5716.0
997,Nissan,Blue,66604.000000,4.0,31570.0
998,Honda,White,215883.000000,4.0,4001.0


## Filling Missing data with Scikit learn

In [48]:
car_sales_missing=pd.read_csv("car-sales-missing-data.csv")
car_sales_missing

Unnamed: 0,Mark,Color,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [64]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

categorical_imputer=SimpleImputer(strategy="constant", fill_value="missing")
door_imputer=SimpleImputer(strategy="constant", fill_value=4)
number_imputer=SimpleImputer(strategy="mean")

Transformer_1=ColumnTransformer([("cat_imputer", categorical_imputer,["Mark","Color"]),
                                 ("D_imputer",door_imputer,["Doors"]),
                                 ("num_imputer",number_imputer,["Odometer (KM)"])
                                ])
car_sales_missing_data_filled=Transformer_1.fit_transform(car_sales_missing)
car_sales_missing_data_filled_df=pd.DataFrame(car_sales_missing_data_filled)
car_sales_missing_data_filled_df.isna().sum()

0    0
1    0
2    0
3    0
dtype: int64

In [66]:
car_sales_missing_data_filled_df.columns=["Mark","Color","Odometer (KM)","Doors"]
car_sales_missing_data_filled_df

Unnamed: 0,Mark,Color,Odometer (KM),Doors
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,131302.278188
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0
...,...,...,...,...
995,Toyota,Black,4.0,35820.0
996,missing,White,3.0,155144.0
997,Nissan,Blue,4.0,66604.0
998,Honda,White,4.0,215883.0


In [77]:
car_sales_filled=pd.concat([car_sales_missing_data_filled_df,car_sales_missing['Price']], axis=True)
car_sales_filled=car_sales_filled.dropna()
car_sales_filled.isna().sum()

Mark             0
Color            0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64