# Chapter 6 - Solving Classification Problems with Scikit-Learn Library

## 6.1. Preparing Data for Classification Problems

In [1]:
# importing required libraries
import pandas as pd
import numpy as np

In [2]:
# importing the dataset
# The path is written as a raw string: a Windows path contains backslashes
# (\H, \D, \c here) that are invalid escape sequences in a normal string
# literal and trigger a SyntaxWarning/DeprecationWarning on recent Pythons.
# NOTE(review): this is an absolute, machine-specific path; adjust it to your data location.
churn_df = pd.read_csv(r"E:\Hands on Python for Data Science and Machine Learning\Datasets\customer_churn.csv")

# preview the first rows of the loaded frame
churn_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# removing unnecessary columns
columns_to_remove = ['RowNumber', 'CustomerId', 'Surname']
churn_df = churn_df.drop(columns=columns_to_remove)

### 6.1.1. Dividing Data into Features and Labels

In [4]:
# label set: the 'Exited' column on its own
y = churn_df.loc[:, 'Exited']

# feature set: every column except the label
X = churn_df.drop(columns=['Exited'])


In [5]:
# preview the first five rows of the feature set
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [6]:
# preview the first five values of the label set
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

### 6.1.2. Converting Categorical Data to Numbers



In [7]:
# keep only the numerical columns by dropping the two categorical ones
numerical = X.drop(columns=['Geography', 'Gender'])


In [8]:
# preview the first five rows of the numerical-only columns
numerical.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,42,2,0.0,1,1,1,101348.88
1,608,41,1,83807.86,1,0,1,112542.58
2,502,42,8,159660.8,3,1,0,113931.57
3,699,39,1,0.0,2,0,0,93826.63
4,850,43,2,125510.82,1,1,1,79084.1


In [9]:
# select just the categorical columns and preview them
categorical = X.filter(items=['Geography', 'Gender'])
categorical.head()

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female


In [10]:
# converting categorical columns to one-hot encoded columns
import pandas as pd

# drop_first=True drops the first level of every categorical column, so the
# remaining indicator columns are not redundant with each other.
# dtype=int keeps the indicators as 0/1 integers: newer pandas versions return
# booleans by default, which would not match the numeric frame built from them.
cat_numerical = pd.get_dummies(categorical, drop_first=True, dtype=int)
cat_numerical.head()

Unnamed: 0,Geography_Germany,Geography_Spain,Gender_Male
0,0,0,0
1,0,1,0
2,0,0,0
3,0,0,0
4,0,1,0


In [11]:
# placing the numerical columns and the one-hot encoded columns side by side
X = pd.concat(objs=[numerical, cat_numerical], axis='columns')
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


### 6.1.3. Divide Data into Training and Test Sets

In [12]:
# splitting helper from sklearn
from sklearn.model_selection import train_test_split

# hold out 20% of the records as the test set; the fixed seed makes the split repeatable
churn_splits = train_test_split(X, y, random_state=0, test_size=0.20)
X_train, X_test, y_train, y_test = churn_splits

### 6.1.4. Data Scaling/Normalization

In [13]:
# standard scaler from sklearn
from sklearn.preprocessing import StandardScaler

# learn the scaling statistics from the training set only ...
sc = StandardScaler().fit(X_train)

# ... and apply that same transformation to both splits
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## 6.2. Binary Classification Problems

### 6.2.1. Logistic Regression 

In [14]:
# logistic regression classifier and evaluation helpers from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# fit the classifier on the training set
log_clf = LogisticRegression()
classifier = log_clf.fit(X=X_train, y=y_train)

# predict the labels of the test set
y_pred = classifier.predict(X_test)

# confusion matrix, per-class report and overall accuracy on the test set
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[1526   69]
 [ 309   96]]
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1595
           1       0.58      0.24      0.34       405

    accuracy                           0.81      2000
   macro avg       0.71      0.60      0.61      2000
weighted avg       0.78      0.81      0.78      2000

0.811


### 6.2.2. KNN Classification

In [15]:
# KNN classifier and evaluation helpers from sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# KNN classifier that votes among the 5 nearest neighbours, trained on the training set
knn_clf = KNeighborsClassifier(n_neighbors=5)
classifier = knn_clf.fit(X_train, y_train)

# predictions for the test set
y_pred = classifier.predict(X_test)

# evaluate on the test set: confusion matrix, per-class report, accuracy
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_confusion, test_report, test_accuracy, sep="\n")

[[1486  109]
 [ 237  168]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.90      1595
           1       0.61      0.41      0.49       405

    accuracy                           0.83      2000
   macro avg       0.73      0.67      0.69      2000
weighted avg       0.81      0.83      0.81      2000

0.827


### 6.2.3. Random Forest Classification

In [16]:
# random forest classifier and evaluation helpers from sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# forest of 500 trees with a fixed seed, trained on the training set
rf_clf = RandomForestClassifier(n_estimators=500, random_state=42)
classifier = rf_clf.fit(X_train, y_train)

# predictions for the test set
y_pred = classifier.predict(X_test)

# evaluate on the test set
evaluation_outputs = [
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
]
for evaluation_output in evaluation_outputs:
    print(evaluation_output)

[[1521   74]
 [ 196  209]]
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1595
           1       0.74      0.52      0.61       405

    accuracy                           0.86      2000
   macro avg       0.81      0.73      0.76      2000
weighted avg       0.86      0.86      0.86      2000

0.865


### 6.2.4. K Fold Cross Validation

In [17]:
# cross validation helper from sklearn
from sklearn.model_selection import cross_val_score

# 5-fold cross validation scored by accuracy.
# X and y here are the full feature/label sets built earlier, not the scaled train/test splits.
fold_accuracies = cross_val_score(estimator=classifier, X=X, y=y, scoring="accuracy", cv=5)
print(fold_accuracies)

[0.864  0.8725 0.8625 0.863  0.8625]


### 6.2.5. Predicting a Single Value

In [18]:
#printing information about the 101st record in the dataset (index label 100)
churn_df.loc[100]

CreditScore           665
Geography          France
Gender             Female
Age                    40
Tenure                  6
Balance                 0
NumOfProducts           1
HasCrCard               1
IsActiveMember          1
EstimatedSalary    161848
Exited                  0
Name: 100, dtype: object

In [19]:
# importing random forest classifier from sklearn
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42, n_estimators=500)

# training the algorithm on the training set
classifier = rf_clf.fit(X_train, y_train)

# scaling the single record
# X.iloc[[100]] (note the double brackets) keeps the row as a one-row DataFrame:
# the scaler was fitted on a DataFrame, so it receives the same column names it
# was fitted with, no manual reshape is needed, and the result does not depend
# on the dtype that X.values would collapse the mixed columns to.
single_record = sc.transform(X.iloc[[100]])

# making predictions on the 101st record from the dataset
predicted_churn = classifier.predict(single_record)
print(predicted_churn)

[0]


## 6.3. Multi-Class Classification

In [20]:
# example of a multiclass classification problem

from sklearn.datasets import make_classification

# dummy dataset: 2000 samples, 12 features (8 informative, 4 redundant), 4 classes
dataset_options = dict(
    n_samples=2000,
    n_features=12,
    n_informative=8,
    n_redundant=4,
    n_classes=4,
    random_state=42,
)
X, y = make_classification(**dataset_options)

# shapes of the feature matrix and of the label vector
print(X.shape, y.shape)

(2000, 12) (2000,)


In [21]:
# distinct class labels present in the generated label array
np.unique(y)

array([0, 1, 2, 3])

In [22]:
# splitting helper and standard scaler from sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# hold out 20% of the samples as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.20)

# feature scaling: fit the scaler on the training split only, then transform both splits
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [23]:
# random forest classifier and evaluation helpers from sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# train a 500-tree forest (fixed seed) on the multiclass training set
rf_clf = RandomForestClassifier(n_estimators=500, random_state=42)
rf_clf.fit(X_train, y_train)
classifier = rf_clf

# predictions for the test set
y_pred = classifier.predict(X_test)

# evaluate on the test set: confusion matrix, per-class report, accuracy
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep="\n",
)

[[74  7  6  4]
 [ 5 92  8  4]
 [ 5  7 82 14]
 [11  3  6 72]]
              precision    recall  f1-score   support

           0       0.78      0.81      0.80        91
           1       0.84      0.84      0.84       109
           2       0.80      0.76      0.78       108
           3       0.77      0.78      0.77        92

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400

0.8


### 6.3.1. One-vs-Rest for Multiclass Classification

In [24]:
# import the one-vs-rest wrapper, the logistic regression model and the metrics from sklearn
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# base binary model
log_clf = LogisticRegression()

# one-vs-rest (ovr) strategy wrapped around the base model
clf = OneVsRestClassifier(estimator=log_clf)

# train on the training set and predict the test set
classifier = clf.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# evaluate on the test set
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[58 11 15  7]
 [ 9 71  9 20]
 [ 9 23 53 23]
 [17 11 11 53]]
              precision    recall  f1-score   support

           0       0.62      0.64      0.63        91
           1       0.61      0.65      0.63       109
           2       0.60      0.49      0.54       108
           3       0.51      0.58      0.54        92

    accuracy                           0.59       400
   macro avg       0.59      0.59      0.59       400
weighted avg       0.59      0.59      0.59       400

0.5875


### 6.3.2. One-vs-One for Multiclass Classification

In [25]:
# import the one-vs-one wrapper, the logistic regression model and the metrics from sklearn
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# base binary model
log_clf = LogisticRegression()

# one-vs-one (ovo) strategy wrapped around the base model
clf = OneVsOneClassifier(estimator=log_clf)

# train on the training set
clf.fit(X_train, y_train)
classifier = clf

# predictions for the test set
y_pred = classifier.predict(X_test)

# evaluate on the test set
ovo_confusion = confusion_matrix(y_test, y_pred)
ovo_report = classification_report(y_test, y_pred)
ovo_accuracy = accuracy_score(y_test, y_pred)
print(ovo_confusion)
print(ovo_report)
print(ovo_accuracy)

[[59  8 15  9]
 [ 8 75 10 16]
 [ 6 23 54 25]
 [ 9  8 10 65]]
              precision    recall  f1-score   support

           0       0.72      0.65      0.68        91
           1       0.66      0.69      0.67       109
           2       0.61      0.50      0.55       108
           3       0.57      0.71      0.63        92

    accuracy                           0.63       400
   macro avg       0.64      0.64      0.63       400
weighted avg       0.64      0.63      0.63       400

0.6325


## 6.4. Multi-label Classification

In [28]:
# example of a multi-label classification problem

from sklearn.datasets import make_multilabel_classification

# dummy dataset: 2000 samples, 10 features, 5 possible labels per sample
multilabel_options = {
    "n_samples": 2000,
    "n_features": 10,
    "n_classes": 5,
    "n_labels": 3,
    "random_state": 42,
}
X, y = make_multilabel_classification(**multilabel_options)

# shapes of the feature matrix and of the label matrix
print(X.shape, y.shape)

(2000, 10) (2000, 5)


In [30]:
# label vector of one sample (index 200): one 0/1 entry per label
y[200]

array([0, 1, 0, 1, 1])

In [31]:
# splitting helper and standard scaler from sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 80/20 train/test split with a fixed seed
multilabel_splits = train_test_split(X, y, random_state=0, test_size=0.20)
X_train, X_test, y_train, y_test = multilabel_splits

# feature scaling: statistics come from the training split and are reused for the test split
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [33]:
#importing random forest classifier from sklearn
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42, n_estimators=500)

#training the random forest classifier
classifier = rf_clf.fit(X_train, y_train)

#making predictions on the test set
y_pred = classifier.predict(X_test)


#evaluating the algorithm on test set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# zero_division=0 states explicitly that an undefined precision/recall/F-score
# is reported as 0. These are the same numbers the default produces, but without
# the undefined-metric warnings the default raises for this multi-label data.
print(classification_report(y_test, y_pred, zero_division=0))

# for multi-label targets accuracy_score is the subset accuracy: a sample only
# counts as correct when every one of its labels is predicted correctly
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.75      0.77       189
           1       0.87      0.94      0.90       294
           2       0.81      0.93      0.87       239
           3       0.73      0.90      0.81       230
           4       0.73      0.36      0.48       100

   micro avg       0.80      0.84      0.82      1052
   macro avg       0.79      0.78      0.77      1052
weighted avg       0.80      0.84      0.81      1052
 samples avg       0.82      0.83      0.80      1052

0.37


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
# import one vs rest and logistic regression model from sklearn
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()

# define the ovr strategy
clf = OneVsRestClassifier(log_clf)

#training the logistic regression classifier
classifier = clf.fit(X_train, y_train)

#making predictions on the test set
y_pred = classifier.predict(X_test)

#evaluating the algorithm on test set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# zero_division=0 states explicitly that an undefined precision/recall/F-score
# is reported as 0. These are the same numbers the default produces, but without
# the undefined-metric warning the default raises for this multi-label data.
print(classification_report(y_test, y_pred, zero_division=0))

# for multi-label targets accuracy_score is the subset accuracy: a sample only
# counts as correct when every one of its labels is predicted correctly
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.77      0.77       189
           1       0.88      0.90      0.89       294
           2       0.78      0.86      0.82       239
           3       0.71      0.81      0.75       230
           4       0.71      0.37      0.49       100

   micro avg       0.78      0.80      0.79      1052
   macro avg       0.77      0.74      0.74      1052
weighted avg       0.78      0.80      0.78      1052
 samples avg       0.79      0.81      0.76      1052

0.2725


  _warn_prf(average, modifier, msg_start, len(result))


## Exercise 6.1


### Question 1

Which of the following is not an example of classification outputs:

A- True \
B- Red \
C- Male \
D- None of the above

Answer: D
    
    
### Question 2

Which of the following metrics is used for unbalanced classification datasets?

A- Accuracy \
B- F1 \
C- Precision \
D- Recall

Answer: B


### Question 3

Which of the following functions is used to convert categorical values to one-hot encoded numerical values?

A- pd.get_onehot() \
B- pd.get_dummies() \
C- pd.get_numeric() \
D- All of the above

Answer: B

## Exercise 6.2

Using the `iris` dataset from the seaborn library, train a classification algorithm of your choice that predicts the species of the iris plant. Perform all the preprocessing steps.

### Solution:

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns

# load the iris dataset bundled with seaborn
iris_df = sns.load_dataset("iris")

# features: every column except the label; label: the 'species' column
X = iris_df.drop(['species'], axis=1)
y = iris_df["species"]


# hold out 20% of the records as the test set (fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.20, random_state=0)

# feature scaling: fit on the training split only, then transform both splits
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)

# train a random forest classifier on the training set
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42, n_estimators=500)

classifier = rf_clf.fit(X_train, y_train)

# predictions for the test set
y_pred = classifier.predict(X_test)


# evaluate on the test set: confusion matrix, per-class report, accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))


[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        11
  versicolor       1.00      1.00      1.00        13
   virginica       1.00      1.00      1.00         6

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

1.0
