# Basic classifiers for Binary classification

This notebook is based on the following blogs:
1. https://towardsdatascience.com/a-quick-overview-of-5-scikit-learn-classification-algorithms-33fdc11ab0b9

In [2]:
from sklearn.datasets import load_iris
import pandas as pd

## Read data

In [3]:
data = load_iris()

In [9]:
df = pd.DataFrame(data = data['data'], columns = data['feature_names'])
df['species'] = data['target']
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [8]:
df.shape

(150, 4)

In [15]:
df.species.value_counts()

2    50
1    50
0    50
Name: species, dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   species            150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


-----

## Split to train/test

In [22]:
import numpy as np

In [16]:
from sklearn.model_selection import train_test_split

In [32]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=['species']), 
                                                    df.species,
                                                    test_size = 0.2, 
                                                    random_state=13)

------

## Classifiers

#### 1. Logistic regression Classifier

In [69]:
from sklearn.linear_model import LogisticRegression

In [70]:
clf = LogisticRegression().fit(x_train, y_train)
predictions = clf.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [71]:
predictions[:5]

array([1, 1, 0, 2, 2])

#### Model evaluation

In [72]:
from sklearn.metrics import classification_report

In [73]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.89      1.00      0.94         8
           2       1.00      0.92      0.96        13

    accuracy                           0.97        30
   macro avg       0.96      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



#### 2. KNN Classifier

In [74]:
from sklearn.neighbors import KNeighborsClassifier

In [75]:
neigh = KNeighborsClassifier()

In [76]:
neigh.fit(x_train, y_train)

KNeighborsClassifier()

In [77]:
predictions = neigh.predict(x_test)

In [78]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.73      1.00      0.84         8
           2       1.00      0.77      0.87        13

    accuracy                           0.90        30
   macro avg       0.91      0.92      0.90        30
weighted avg       0.93      0.90      0.90        30



#### 3. Decision Tree

In [79]:
from sklearn.tree import DecisionTreeClassifier

In [80]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)

In [81]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         8
           2       1.00      1.00      1.00        13

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



##### Maybe it overfitted? Let's try cross validation

In [82]:
from sklearn.model_selection import cross_val_score

In [83]:
cv_results = cross_val_score(clf, x_train, y_train, cv=10)

In [84]:
cv_results

array([0.83333333, 0.91666667, 1.        , 0.91666667, 0.83333333,
       1.        , 0.91666667, 1.        , 1.        , 0.91666667])

#### 4. Random Forest 

In [85]:
from sklearn.ensemble import RandomForestClassifier

In [86]:
clf = RandomForestClassifier(random_state=0)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)

In [87]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.89      1.00      0.94         8
           2       1.00      0.92      0.96        13

    accuracy                           0.97        30
   macro avg       0.96      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



#### 5. Gradient boosting

In [88]:
from sklearn.ensemble import GradientBoostingClassifier

In [95]:
grad_boosting = GradientBoostingClassifier(random_state=0)
grad_boosting.fit(x_train, y_train)
preds = grad_boosting.predict(x_test)

In [96]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.89      1.00      0.94         8
           2       1.00      0.92      0.96        13

    accuracy                           0.97        30
   macro avg       0.96      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

