In [378]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline

# To Avoid Warnings
import warnings
warnings.filterwarnings("ignore")

# Breast Cancer Wisconsin (Original) DATASET

In [196]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_original.data.features 
y = breast_cancer_wisconsin_original.data.targets 

In [197]:
X.columns

Index(['Clump_thickness', 'Uniformity_of_cell_size',
       'Uniformity_of_cell_shape', 'Marginal_adhesion',
       'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin',
       'Normal_nucleoli', 'Mitoses'],
      dtype='object')

In [198]:
y['Class'].unique()       # 2 for bignin & 4  for malignant

array([2, 4], dtype=int64)

In [199]:
X.drop('Bare_nuclei',axis=1,inplace=True)

In [200]:
X.isnull().sum()

Clump_thickness                0
Uniformity_of_cell_size        0
Uniformity_of_cell_shape       0
Marginal_adhesion              0
Single_epithelial_cell_size    0
Bland_chromatin                0
Normal_nucleoli                0
Mitoses                        0
dtype: int64

In [201]:
y['Class'] = y['Class'].apply(lambda x : 0 if x ==  2 else 1)

In [202]:
y['Class'].value_counts()

Class
0    458
1    241
Name: count, dtype: int64

In [203]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, random_state=24, test_size=0.3)

In [204]:
lr = LogisticRegression()
lr.fit(X,y)
y_pred = lr.predict(X_test)
# pd.crosstab(y_test['Class'].values,y_pred)     # insted of that we can use Confusion_Metrix() & Accoracy Score()

In [205]:
(135+68)/(135+68+4+3)

0.9666666666666667

In [206]:
print("Accuracy Score : ",accuracy_score(y_test['Class'],y_pred))
print("Confusion Metrix :\n ", confusion_matrix(y_test['Class'],y_pred))


Accuracy Score :  0.9666666666666667
Confusion Metrix :
  [[135   3]
 [  4  68]]


In [207]:
y['Class'].value_counts(normalize=True)

Class
0    0.655222
1    0.344778
Name: proportion, dtype: float64

In [208]:
y_test['Class'].value_counts()

Class
0    138
1     72
Name: count, dtype: int64

# Naive Rule (BaseLine Model) :
- always try this before applying Machine Learning Model ( if this model have more score than ML Model then Don't Use that ML model)

In [209]:
y_pred = np.zeros(210)

In [210]:
y_test['Class'].value_counts()

Class
0    138
1     72
Name: count, dtype: int64

In [211]:
accuracy_score(y_test['Class'],y_pred)    # something Wrong in code

0.6571428571428571

# Breast Cancer Wisconsin (Diagnostic) DATASET

In [212]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 

In [213]:
y['Diagnosis'] = y['Diagnosis'].apply(lambda x : 0 if x ==  "B" else 1)
# OR
# lbl = LabelEncoder()
# y['Diagnosis'] = lbl.fit_transform(y['Diagnosis'])

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, random_state=24, test_size=0.3)

In [215]:
lr = LogisticRegression()
lr.fit(X,y)
y_pred = lr.predict(X_test)

In [216]:
print("Accuracy Score : ",accuracy_score(y_test['Diagnosis'],y_pred))
print("Confusion Metrix :\n ", confusion_matrix(y_test['Diagnosis'],y_pred))

Accuracy Score :  0.9532163742690059
Confusion Metrix :
  [[102   4]
 [  4  61]]


In [217]:
y['Diagnosis'].value_counts()

Diagnosis
0    357
1    212
Name: count, dtype: int64

In [218]:
y_pred = np.zeros(y_test.shape[0])

In [219]:
accuracy_score(y_test['Diagnosis'],y_pred)

0.6198830409356725

# IRIS Dataset

In [245]:
iris = pd.read_csv("iris.csv")

In [246]:
lbl = LabelEncoder()
iris['Species'] = lbl.fit_transform(iris['Species'])

In [247]:
y = iris['Species']
X = iris.drop('Species',axis='columns')

In [252]:
y.value_counts()

Species
0    50
1    50
2    50
Name: count, dtype: int64

In [253]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, random_state=24, test_size=0.3)

In [254]:
lr = LogisticRegression()
lr.fit(X,y)
y_pred = lr.predict(X_test)

In [256]:
lbl.classes_

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [257]:
confusion_matrix(y_test,y_pred)

array([[15,  0,  0],
       [ 0, 11,  1],
       [ 0,  0, 18]], dtype=int64)

In [258]:
accuracy_score(y_test,y_pred)

0.9777777777777777

In [267]:
y_pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [264]:
y_test.value_counts()

Species
2    18
0    15
1    12
Name: count, dtype: int64

In [266]:
accuracy_score(y_test,y_pred)

0.3333333333333333

# Default Dataset

In [317]:
df = pd.read_csv('Default.csv')
df.columns

Index(['default', 'student', 'balance', 'income'], dtype='object')

In [318]:
df['default'] = df['default'].apply(lambda x : 0 if x ==  "No" else 1)
df['student'] = df['student'].apply(lambda x : 0 if x ==  "No" else 1)

In [319]:
y = df['default']
X = df.drop('default',axis = 'columns')

In [320]:
df['default'].value_counts()

default
0    9667
1     333
Name: count, dtype: int64

In [329]:
df['student'].value_counts()

student
0    7056
1    2944
Name: count, dtype: int64

In [322]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, random_state=24, test_size=0.3)

In [323]:
lr = LogisticRegression()
lr.fit(X,y)
y_pred = lr.predict(X_test)

In [324]:
confusion_matrix(y_test,y_pred)

array([[2886,   15],
       [  74,   25]], dtype=int64)

In [325]:
accuracy_score(y_test,y_pred)

0.9703333333333334

In [344]:
y_test.shape

(3000,)

In [345]:
# Naive Rule Check

In [346]:
y_pred = np.zeros(df.shape[0])
accuracy_score(df['default'],y_pred)   

0.9667

In [347]:
# OR we can do this using ct & Pipeline

In [348]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, random_state=24, test_size=0.3)

In [352]:
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')

In [380]:
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)),
                            (ohe, make_column_selector(dtype_include=object)),
                             verbose_feature_names_out=False).set_output(transform='pandas')
pipe = Pipeline([('CT', ct),('LR', lr)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9074074074074074


# With K-Fold Cross-Validation

In [381]:
results = cross_val_score(pipe, X, y)
results.mean()

0.9555555555555555

# Wine DataSet

In [366]:
wine = pd.read_csv('wine.csv')
wine

Unnamed: 0,Class,Alcohol,Malic,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoid,Proanthocyanins,Intensity,Hue,OD280,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [370]:
y = wine['Class']
X = wine.drop('Class', axis=1)

In [371]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, random_state=24, test_size=0.3)

In [372]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [373]:
accuracy_score(y_test,y_pred)

0.9074074074074074