#### Regularization
**Overfitting**  is one of the most serious kinds of problems related to machine learning. It occurs when a model learns the training data too well. The model then learns not only the relationships among data but also the noise in the dataset. Overfitted models tend to have good performance with the data used to fit them (the training data), but they behave poorly with unseen data (or test data, which is data not used to fit the model).

In [1]:
import pandas as pd

df = pd.read_csv('Heart.csv')
df.head()

Unnamed: 0,Id,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         303 non-null    int64  
 1   Age        303 non-null    int64  
 2   Sex        303 non-null    int64  
 3   ChestPain  303 non-null    object 
 4   RestBP     303 non-null    int64  
 5   Chol       303 non-null    int64  
 6   Fbs        303 non-null    int64  
 7   RestECG    303 non-null    int64  
 8   MaxHR      303 non-null    int64  
 9   ExAng      303 non-null    int64  
 10  Oldpeak    303 non-null    float64
 11  Slope      303 non-null    int64  
 12  Ca         299 non-null    float64
 13  Thal       301 non-null    object 
 14  AHD        303 non-null    object 
dtypes: float64(2), int64(10), object(3)
memory usage: 35.6+ KB


In [3]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 0 to 301
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         297 non-null    int64  
 1   Age        297 non-null    int64  
 2   Sex        297 non-null    int64  
 3   ChestPain  297 non-null    object 
 4   RestBP     297 non-null    int64  
 5   Chol       297 non-null    int64  
 6   Fbs        297 non-null    int64  
 7   RestECG    297 non-null    int64  
 8   MaxHR      297 non-null    int64  
 9   ExAng      297 non-null    int64  
 10  Oldpeak    297 non-null    float64
 11  Slope      297 non-null    int64  
 12  Ca         297 non-null    float64
 13  Thal       297 non-null    object 
 14  AHD        297 non-null    object 
dtypes: float64(2), int64(10), object(3)
memory usage: 37.1+ KB


In [4]:
# le ti shohim ndarjen e te dhenve kategorike apo grupimet e tyre
print(df['ChestPain'].unique())
print(df['AHD'].unique())
print(df['Thal'].unique())

['typical' 'asymptomatic' 'nonanginal' 'nontypical']
['No' 'Yes']
['fixed' 'normal' 'reversable']


In [5]:
df_part = df[['ChestPain', 'AHD', 'Thal']]
for column3 in df_part:
    df[column3] = df[column3].astype('category')
    df[column3] = df[column3].cat.codes

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 0 to 301
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         297 non-null    int64  
 1   Age        297 non-null    int64  
 2   Sex        297 non-null    int64  
 3   ChestPain  297 non-null    int8   
 4   RestBP     297 non-null    int64  
 5   Chol       297 non-null    int64  
 6   Fbs        297 non-null    int64  
 7   RestECG    297 non-null    int64  
 8   MaxHR      297 non-null    int64  
 9   ExAng      297 non-null    int64  
 10  Oldpeak    297 non-null    float64
 11  Slope      297 non-null    int64  
 12  Ca         297 non-null    float64
 13  Thal       297 non-null    int8   
 14  AHD        297 non-null    int8   
dtypes: float64(2), int64(10), int8(3)
memory usage: 31.0 KB


In [6]:
df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 0 to 301
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         297 non-null    int64  
 1   Age        297 non-null    int64  
 2   Sex        297 non-null    int64  
 3   ChestPain  297 non-null    int8   
 4   RestBP     297 non-null    int64  
 5   Chol       297 non-null    int64  
 6   Fbs        297 non-null    int64  
 7   RestECG    297 non-null    int64  
 8   MaxHR      297 non-null    int64  
 9   ExAng      297 non-null    int64  
 10  Oldpeak    297 non-null    float64
 11  Slope      297 non-null    int64  
 12  Ca         297 non-null    float64
 13  Thal       297 non-null    int8   
 14  AHD        297 non-null    int8   
dtypes: float64(2), int64(10), int8(3)
memory usage: 31.0 KB


In [7]:
X = df.drop(columns='AHD')
y = df['AHD']

In [8]:
# standartizimi ose Z-score

from sklearn.preprocessing import StandardScaler

stand_scaler = StandardScaler()
X_scaler = stand_scaler.fit_transform(X)
X_scaler

array([[-1.71690766,  0.93618065,  0.69109474, ...,  2.26414539,
        -0.72197605, -2.2712801 ],
       [-1.70543663,  1.3789285 ,  0.69109474, ...,  0.6437811 ,
         2.47842525, -0.55917302],
       [-1.69396561,  1.3789285 ,  0.69109474, ...,  0.6437811 ,
         1.41162482,  1.15293406],
       ...,
       [ 1.71292949,  1.48961547,  0.69109474, ...,  0.6437811 ,
         1.41162482,  1.15293406],
       [ 1.72440052,  0.27205887,  0.69109474, ...,  0.6437811 ,
         0.34482438,  1.15293406],
       [ 1.73587155,  0.27205887, -1.44697961, ...,  0.6437811 ,
         0.34482438, -0.55917302]])

In [9]:
from sklearn.model_selection import train_test_split

X_scaler_train, X_scaler_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=34)

In [10]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_scaler_train, y_train)

LogisticRegression()

In [11]:
y_predc_train = log_reg.predict(X_scaler_train)
y_predc_train

array([0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1], dtype=int8)

In [12]:
from sklearn.metrics import accuracy_score

coef_acc = accuracy_score(y_train, y_predc_train)
coef_acc

0.869198312236287