In [22]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold,cross_validate

def load(path="diabetes.csv"):
    """Load the diabetes CSV and split it into features and encoded target.

    As a side effect, prints the DataFrame summary and the class
    distribution of the ``Outcome`` column.

    Args:
        path: CSV file to read. Defaults to ``"diabetes.csv"``, the file this
            function has always read.

    Returns:
        A tuple ``(df, X, y_encoded)``: the full DataFrame, the DataFrame
        without the ``Outcome`` column, and the ``Outcome`` column encoded
        with ``LabelEncoder``.
    """
    df = pd.read_csv(path, sep=",")
    # DataFrame.info() writes the summary itself and returns None; wrapping it
    # in print() only appended a stray "None" line to the output.
    df.info()
    print("dist clases")
    print(df['Outcome'].value_counts())
    X = df.drop('Outcome', axis=1)
    y_encoded = LabelEncoder().fit_transform(df['Outcome'])
    return df, X, y_encoded

def LogisticR(X_resampled,y_resampled):
    """Cross-validate a LogisticRegressionCV model and print its scores.

    The classifier is built with ``cv=5``, ``random_state=42`` and
    ``max_iter=1000`` and evaluated with ``cross_val_score`` using that
    function's default splitting and scoring. The per-fold scores and their
    mean are printed; nothing is returned.

    Args:
        X_resampled: Feature matrix to evaluate on.
        y_resampled: Target labels aligned with ``X_resampled``.
    """
    fold_scores = cross_val_score(
        LogisticRegressionCV(cv=5, random_state=42, max_iter=1000),
        X_resampled,
        y_resampled,
    )
    print("acc ", fold_scores)
    print("acc mean ", fold_scores.mean())
    
def randomOverS(df,X,y_encoded):
    """Report the oversampled class balance and cross-validate without leakage.

    The minority class is randomly oversampled to 70% of the majority class.
    The resulting class distribution on the full data is printed for
    reference only. The accuracy estimate is computed on the ORIGINAL data
    with the sampler placed inside a pipeline, so resampling is applied to
    the training portion of each fold. Oversampling first and then
    cross-validating lets duplicated minority rows land in both the training
    and the test fold, which inflates the score.

    Args:
        df: Unused; kept so existing callers keep working.
        X: Feature matrix.
        y_encoded: Binary target labels aligned with ``X`` (a float
            ``sampling_strategy`` is only defined for two classes).
    """
    desired_ratio = 0.7  # minority / majority ratio after oversampling

    # Resample the whole dataset only to show what the balanced distribution
    # looks like; these rows are not used for model evaluation.
    reporting_sampler = RandomOverSampler(
        sampling_strategy=desired_ratio, random_state=42
    )
    _, y_resampled = reporting_sampler.fit_resample(X, y_encoded)
    print("resampled")
    print(pd.Series(y_resampled, name='Outcome').value_counts())

    # Evaluate with the sampler inside the pipeline so each test fold stays
    # free of duplicated training rows.
    model = Pipeline([
        ('oversample', RandomOverSampler(sampling_strategy=desired_ratio, random_state=42)),
        ('logreg', LogisticRegressionCV(cv=5, random_state=42, max_iter=1000)),
    ])
    scores = cross_val_score(model, X, y_encoded)
    print("acc ", scores)
    print("acc mean ", scores.mean())

def main():
    """Run the baseline and the oversampling-pipeline evaluations.

    Loads the data, prints the baseline cross-validation scores via
    ``LogisticR``, then cross-validates a pipeline that chains
    ``RandomOverSampler(sampling_strategy=0.7)`` with ``LogisticRegressionCV``
    on a shuffled 5-fold stratified split, scored by balanced accuracy.
    The mean and standard deviation of the test scores are printed.
    """
    _, features, target = load()
    LogisticR(features, target)
    # The standalone randomOverS(df, X, y) experiment is left disabled here.

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    pipeline = Pipeline(steps=[
        ('oversample', RandomOverSampler(sampling_strategy=0.7, random_state=42)),
        ('logreg', LogisticRegressionCV(cv=5, random_state=42, max_iter=1000)),
    ])

    cv_results = cross_validate(
        pipeline,
        features,
        target,
        scoring="balanced_accuracy",
        return_train_score=True,
        return_estimator=True,
        cv=splitter,
        n_jobs=1,
    )

    test_scores = cv_results['test_score']
    print(
        f"Balanced accuracy mean +/- std. dev.: "
        f"{test_scores.mean():.3f} +/- "
        f"{test_scores.std():.3f}"
    )
    
# Run the experiment only when this code is executed directly, so importing
# the module does not trigger data loading and model training.
if __name__ == "__main__":
    main()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
dist clases
Outcome
0    500
1    268
Name: count, dtype: int64
acc  [0.75974026 0.74675325 0.75974026 0.79738562 0.77124183]
acc mean  0.7669722434428318
Balanced accuracy mean +/- std. dev.: 0.731 +/- 0.028
