In [110]:
def load_data():

    import pandas as pd

    data_test=pd.read_csv("../files/input/test_data.csv.zip",
                          index_col=False,
                          compression="zip")
    data_traint=pd.read_csv("../files/input/train_data.csv.zip", 
                            index_col=False, 
                            compression="zip")
    
    return data_test,data_traint

In [103]:
def clean_data(data):
    df=data.copy()
    df.rename(columns={"default payment next month":"default"}, inplace=True)
    df.drop(columns=['ID'], inplace=True)
    df.dropna()
    df['EDUCATION'] = df['EDUCATION'].apply(lambda x: x if x <= 4 else 4)
    return df



In [114]:

def make_train_test_split(data_test,data_traint):

    x_train = data_traint.drop(columns=['default']) 
    y_train = data_traint['default']  
    x_test = data_test.drop(columns=['default'])  
    y_test = data_test['default']  
    
    return x_train, x_test, y_train, y_test

In [None]:
# def make_train_test_split(x, y):

#     from sklearn.model_selection import train_test_split

#     (x_train, x_test, y_train, y_test) = train_test_split(
#         x,
#         y,
#         test_size=0.25,
#         random_state=123456,
#     )
#     return x_train, x_test, y_train, y_test

In [125]:
def make_pipeline(estimator,data):

    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import Pipeline

    string_columns = data.columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), string_columns)
        ],
        remainder='passthrough' 
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ("estimator", estimator)
        ],
        verbose=False,
    )

    return pipeline

In [120]:
def make_grid_search(estimator, param_grid, cv=10):

    from sklearn.model_selection import GridSearchCV

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring="balanced_accuracy",
    )

    return grid_search 

In [117]:
def save_estimator_compressed(estimator, file_path="files/models/model.pkl.gz"):
    import gzip
    import pickle
    with gzip.open(file_path, "wb") as file:
        pickle.dump(estimator, file)

In [118]:
def load_estimator_compressed(file_path="files/models/model.pkl.gz"):
    import gzip
    import pickle
    # Abrir el archivo comprimido en modo de lectura binaria
    with gzip.open(file_path, "rb") as file:
        estimator = pickle.load(file)
    return estimator

In [128]:
def train_RandomForestClassifier():

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import balanced_accuracy_score

    data_test,data_traint = load_data()

    data_test=clean_data(data_test)
    data_traint=clean_data(data_traint)

    x_train, x_test, y_train, y_test = make_train_test_split(
        data_test=data_test,
        data_traint=data_traint
        
    )

    pipeline = make_pipeline(
        estimator=RandomForestClassifier(n_estimators=100, random_state=42)
    )

    estimator = make_grid_search(
        estimator=pipeline,
        param_grid={
               'classifier__n_estimators': [100, 200],
                'classifier__max_depth': [None, 10, 20],
                'classifier__min_samples_split': [2, 5],
                'classifier__min_samples_leaf': [1, 2],
                'classifier__max_features': ['auto', 'sqrt'],
                'classifier__class_weight': [None, 'balanced']
        },
        cv=10,
    )

    estimator.fit(x_train, y_train)

    best_estimator = load_estimator_compressed()

    if best_estimator is not None:

        saved_balanced_accuracy = balanced_accuracy_score(
            y_true=y_test, y_pred=best_estimator.predict(x_test)
        )

        current_balanced_accuracy = balanced_accuracy_score(
            y_true=y_test, y_pred=estimator.predict(x_test)
        )

        if current_balanced_accuracy < saved_balanced_accuracy:
            estimator = best_estimator

    save_estimator_compressed(estimator)

train_RandomForestClassifier()

TypeError: make_pipeline() missing 1 required positional argument: 'data'

sin funciones 

In [106]:
import pandas as pd

data_test=pd.read_csv("../files/input/test_data.csv.zip",
                    index_col=False,
                    compression="zip")
data_traint=pd.read_csv("../files/input/train_data.csv.zip", 
                        index_col=False, 
                        compression="zip")
data_test.head(5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0
2,11,200000,2,3,2,34,0,0,2,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,15,250000,1,1,2,29,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,16,50000,2,3,3,23,1,2,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0


In [107]:
data_test.rename(columns={"default payment next month":"default"}, inplace=True)
data_traint.rename(columns={"default payment next month":"default"}, inplace=True)
data_test.drop(columns=['ID'], inplace=True)
data_traint.drop(columns=['ID'], inplace=True)
data_test.dropna()
data_traint.dropna()
data_test['EDUCATION'] = data_test['EDUCATION'].apply(lambda x: x if x <= 4 else 4)
data_traint['EDUCATION'] = data_traint['EDUCATION'].apply(lambda x: x if x <= 4 else 4)

In [108]:
data_test['default']

0       1
1       0
2       0
3       0
4       0
       ..
8995    0
8996    0
8997    0
8998    1
8999    1
Name: default, Length: 9000, dtype: int64