# Model Pipeline Notebook

In [None]:
# %run ./model.requirements.ipynb

In [None]:
def build_pipeline():
    """Assemble the preprocessing pipeline applied to numerical columns.

    The stages run in this order:

    1. ``imputer`` - fill missing values with the column median.
    2. ``quantile_transformer`` - map values onto a normal output distribution.
    3. ``std_scaler`` - standard scaling.

    Returns:
        A new, unfitted ``Pipeline`` containing the three stages above.
    """
    median_imputer = SimpleImputer(strategy="median")
    to_normal = QuantileTransformer(output_distribution='normal')
    scaler = StandardScaler()

    stages = [
        ('imputer', median_imputer),
        ('quantile_transformer', to_normal),
        ('std_scaler', scaler),
    ]
    return Pipeline(stages)

### Without One-Hot Encoding

In [None]:
def train_test_split_processing(data: pd.DataFrame, target: str, test_size: float, shuffle: bool=True):
    """Split ``data`` into train/test sets and preprocess the feature columns.

    Every column other than ``target`` is treated as numerical and passed
    through ``build_pipeline()``. The pipeline is fitted on the training split
    only and then applied to the test split.

    Args:
        data: Frame holding the feature columns and the target column.
        target: Name of the target column in ``data``.
        test_size: Forwarded to ``train_test_split``.
        shuffle: Forwarded to ``train_test_split``.

    Returns:
        ``(x_train_prepared, x_test_prepared, y_train, y_test)``. The two
        feature frames carry the row index of their split, so they line up
        label-for-label with ``y_train`` / ``y_test``.
    """
    feature_set = data.drop(target, axis=1)
    target_set = data[target]

    numerical_columns = feature_set.columns.tolist()

    x_train, x_test, y_train, y_test = train_test_split(
        feature_set, target_set, test_size=test_size, shuffle=shuffle
    )

    full_pipeline = ColumnTransformer([
        ("num", build_pipeline(), numerical_columns),
    ])

    # Fit on the training rows only; the test rows are transformed with the
    # statistics learned from the training rows.
    x_train_prepared = full_pipeline.fit_transform(x_train)
    x_test_prepared = full_pipeline.transform(x_test)

    # The transformer returns plain arrays. Re-attach the split's own index so
    # the features stay aligned with y_train / y_test, which keep the original
    # (possibly shuffled) row labels.
    x_train_prepared = pd.DataFrame(
        x_train_prepared, columns=numerical_columns, index=x_train.index
    )
    x_test_prepared = pd.DataFrame(
        x_test_prepared, columns=numerical_columns, index=x_test.index
    )

    return x_train_prepared, x_test_prepared, y_train, y_test

### With One-Hot Encoding

In [None]:
def train_test_split_processing(data: pd.DataFrame, target: str, test_size: float, shuffle: bool=True):
    """Label-encode the categorical columns, split, then preprocess the numerical features.

    The columns named in ``categorical_columns`` (edit the list below) are
    label-encoded on a copy of ``data``. The frame is then split into train
    and test sets, and the numerical feature columns are passed through
    ``build_pipeline()``, fitted on the training split only.

    Args:
        data: Frame holding the feature columns and the target column. It is
            not modified.
        target: Name of the target column in ``data``.
        test_size: Forwarded to ``train_test_split``.
        shuffle: Forwarded to ``train_test_split``.

    Returns:
        ``(x_train, x_test, y_train, y_test)``. The feature frames contain the
        preprocessed numerical columns, in their original column order, and
        carry the row index of their split.
    """
    categorical_columns = ['']  # specify which columns are categorical

    # Encode on a copy so the caller's frame is left untouched.
    data = data.copy()

    for col in categorical_columns:
        labelencoder = LabelEncoder()
        data[col] = labelencoder.fit_transform(data[col])

    feature_set = data.drop(target, axis=1)
    target_set = data[target]

    # Keep the frame's own column order; a set difference would yield an
    # arbitrary order that can change between runs.
    categorical_lookup = set(categorical_columns)
    numerical_columns = [
        col for col in feature_set.columns if col not in categorical_lookup
    ]

    # Split before fitting so the imputer / quantile / scaler statistics are
    # learned from the training rows only (no test-set leakage).
    x_train, x_test, y_train, y_test = train_test_split(
        feature_set, target_set, test_size=test_size, shuffle=shuffle
    )

    # NOTE(review): only the numerical columns are selected here, so any
    # categorical *feature* columns are dropped from the output (the
    # ColumnTransformer default is remainder='drop'). Confirm this is intended.
    full_pipeline = ColumnTransformer([
        ("num", build_pipeline(), numerical_columns),
    ])

    x_train_prepared = full_pipeline.fit_transform(x_train)
    x_test_prepared = full_pipeline.transform(x_test)

    # Re-attach each split's index so the features stay aligned with
    # y_train / y_test.
    x_train = pd.DataFrame(
        x_train_prepared, columns=numerical_columns, index=x_train.index
    )
    x_test = pd.DataFrame(
        x_test_prepared, columns=numerical_columns, index=x_test.index
    )

    return x_train, x_test, y_train, y_test