In [None]:
# Read the customer records to be scored; the relative path is resolved
# against the kernel's current working directory.
new_data = pd.read_csv('new_customer_data.csv')  # Load new data from CSV file

In [None]:
def raw_preprocessor(df):
    """Apply the raw clean-up steps that run before the column preprocessor.

    The input frame is copied first, so the caller's data is never modified.
    Each step is applied only when its column is present:

    * ``customerID`` is dropped.
    * ``TotalCharges`` is coerced to numeric (unparseable values become NaN)
      and the NaNs are filled with the median of the column in ``df``.
    * ``SeniorCitizen`` values 0/1 are replaced by ``'No'``/``'Yes'``.
    * ``tenure`` is replaced by ``TenureGroup``, a categorical column of
      half-year buckets covering 0 to 72 inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw customer records.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.
    """
    df = df.copy()

    if 'customerID' in df.columns:
        df = df.drop(columns='customerID')

    if 'TotalCharges' in df.columns:
        total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): the chained in-place form works on
        # a temporary under pandas Copy-on-Write and leaves the NaNs in place.
        # NOTE(review): the median comes from whatever frame is passed in, so
        # at predict time it is computed on the new data, not the training data.
        df['TotalCharges'] = total_charges.fillna(total_charges.median())

    if 'SeniorCitizen' in df.columns:
        df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})

    if 'tenure' in df.columns:
        bins = [0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72]
        labels = ['0-0.5 Year', '0.5-1 Year', '1-1.5 Years', '1.5-2 Years', '2-2.5 Years', '2.5-3 Years', '3-3.5 Years', '3.5-4 Years', '4-4.5 Years', '4.5-5 Years', '5-5.5 Years', '5.5-6 Years']
        # Buckets are left-closed: [0, 6), [6, 12), ..., [66, 72).
        tenure_group = pd.cut(df['tenure'], bins=bins, labels=labels, right=False)
        # right=False leaves the top edge (exactly 72) outside every bucket,
        # which used to yield NaN; it belongs to the last bucket.
        at_top_edge = (df['tenure'] == bins[-1]).to_numpy()
        tenure_group.loc[at_top_edge] = labels[-1]
        df['TenureGroup'] = tenure_group
        df = df.drop(columns='tenure')

    return df

# Wrap the clean-up function so it can be used as a pipeline step. This rebinds
# the name: from here on `raw_preprocessor` is the FunctionTransformer, and the
# plain function is only reachable through it.
raw_preprocessor = FunctionTransformer(raw_preprocessor)

In [None]:
# Numeric columns: fill gaps with the median, then standardise.
num_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Numeric columns that get a log transform: median-impute, apply log1p, then
# standardise. feature_names_out="one-to-one" keeps the input column names.
log_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    FunctionTransformer(np.log1p, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored rather than raising.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

In [None]:
# Column lists derived from the training frame's dtypes.
# NOTE(review): pd.cut produces a categorical column, so `TenureGroup` would not
# be matched by the 'object' selection if train_X has been through the raw
# clean-up step -- confirm which form of train_X is used here.
cat_features = list(train_X.select_dtypes(include=['object']).columns)
num_features = list(train_X.select_dtypes(include=['int64', 'float64']).columns)

# One-hot encode the categorical columns and log-scale the two charge columns.
# Columns not listed in a transformer are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, cat_features),
        ("log", log_pipeline, ['MonthlyCharges', 'TotalCharges']),
    ]
)

In [None]:
# Alias the estimator selected earlier; `best_model` is not defined in the
# cells shown here.
# NOTE(review): presumably already fitted -- confirm before calling predict().
model = best_model  # Assume best_model is defined elsewhere

In [None]:
# End-to-end pipeline: raw clean-up -> column-wise encoding/scaling -> classifier.
# The final step uses `model` (assigned from `best_model` in the previous cell).
# The earlier reference, `xbgb_clf`, is not defined in any visible cell and
# would raise NameError on a fresh kernel.
# NOTE(review): if a differently named classifier was intended, bind it to
# `model` rather than referencing it here directly.
classification_pipeline = Pipeline([
    ("raw_fix", raw_preprocessor),
    ("preprocessor", preprocessor),
    ("classifier", model),
])

In [None]:
# Score the new customers; the predictions are shown as the cell output.
# NOTE(review): no fit() call on `classification_pipeline` (or on its
# `preprocessor` step) is visible before this point -- confirm the pipeline has
# been fitted, otherwise predict() cannot transform the input.
classification_pipeline.predict(new_data)