# Churn Prediction

In [1]:
import kfp
from kfp import dsl
from kfp.dsl import InputPath, OutputPath, pipeline, component


# Define the data download component
@component(
    base_image="python:3.11",
    packages_to_install=["pandas==2.2.3", "requests==2.32.3", "minio==7.2.15"]
)
def download_operation(url: str, data_path: OutputPath('Dataset')) -> None:
    """Fetch the churn dataset and write it as CSV to ``data_path``.

    Args:
        url: HTTP(S) location of a semicolon-separated CSV file. When empty,
            the dataset is pulled from the hard-coded MinIO bucket instead.
        data_path: Output artifact path that receives the dataset as CSV
            (comma-separated, without the index column).

    Raises:
        requests.HTTPError: If the HTTP download returns an error status.
        requests.Timeout: If the HTTP server does not respond in time.
        ValueError: If the loaded dataset has no rows or no columns.
    """
    # Imports stay inside the function body, as in every component of this
    # file, so the component does not rely on module-level state.
    from io import StringIO

    import pandas as pd

    if url:
        import requests

        # Download the CSV over HTTP. Without a timeout a stalled server
        # would block the pipeline step forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        # NOTE(review): this branch parses ';'-separated data while the MinIO
        # branch below uses the default ',' separator -- confirm both sources
        # really differ in format.
        df = pd.read_csv(StringIO(response.text), header=0, sep=";")
    else:
        from minio import Minio

        # NOTE(review): endpoint and credentials are hard-coded (and use the
        # MinIO defaults); consider injecting them via pipeline parameters or
        # a Kubernetes secret.
        minio_client = Minio(
            endpoint="192.168.203.181:30900",
            access_key="minioadmin",
            secret_key="minioadmin",
            secure=False
        )
        # Pull the object from MinIO and save it locally.
        minio_client.fget_object(
            bucket_name="datasets",
            object_name="customer_churn_dataset-testing-copy.csv",
            file_path="/tmp/dataset.csv"
        )
        df = pd.read_csv("/tmp/dataset.csv")

    if df.empty:
        # Previously an empty frame silently skipped writing the artifact,
        # which only surfaced later as a missing-file error downstream.
        raise ValueError("Downloaded dataset is empty; nothing to write to data_path")

    print("Data Description => ",df.describe())
    df.to_csv(data_path, index=False)

# Define the data ingestion component
@component(
    base_image="python:3.11",
    packages_to_install=["pandas==2.2.3"]
)
def ingestion_operation(input_csv: InputPath('Dataset'), output_csv: OutputPath('Dataset')) -> None:
    """Load the downloaded CSV and re-emit it as the ingested dataset.

    Args:
        input_csv: Path of the CSV artifact to read.
        output_csv: Output artifact path that receives the data as CSV
            (without the index column).
    """
    import pandas as pd

    df = pd.read_csv(input_csv)
    # The summary used to be computed and thrown away; print it so it
    # actually shows up in the step logs.
    print("Data Description => ", df.describe())
    df.to_csv(output_csv, index=False)

# Define the data processing component
@component(
    base_image="python:3.11",
    packages_to_install=["pandas==2.2.3", "scikit-learn==1.6.1"]
)
def processing_operation(input_csv: InputPath('Dataset'), processed_X: OutputPath('Dataset'), processed_y: OutputPath('Dataset')) -> None:
    """Encode categorical columns, impute gaps and select features for training.

    The 'Gender', 'Subscription Type' and 'Contract Length' columns are mapped
    to integer codes, missing values are filled with each column's mode, and
    every column whose absolute correlation with 'Churn' exceeds 0.03 is kept
    as a feature.

    Args:
        input_csv: Path of the ingested CSV artifact.
        processed_X: Output artifact path for the selected feature columns.
        processed_y: Output artifact path for the 'Churn' target column.

    Raises:
        KeyError: If an expected column is missing from the input.
        ValueError: If no column passes the correlation threshold.
    """
    import pandas as pd

    df = pd.read_csv(input_csv)
    print("__________Data Information__________")
    # DataFrame.info() prints by itself and returns None, so it is not
    # wrapped in print() (that would emit a stray "None" line).
    df.info()
    print("__________Contract Length__________")
    print(df["Contract Length"].value_counts())

    # Integer encodings for the categorical columns.
    # NOTE(review): values that are not keys of these maps become NaN and are
    # then replaced by the column mode below -- confirm the categories match
    # the dataset.
    gender_map = {'Male': 0, 'Female': 1}
    subscription_map = {'Basic': 0, 'Premium': 1, 'Pro': 2}
    contract_length_map = {'Annual': 0, 'Quarterly': 1, 'Monthly': 2}

    df['Gender'] = df['Gender'].map(gender_map)
    df['Subscription Type'] = df['Subscription Type'].map(subscription_map)
    df['Contract Length'] = df['Contract Length'].map(contract_length_map)

    # Fill NaN values with the mode of each column. Assigning the result back
    # avoids the chained in-place fillna, which warns on pandas 2.2 and stops
    # modifying the frame once Copy-on-Write is the default.
    for column in df.columns:
        modes = df[column].mode()
        if modes.empty:
            # An all-NaN column has no mode; leave it untouched instead of
            # failing on an empty lookup.
            print(f"Column '{column}' has no non-null values; skipping mode fill")
            continue
        df[column] = df[column].fillna(modes.iloc[0])

    threshold = 0.03
    # Correlation of every column with the target, target itself excluded.
    churn_corr = df.corr()["Churn"].drop("Churn")
    high_corr_features = churn_corr.index[churn_corr.abs() > threshold].tolist()
    print("__________High Correlated Features__________")
    print(high_corr_features)

    if not high_corr_features:
        # An empty feature set would only fail later, in the training step.
        raise ValueError(
            f"No feature has an absolute correlation with 'Churn' above {threshold}"
        )

    X_selected = df[high_corr_features]
    y_selected = df["Churn"]

    print("__________X-Data Information__________")
    X_selected.info()
    print("__________Y-Data Information__________")
    y_selected.info()

    # NOTE: features are currently chosen by correlation only; the
    # SelectKBest / StandardScaler steps sketched earlier are not applied.

    # Save the processed data to files
    X_selected.to_csv(processed_X, index=False)
    y_selected.to_csv(processed_y, index=False)
   

# Define the model training component
@component(
    base_image="python:3.11",
    packages_to_install=["pandas==2.2.3", "scikit-learn==1.6.1", "joblib==1.4.2"]
)
def training_operation(processed_X: InputPath('Dataset'), processed_y: InputPath('Dataset'), 
                      knn_model: OutputPath('Dataset'), lg_model: OutputPath('Dataset'), svm_model: OutputPath('Dataset')) -> None:
    """Train KNN, logistic-regression and SVM classifiers and persist each one.

    The data is split 80/20 (shuffled, ``random_state=55``); every model is
    fitted on the training part, evaluated on the held-out part and dumped
    with joblib.

    NOTE(review): the three model outputs are declared as 'Dataset' artifacts;
    a 'Model' artifact type may describe them better, but changing it would
    alter the component interface.

    Args:
        processed_X: Path of the CSV artifact holding the feature columns.
        processed_y: Path of the CSV artifact holding the single target column.
        knn_model: Output path for the fitted KNeighborsClassifier.
        lg_model: Output path for the fitted LogisticRegression.
        svm_model: Output path for the fitted SVC.
    """
    import joblib
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC

    X_processed = pd.read_csv(processed_X)
    # The target file holds one column; take it as a 1-D Series so the
    # estimators do not receive a column vector (sklearn warns about that).
    y_processed = pd.read_csv(processed_y).iloc[:, 0]

    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y_processed, test_size=0.2, shuffle=True, random_state=55
    )

    # info() prints by itself and returns None, so it is not wrapped in print().
    print("__________X-Training Data Information__________")
    X_train.info()
    print("__________Y-Training Data Information__________")
    y_train.to_frame().info()

    # (log label, estimator, output path) -- trained and saved in this order.
    candidates = [
        ("KNN", KNeighborsClassifier(n_neighbors=3), knn_model),
        ("LR", LogisticRegression(max_iter=1000, random_state=42), lg_model),
        ("SVM", SVC(), svm_model),
    ]

    for label, estimator, model_path in candidates:
        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)

        # Report every model the same way so the logs are comparable.
        accuracy = accuracy_score(y_test, y_pred)
        print(f"__________{label} Accuracy Score__________")
        print(f'Accuracy: {accuracy * 100:.2f}%')
        print(f"__________{label} Classification Report__________")
        print(classification_report(y_test, y_pred))

        # Save the trained model
        joblib.dump(estimator, model_path)
    

# Define the pipeline
@pipeline(
    name='Customer Churn Prediction Pipeline',
    description='A pipeline to perform customer churn prediction.'
)
def churn_prediction_pipeline(url: str):
    """Wire download -> ingestion -> processing -> training into one pipeline.

    The print calls run while the pipeline graph is being built, so they show
    the output channel placeholders of each task rather than runtime data.

    Args:
        url: Forwarded to the download step as its ``url`` argument.
    """
    # Step 1: fetch the raw dataset.
    download_task = download_operation(url=url)
    raw_dataset = download_task.outputs['data_path']
    print("Step1 => Downloaded: ", raw_dataset)

    # Step 2: ingest the downloaded CSV.
    ingest_task = ingestion_operation(input_csv=raw_dataset)
    ingested_dataset = ingest_task.outputs['output_csv']
    print("Step2 => Ingested: ", ingested_dataset)

    # Step 3: encode, impute and select features.
    process_task = processing_operation(input_csv=ingested_dataset)
    features = process_task.outputs['processed_X']
    target = process_task.outputs['processed_y']
    print("Step3 => Processed X: ", features, "\n\t Processed y: ", target)

    # Step 4: train and persist the three classifiers.
    train_task = training_operation(processed_X=features, processed_y=target)
    knn_artifact = train_task.outputs['knn_model']
    lg_artifact = train_task.outputs['lg_model']
    svm_artifact = train_task.outputs['svm_model']
    print(
        "Step4 => Trained Model KNN: ", knn_artifact,
        "\n\t Trained Model LG: ", lg_artifact,
        "\n\t Trained Model SVM: ", svm_artifact,
    )


# Compile and run the pipeline
if __name__ == '__main__':
    # Write the compiled pipeline definition to a YAML package.
    pipeline_compiler = kfp.compiler.Compiler()
    pipeline_compiler.compile(churn_prediction_pipeline, 'churn_prediction_pipeline.yaml')

    # Connect to Kubeflow Pipelines and submit a run with caching disabled.
    client = kfp.Client()
    # An empty url sends the download step down its MinIO branch.
    url = ''
    client.create_run_from_pipeline_func(
        churn_prediction_pipeline,
        arguments={'url': url},
        enable_caching=False,
    )

Step1 => Downloaded:  {{channel:task=download-operation;name=data_path;type=system.Dataset@0.0.1;}}
Step2 => Ingested:  {{channel:task=ingestion-operation;name=output_csv;type=system.Dataset@0.0.1;}}
Step3 => Processed X:  {{channel:task=processing-operation;name=processed_X;type=system.Dataset@0.0.1;}} 
	 Processed y:  {{channel:task=processing-operation;name=processed_y;type=system.Dataset@0.0.1;}}
Step4 => Trained Model KNN:  {{channel:task=training-operation;name=knn_model;type=system.Dataset@0.0.1;}} 
	 Trained Model LG:  {{channel:task=training-operation;name=lg_model;type=system.Dataset@0.0.1;}} 
	 Trained Model SVM:  {{channel:task=training-operation;name=svm_model;type=system.Dataset@0.0.1;}}


