# Churn Prediction

In [1]:
import kfp
from kfp import dsl
from kfp.dsl import InputPath, OutputPath, pipeline, component

# DOWNLOAD DATA FROM APIs, SQL and CSV (via Minio)
@component(
    base_image="python:3.11",
    packages_to_install=["pandas==2.2.3", "requests==2.32.3", "minio==7.2.15", "sqlalchemy==2.0.38", "pymysql==1.1.1"]
)
def data_ingestion(input_csv: str, api_endpoint: str, sql_details: dict, output_csv: OutputPath('Dataset')) -> None:
    """Collect data from up to three sources and write the combined result as CSV.

    Each source is optional and is skipped when its argument is falsy. A source
    that fails (network error, HTTP error status, query error, download error)
    is reported on stdout and skipped, so the remaining sources can still
    contribute. When more than one source yields rows, the frames are
    inner-joined on a column named 'key'.

    Args:
        input_csv: Object name of a CSV file in the MinIO bucket "datasets".
        api_endpoint: URL queried with a GET request; the JSON body is loaded
            into a DataFrame.
        sql_details: Mapping with the keys 'connection_string' (SQLAlchemy URL)
            and 'query' (SQL text passed to pandas.read_sql).
        output_csv: Path the combined CSV is written to.

    Raises:
        RuntimeError: If no source produced any rows, so that the run fails
            here instead of in a downstream step that tries to read a file
            that was never written.
    """
    import requests
    import pandas as pd
    from sqlalchemy import create_engine
    from minio import Minio

    def _combine(accumulated, incoming):
        # The first non-empty source becomes the base; later ones are joined on 'key'.
        if accumulated.empty:
            return incoming
        return pd.merge(accumulated, incoming, on='key', how='inner')

    # Starting with an empty DataFrame
    df = pd.DataFrame()

    # From API
    if api_endpoint:
        payload = None
        try:
            # A timeout keeps the step from hanging forever on an unresponsive host.
            response = requests.get(api_endpoint, timeout=30)
            # Turn 4xx/5xx answers into exceptions so they are reported below
            # instead of being dropped without a trace.
            response.raise_for_status()
            payload = response.json()
        except requests.RequestException as e:
            print(f"An error occurred while sending GET request to {api_endpoint}: {e}")

        # If there's a usable response, then load it into a DataFrame
        if payload is not None:
            df_api = pd.DataFrame(payload)
            if not df_api.empty:
                print("Data from API => ", df_api.describe())
                df = _combine(df, df_api)

    # From SQL
    if sql_details:
        df_db = pd.DataFrame()
        # Create a SQL engine for connection
        engine = create_engine(sql_details['connection_string'],
                               pool_size=10,
                               max_overflow=20,
                               pool_timeout=30,
                               pool_recycle=1800,
                               pool_pre_ping=True)

        # Run query to capture the data and handle Errors
        try:
            df_db = pd.read_sql(sql_details['query'], engine)
        except Exception as e:
            print(f"An error occurred while querying the Database: {e}")
        finally:
            # Release pooled connections; the engine is not used again.
            engine.dispose()

        if not df_db.empty:
            print("Data from Database => ", df_db.describe())
            df = _combine(df, df_db)

    # From CSV file (via Minio)
    if input_csv:
        downloaded_file = ''

        # Create a minio client connection
        # NOTE(review): endpoint and credentials are hard-coded; consider
        # supplying them through component parameters or a mounted secret.
        minio_client = Minio(endpoint="192.168.203.181:30900",
                             access_key="minioadmin",
                             secret_key="minioadmin",
                             secure=False)

        # Download the Object and handle Errors
        try:
            minio_client.fget_object(bucket_name="datasets",
                                     object_name=input_csv,
                                     file_path='/tmp/dataset.csv')
            downloaded_file = '/tmp/dataset.csv'
        except Exception as e:
            print(f"An error occurred while downloading the file {input_csv}: {e}")

        # If the CSV file is downloaded then load it into a DataFrame
        if downloaded_file:
            df_minio = pd.read_csv(downloaded_file)
            if not df_minio.empty:
                print("Data from Minio => ", df_minio.describe())
                df = _combine(df, df_minio)

    if df.empty:
        raise RuntimeError("No data was ingested from any of the configured sources.")

    df = df.reset_index(drop=True)
    df.to_csv(output_csv, index=False)
    print("Downloaded Data => ", df.describe())
        

# Define the data processing component
@component(
    base_image="python:3.11",
    packages_to_install=["pandas==2.2.3", "scikit-learn==1.6.1"]
)
def data_processing(input_csv: InputPath('Dataset'), processed_X: OutputPath('Dataset'), processed_y: OutputPath('Dataset')) -> None:
    """Encode categorical columns, impute missing values and select features.

    The 'Gender', 'Subscription Type' and 'Contract Length' columns are mapped
    to integer codes, every column's missing values are filled with that
    column's mode, and the features whose absolute correlation with 'Churn'
    exceeds a fixed threshold are kept.

    Args:
        input_csv: Path of the ingested CSV. It must contain the columns
            'Gender', 'Subscription Type', 'Contract Length' and 'Churn'.
        processed_X: Path the selected feature columns are written to.
        processed_y: Path the 'Churn' column is written to.
    """
    import pandas as pd

    df = pd.read_csv(input_csv)
    print("__________Data Information__________")
    # DataFrame.info() prints its report itself and returns None.
    df.info()
    print("__________Contract Length__________")
    print(df["Contract Length"].value_counts())

    # Encode the categorical columns as integers. Values missing from a map
    # become NaN and are imputed together with the other gaps below.
    gender_map = {'Male': 0, 'Female': 1}
    subscription_map = {'Basic': 0, 'Premium': 1, 'Pro': 2}
    contract_length_map = {'Annual': 0, 'Quarterly': 1, 'Monthly': 2}

    df['Gender'] = df['Gender'].map(gender_map)
    df['Subscription Type'] = df['Subscription Type'].map(subscription_map)
    df['Contract Length'] = df['Contract Length'].map(contract_length_map)

    # Fill NaN values with the mode for each column. The result is assigned
    # back because in-place fillna on a column selection is deprecated in
    # pandas 2.2, and an all-NaN column has no mode to fill with.
    for column in df.columns:
        modes = df[column].mode()
        if not modes.empty:
            df[column] = df[column].fillna(modes.iloc[0])

    threshold = 0.03
    # numeric_only=True keeps a leftover text column from aborting the computation.
    churn_correlation = df.corr(numeric_only=True)["Churn"].abs()
    high_corr_features = [
        name for name in churn_correlation.index[churn_correlation > threshold]
        if name != "Churn"
    ]
    print("__________High Correlated Features__________")
    print(high_corr_features)

    X_selected = df[high_corr_features]
    y_selected = df["Churn"]

    print("__________X-Data Information__________")
    X_selected.info()
    print("__________Y-Data Information__________")
    y_selected.info()

    # Save the processed data to files
    X_selected.to_csv(processed_X, index=False)
    y_selected.to_csv(processed_y, index=False)
   

# Define the model training component
@component(
    base_image="python:3.11",
    packages_to_install=["pandas==2.2.3", "scikit-learn==1.6.1", "joblib==1.4.2"]
)
def model_training(processed_X: InputPath('Dataset'), processed_y: InputPath('Dataset'), 
                      knn_model: OutputPath('Dataset'), lg_model: OutputPath('Dataset'), svm_model: OutputPath('Dataset')) -> None:
    """Train KNN, logistic-regression and SVM classifiers and persist each one.

    The data is split 80/20 with a fixed random seed; every model is fitted on
    the training part, scored on the held-out part, and dumped with joblib.

    Args:
        processed_X: Path of the CSV holding the feature columns.
        processed_y: Path of the CSV holding the target in its first column.
        knn_model: Path the fitted KNeighborsClassifier is written to.
        lg_model: Path the fitted LogisticRegression is written to.
        svm_model: Path the fitted SVC is written to.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score, classification_report
    import joblib

    X_processed = pd.read_csv(processed_X)
    # Take the target as a 1-D Series: passing a one-column DataFrame as y
    # makes scikit-learn emit a DataConversionWarning on every fit.
    y_processed = pd.read_csv(processed_y).iloc[:, 0]

    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y_processed, test_size=0.2, shuffle=True, random_state=55
    )

    print("__________X-Training Data Information__________")
    X_train.info()
    print("__________Y-Training Data Information__________")
    y_train.info()

    # KNN Model
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_knn)
    print("__________KNN Accuracy Score__________")
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print("__________KNN Classification Report__________")
    print(classification_report(y_test, y_pred_knn))
    # Save the trained model
    joblib.dump(knn, knn_model)

    # Logistic Regression
    lg = LogisticRegression(max_iter=1000, random_state=42)
    lg.fit(X_train, y_train)
    y_pred_lg = lg.predict(X_test)
    print("__________LR Accuracy Score__________")
    print(accuracy_score(y_test, y_pred_lg))
    # Save the trained model
    joblib.dump(lg, lg_model)

    # SVM
    svm = SVC()
    svm.fit(X_train, y_train)
    y_pred_svm = svm.predict(X_test)
    print("__________SVM Accuracy Score__________")
    print(accuracy_score(y_test, y_pred_svm))
    # Save the trained model
    joblib.dump(svm, svm_model)
    

# Define the pipeline
@pipeline(
    name='Customer Churn Prediction Pipeline',
    description='A pipeline to perform customer churn prediction.'
)
def churn_prediction_pipeline(input_csv: str, api_endpoint: str, sql_details: dict):
    # Wires the three components into a linear graph:
    # ingestion -> processing -> training.
    # This body runs when the pipeline is compiled, so the print calls show
    # the output channel placeholders rather than actual data.

    # Step 1: gather the raw dataset
    ingestion_task = data_ingestion(
        input_csv=input_csv,
        api_endpoint=api_endpoint,
        sql_details=sql_details,
    )
    raw_dataset = ingestion_task.outputs['output_csv']
    print("Step1 => Ingested: ", raw_dataset)

    # Step 2: derive the feature and target datasets
    processing_task = data_processing(input_csv=raw_dataset)
    features = processing_task.outputs['processed_X']
    target = processing_task.outputs['processed_y']
    print("Step2 => Processed X: ", features, "\n\t Processed y: ", target)

    # Step 3: fit the three classifiers
    training_task = model_training(processed_X=features, processed_y=target)
    trained = training_task.outputs
    print(
        "Step3 => Trained Model KNN: ", trained['knn_model'],
        "\n\t Trained Model LG: ", trained['lg_model'],
        "\n\t Trained Model SVM: ", trained['svm_model'],
    )


# Compile and run the pipeline
if __name__ == '__main__':
    import os

    # Compile the pipeline into a package
    kfp.compiler.Compiler().compile(churn_prediction_pipeline, 'churn_prediction_pipeline.yaml')

    # The database URL embeds a password, so it is read from the environment
    # instead of being committed with the source.
    connection_string = os.environ.get('CHURN_DB_CONNECTION_STRING')
    if not connection_string:
        raise RuntimeError(
            "Set the CHURN_DB_CONNECTION_STRING environment variable to the "
            "SQLAlchemy URL of the source database before running the pipeline."
        )

    # Connect to Kubeflow Pipelines and execute the pipeline
    client = kfp.Client()
    api_endpoint = ''  # empty string: the API source is skipped
    input_csv = 'customer_churn_dataset-testing-copy.csv'
    sql_params = {
        'connection_string': connection_string,
        'query': 'SELECT * FROM accounts LIMIT 100',
    }
    client.create_run_from_pipeline_func(
        churn_prediction_pipeline,
        arguments={'input_csv': input_csv, 'api_endpoint': api_endpoint, 'sql_details': sql_params},
        enable_caching=False,
    )

Step1 => Ingested:  {{channel:task=data-ingestion;name=output_csv;type=system.Dataset@0.0.1;}}
Step2 => Processed X:  {{channel:task=data-processing;name=processed_X;type=system.Dataset@0.0.1;}} 
	 Processed y:  {{channel:task=data-processing;name=processed_y;type=system.Dataset@0.0.1;}}
Step3 => Trained Model KNN:  {{channel:task=model-training;name=knn_model;type=system.Dataset@0.0.1;}} 
	 Trained Model LG:  {{channel:task=model-training;name=lg_model;type=system.Dataset@0.0.1;}} 
	 Trained Model SVM:  {{channel:task=model-training;name=svm_model;type=system.Dataset@0.0.1;}}


