# Import necessary libraries

# In [1]:
import kfp
from kfp import dsl
from kfp.components import func_to_container_op
import pandas as pd
import requests
import boto3
import sqlite3
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from flask import Flask, request, jsonify
import pickle
from feast import FeatureStore

# [notebook cell output] ModuleNotFoundError: No module named 'kfp'

# In [None]:

# 1. Data Ingestion
@func_to_container_op
def fetch_transaction_data():
    """Load the transaction records from ``transactions.csv``.

    Returns:
        The DataFrame that ``pd.read_csv`` builds from that file.
    """
    transactions = pd.read_csv("transactions.csv")
    return transactions

@func_to_container_op
def fetch_customer_data():
    """Fetch customer records from the customers API.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    api_url = "https://api.example.com/customers"

    # Without an explicit timeout the call can block forever on a stalled
    # server, which would hang the whole pipeline step.
    response = requests.get(api_url, timeout=30)

    # Fail loudly on an error status instead of parsing an error page as data.
    response.raise_for_status()
    return response.json()

# 2. Raw Data Storage
@func_to_container_op
def upload_to_s3(transactions, customers):
    """Write the ingested data to local files and upload them to S3.

    Args:
        transactions: Object exposing ``to_csv`` (the code calls
            ``transactions.to_csv``); written to ``raw_transactions.csv``.
        customers: Customer payload. A ``str`` is written verbatim; anything
            else is serialized with ``json.dump`` so the file is valid JSON.

    The two files are uploaded to the ``customer-churn-data`` bucket under
    ``raw/transactions.csv`` and ``raw/customer_data.json``.
    """
    import json  # local import: only this step needs it

    s3 = boto3.client('s3')
    bucket_name = "customer-churn-data"

    # Save ingested data locally first
    transactions.to_csv("raw_transactions.csv", index=False)
    with open("raw_customer_data.json", "w", encoding="utf-8") as f:
        if isinstance(customers, str):
            # Already serialized text: keep it exactly as received.
            f.write(customers)
        else:
            # str(obj) would emit a Python repr (single quotes, True/None),
            # which JSON parsers reject; json.dump writes real JSON.
            json.dump(customers, f)

    # Upload files to S3
    s3.upload_file("raw_transactions.csv", bucket_name, "raw/transactions.csv")
    s3.upload_file("raw_customer_data.json", bucket_name, "raw/customer_data.json")

    print("Files uploaded successfully to S3!")

# 3. Data Validation
@func_to_container_op
def validate_data():
    """Report data-quality problems found in ``raw_transactions.csv``.

    Prints one line per column that contains missing values, one line per
    expected column that is absent, and one line per column whose dtype is
    not the expected kind (integer ``customer_id``, float
    ``transaction_amount``, text ``transaction_date``). The function only
    reports; it does not raise on validation findings.
    """
    df = pd.read_csv("raw_transactions.csv")

    # Check for missing values and report the affected columns
    missing_values = df.isnull().sum()
    for col, count in missing_values[missing_values > 0].items():
        print(f"Column {col} has {count} missing values")

    def _is_text(dtype):
        # Text columns load as object (or a pandas string dtype); neither
        # compares equal to the builtin ``str``.
        return (pd.api.types.is_string_dtype(dtype)
                or pd.api.types.is_object_dtype(dtype))

    # Validate column data types with dtype predicates rather than comparing
    # a dtype against a Python builtin type.
    type_checks = {
        'customer_id': pd.api.types.is_integer_dtype,
        'transaction_amount': pd.api.types.is_float_dtype,
        'transaction_date': _is_text,
    }
    for col, has_expected_type in type_checks.items():
        if col not in df.columns:
            print(f"Column {col} is missing")
            continue
        if not has_expected_type(df[col].dtype):
            print(f"Column {col} has incorrect data type")

    print("Data validation completed!")

# 4. Data Preparation
@func_to_container_op
def prepare_data():
    """Clean ``raw_transactions.csv`` and write per-customer spend totals.

    Reads ``raw_transactions.csv``, fills missing numeric values with the
    column mean, parses ``transaction_date`` and writes ``cleaned_data.csv``
    with the columns ``customer_id`` and ``total_spend`` (the sum of
    ``transaction_amount`` per customer).
    """
    df = pd.read_csv("raw_transactions.csv")

    # Handle missing values. numeric_only=True is required because
    # transaction_date is still text here and cannot be averaged.
    # NOTE(review): this also mean-fills a missing customer_id - confirm
    # that is intended.
    df = df.fillna(df.mean(numeric_only=True))

    # Convert date column to datetime
    df["transaction_date"] = pd.to_datetime(df["transaction_date"])

    # Feature Engineering: Total spend per customer.
    # The column is named total_spend because build_model reads
    # df[["total_spend"]] from cleaned_data.csv.
    customer_spend = (
        df.groupby("customer_id")["transaction_amount"]
        .sum()
        .reset_index()
        .rename(columns={"transaction_amount": "total_spend"})
    )
    customer_spend.to_csv("cleaned_data.csv", index=False)

    print("Data preparation completed!")

# 5. Data Transformation and Storage
@func_to_container_op
def transform_and_store_data():
    """Create the ``transformed_data`` table and store one row in it.

    Opens ``customer_churn.db``, creates the table if it does not exist and
    upserts the row for customer 101. The connection is closed even when a
    statement fails.
    """
    conn = sqlite3.connect("customer_churn.db")
    try:
        cursor = conn.cursor()

        cursor.execute("""
    CREATE TABLE IF NOT EXISTS transformed_data (
        customer_id INT PRIMARY KEY,
        total_spend FLOAT,
        last_transaction_date DATE
    );
    """)

        # Insert Transformed Data. OR REPLACE keeps the step re-runnable:
        # a plain INSERT fails with IntegrityError on the second run because
        # customer_id is the primary key.
        cursor.execute(
            "INSERT OR REPLACE INTO transformed_data "
            "(customer_id, total_spend, last_transaction_date) VALUES (?, ?, ?)",
            (101, 250.50, "2023-12-01"),
        )

        conn.commit()
    finally:
        # Release the database handle on both success and failure.
        conn.close()
    print("Data inserted successfully!")

# 6. Feature Store
@func_to_container_op
def manage_feature_store():
    """Read online features for customer 101 from the Feast repo and print them.

    Opens the feature store at ``feature_repo``, requests ``total_spend`` and
    ``last_transaction_date`` for ``customer_id`` 101 and prints the result
    as a dict.
    """
    store = FeatureStore(repo_path="feature_repo")

    # Retrieve features for model training.
    # Feast feature references are written "<feature_view>:<feature>"; a "."
    # separator is not a valid reference.
    # NOTE(review): assumes the feature view is named "customer" - confirm
    # against the definitions in feature_repo.
    customer_features = store.get_online_features(
        entity_rows=[{"customer_id": 101}],
        features=["customer:total_spend", "customer:last_transaction_date"]
    ).to_dict()

    print(customer_features)

# 7. Model Building
@func_to_container_op
def build_model():
    """Train a random-forest churn classifier and pickle it.

    Reads ``cleaned_data.csv``, uses ``total_spend`` as the only feature and
    ``churn_label`` as the target, holds out 20% of the rows for evaluation,
    prints the accuracy on that hold-out set and writes the fitted model to
    ``churn_model.pkl``.
    """
    data = pd.read_csv("cleaned_data.csv")

    # The input file is expected to carry a churn_label column.
    features = data[["total_spend"]]
    labels = data["churn_label"]

    splits = train_test_split(features, labels, test_size=0.2, random_state=42)
    train_features, test_features, train_labels, test_labels = splits

    classifier = RandomForestClassifier()
    classifier.fit(train_features, train_labels)

    predicted_labels = classifier.predict(test_features)
    accuracy = accuracy_score(test_labels, predicted_labels)
    print("Model Accuracy:", accuracy)

    # Persist the fitted classifier
    with open("churn_model.pkl", "wb") as model_file:
        pickle.dump(classifier, model_file)

# 8. Model Deployment
@func_to_container_op
def deploy_model():
    """Serve the pickled churn model behind a Flask ``/predict`` endpoint.

    Loads ``churn_model.pkl`` and registers ``POST /predict``, which expects
    a JSON object with a ``total_spend`` field and answers with
    ``{"churn_prediction": <int>}``. Requests without a usable body get a
    400 response. The server is started only when this code runs as
    ``__main__``.
    """
    app = Flask(__name__)

    # NOTE: unpickling executes arbitrary code; churn_model.pkl must come
    # from a trusted source (here: the build_model step).
    with open("churn_model.pkl", "rb") as model_file:
        model = pickle.load(model_file)

    @app.route("/predict", methods=["POST"])
    def predict():
        # silent=True yields None for a missing or malformed JSON body
        # instead of raising, so the bad request can be answered explicitly.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or "total_spend" not in data:
            return jsonify(
                {"error": "Request body must be JSON with a 'total_spend' field"}
            ), 400
        prediction = model.predict([[data["total_spend"]]])
        return jsonify({"churn_prediction": int(prediction[0])})

    if __name__ == "__main__":
        # Debug mode enables the interactive debugger, which allows
        # arbitrary code execution; it must stay off for a deployed service.
        app.run(debug=False)

# Define the Kubeflow Pipeline
@dsl.pipeline(
    name="Customer Churn Pipeline",
    description="An end-to-end pipeline for predicting customer churn."
)
def customer_churn_pipeline():
    """Wire the churn components into one sequential pipeline.

    The two ingestion steps feed their outputs into the S3 upload; every
    later step is ordered after its predecessor with ``.after``.
    """
    # Steps 1-2: ingest both sources, then upload their outputs to S3
    transactions = fetch_transaction_data()
    customers = fetch_customer_data()
    previous_step = upload_to_s3(transactions.output, customers.output)

    # Steps 3-8: validation, preparation, transformation/storage,
    # feature store, model building, model deployment - strictly in order
    remaining_steps = (
        validate_data,
        prepare_data,
        transform_and_store_data,
        manage_feature_store,
        build_model,
        deploy_model,
    )
    for make_step in remaining_steps:
        previous_step = make_step().after(previous_step)

# Compile and run the pipeline
if __name__ == "__main__":
    # Compile the pipeline definition into a YAML package when run as a script
    output_package = "customer_churn_pipeline.yaml"
    pipeline_compiler = kfp.compiler.Compiler()
    pipeline_compiler.compile(customer_churn_pipeline, output_package)
    print("Pipeline compiled successfully!")