# In [None]:
import json
import logging
import pickle
import sqlite3
from datetime import datetime

import boto3
import pandas as pd
import requests
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from feast import FeatureStore
from flask import Flask, request, jsonify
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Import necessary libraries

# In [None]:



# 1. Data Ingestion
def fetch_transaction_data(path="transactions.csv"):
    """Load the raw transaction records from a CSV file.

    Args:
        path: Location of the CSV file to read. Defaults to
            ``"transactions.csv"``, the file this pipeline has always read,
            so calls without arguments behave exactly as before.

    Returns:
        A ``pandas.DataFrame`` with the parsed contents of the file.
    """
    return pd.read_csv(path)

def fetch_customer_data(api_url="https://api.example.com/customers", timeout=10):
    """Fetch customer records from the customer HTTP API.

    Args:
        api_url: Endpoint to query. Defaults to the URL this pipeline has
            always used, so calls without arguments are unchanged.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled server.
    response = requests.get(api_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of treating an error payload as customer data.
    response.raise_for_status()
    return response.json()

# Run both ingestion sources once; the results are reused by the save step below.
transactions = fetch_transaction_data()
customers = fetch_customer_data()

# Save ingested data
transactions.to_csv("raw_transactions.csv", index=False)
with open("raw_customer_data.json", "w", encoding="utf-8") as f:
    # Serialize as real JSON. str() would write a Python repr (single quotes,
    # None/True/False) that JSON parsers reject despite the .json extension.
    json.dump(customers, f)

print("Data ingestion completed successfully!")

# 2. Raw Data Storage
bucket_name = "customer-churn-data"
s3 = boto3.client('s3')

# Push each locally saved raw file to its object key in the bucket,
# transactions first, then customer data.
for local_file, object_key in (
    ("raw_transactions.csv", "raw/transactions.csv"),
    ("raw_customer_data.json", "raw/customer_data.json"),
):
    s3.upload_file(local_file, bucket_name, object_key)

print("Files uploaded successfully to S3!")

# 3. Data Validation
df = pd.read_csv("raw_transactions.csv")

# Check for missing values and report every column that has any, so the
# count is actually surfaced instead of being computed and discarded.
missing_values = df.isnull().sum()
for col, n_missing in missing_values[missing_values > 0].items():
    print(f"Column {col} has {n_missing} missing value(s)")

# Validate column data types
expected_types = {'customer_id': int, 'transaction_amount': float, 'transaction_date': str}
# pandas stores text as object (or a dedicated string dtype), which never
# compares equal to the builtin ``str``; a direct ``dtype != str`` test would
# flag every text column. Use the pandas dtype predicates instead.
dtype_checks = {
    int: pd.api.types.is_integer_dtype,
    float: pd.api.types.is_float_dtype,
    str: pd.api.types.is_string_dtype,
}
for col, expected_type in expected_types.items():
    if col not in df.columns:
        # Report an absent column rather than aborting validation with a KeyError.
        print(f"Column {col} is missing")
    elif not dtype_checks[expected_type](df[col].dtype):
        print(f"Column {col} has incorrect data type")

print("Data validation completed!")

# 4. Data Preparation
df = pd.read_csv("raw_transactions.csv")

# Handle missing values: fill gaps in numeric columns with the column mean.
# numeric_only=True is needed because the frame still holds the text
# transaction_date column at this point, and recent pandas versions raise
# when asked to average a non-numeric column.
df = df.fillna(df.mean(numeric_only=True))

# Convert date column to datetime
df["transaction_date"] = pd.to_datetime(df["transaction_date"])

# Feature Engineering: Total spend per customer
# The aggregate is written out as "total_spend", the column name the
# model-building step reads from cleaned_data.csv.
customer_spend = (
    df.groupby("customer_id")["transaction_amount"]
    .sum()
    .reset_index()
    .rename(columns={"transaction_amount": "total_spend"})
)
customer_spend.to_csv("cleaned_data.csv", index=False)

print("Data preparation completed!")

# 5. Data Transformation and Storage
# SQL Schema for Storing Transformed Data
create_table_sql = """
CREATE TABLE IF NOT EXISTS transformed_data (
    customer_id INT PRIMARY KEY,
    total_spend FLOAT,
    last_transaction_date DATE
);
"""

# OR REPLACE makes the insert idempotent: the database file persists between
# runs, so a plain INSERT of the same primary key fails with an IntegrityError
# the second time this script is executed.
insert_sql = (
    "INSERT OR REPLACE INTO transformed_data "
    "(customer_id, total_spend, last_transaction_date) VALUES (?, ?, ?)"
)

conn = sqlite3.connect("customer_churn.db")
try:
    cursor = conn.cursor()
    cursor.execute(create_table_sql)

    # Insert Transformed Data (a single hard-coded sample row)
    cursor.execute(insert_sql, (101, 250.50, "2023-12-01"))

    conn.commit()
finally:
    # Always release the database file, even if a statement above fails.
    conn.close()
print("Data inserted successfully!")

# 6. Feature Store
# Using Feast for Feature Management
store = FeatureStore(repo_path="feature_repo")

# Retrieve the current feature values for one customer from the online store.
# Feast feature references are written "<feature_view>:<feature_name>"; the
# dotted form is not a valid reference.
# NOTE(review): assumes the feature view in feature_repo is named "customer" - confirm.
customer_features = store.get_online_features(
    entity_rows=[{"customer_id": 101}],
    features=["customer:total_spend", "customer:last_transaction_date"]
).to_dict()

print(customer_features)

# 7. Data Versioning
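# Using DVC for Data Versioning
# Run these commands as written in a Jupyter Notebook cell, or in a terminal
# without the leading "!". DVC must be initialized inside an existing Git repository.
# !dvc init
# !dvc add cleaned_data.csv
# !git add cleaned_data.csv.dvc .gitignore
# !git commit -m "Added versioned cleaned data"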

# 8. Model Building
df = pd.read_csv("cleaned_data.csv")

# Training needs both the feature and the label column. Check up front so a
# missing column is reported by name against the file it came from.
required_columns = ["total_spend", "churn_label"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"cleaned_data.csv is missing required column(s): {missing_columns}")

X = df[["total_spend"]]
y = df["churn_label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest the same way as the split so repeated runs on the same data
# produce the same model and the same reported accuracy.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))

# Save the model
with open("churn_model.pkl", "wb") as f:
    pickle.dump(model, f)

# 9. Pipeline Orchestration
# Automating with Apache Airflow
def ingest_data():
    """Task callable for the ingestion step: prints a single marker line."""
    status_line = "Ingesting Data..."
    print(status_line)

# Declare the daily pipeline DAG; operators created inside the with-block are
# attached to it automatically.
# NOTE(review): catchup is left at its default. With a 2023 start date this may
# backfill every missed daily run once the DAG is enabled - confirm that is intended.
with DAG(
    dag_id="customer_churn_pipeline",
    start_date=datetime(2023, 1, 1),
    schedule_interval="@daily",
) as dag:
    ingest_task = PythonOperator(
        task_id="ingest_data",
        python_callable=ingest_data,
    )

# Bare expression kept from the notebook cell (it displays the task there).
ingest_task

# 10. Model Deployment
# Deploying Model using Flask API
app = Flask(__name__)
# Load the trained model once at start-up. The with-block closes the file
# handle promptly instead of leaving it open until garbage collection.
# NOTE: unpickling can execute arbitrary code - only load model files produced
# by this pipeline, never files from an untrusted source.
with open("churn_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

@app.route("/predict", methods=["POST"])
def predict():
    """Return the churn prediction for one customer.

    Expects a JSON object body with a numeric ``total_spend`` field.

    Returns:
        ``{"churn_prediction": <int>}`` on success, or a JSON error message
        with HTTP status 400 when the body is missing, is not a JSON object,
        or does not carry a usable ``total_spend`` value.
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or "total_spend" not in data:
        return jsonify({"error": "JSON body with a 'total_spend' field is required"}), 400

    try:
        total_spend = float(data["total_spend"])
    except (TypeError, ValueError):
        return jsonify({"error": "'total_spend' must be a number"}), 400

    # The model was trained on a single feature column, hence the 1x1 input.
    prediction = model.predict([[total_spend]])
    return jsonify({"churn_prediction": int(prediction[0])})

if __name__ == "__main__":
    # Keep debug mode off when serving the model: Flask's debug mode enables
    # an interactive debugger that lets anyone who can reach the server run
    # arbitrary code on it.
    app.run(debug=False)