In [None]:
from azureml.core import Workspace, Dataset
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import joblib

# Resolve the Azure ML workspace from its configuration and take the
# datastore the workspace reports as its default.
ws = Workspace.from_config()
datastore = ws.get_default_datastore()

# Point a tabular dataset at the customer-features CSV on that datastore,
# then materialize it as a pandas DataFrame for local training.
data_ref = [(datastore, 'clv-data/customer_features.csv')]
dataset = Dataset.Tabular.from_delimited_files(path=data_ref)
customer_data = dataset.to_pandas_dataframe()

# Model inputs are the Recency / Frequency / Monetary columns; the target
# to predict is the CLV column.
feature_columns = ['Recency', 'Frequency', 'Monetary']
X = customer_data[feature_columns]
y = customer_data['CLV']

# Hold out 20% of the rows for evaluation. The fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with 50 trees, each capped at depth 5, on the
# training split. The seed is fixed so repeated runs build the same forest.
forest_params = {'n_estimators': 50, 'max_depth': 5, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

# Score the fitted model on both splits (train first, then test) and
# report the two values side by side so over-fitting is easy to spot.
train_score, test_score = (
    model.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
print(f"Train R²: {train_score:.4f}, Test R²: {test_score:.4f}")

# Serialize the fitted model to a local file, then upload that file to the
# datastore under 'clv-models/', replacing any existing copy.
model_file = 'clv_model.pkl'
joblib.dump(model, model_file)
datastore.upload_files(
    files=[model_file],
    target_path='clv-models/',
    overwrite=True,
)
print("Model training complete!")