## Data preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import IsolationForest

# Step 1 - imputation: fill every missing entry with the mean of its column.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit(X).transform(X)

# Step 2 - outlier detection on the imputed matrix. `contamination=0.1`
# tells the forest to expect roughly 10% of the rows to be anomalous.
# NOTE(review): `outliers` is not used again in the code shown here, so no
# rows are actually dropped - confirm whether filtering was intended.
iso_forest = IsolationForest(contamination=0.1)
iso_forest.fit(X_imputed)
outliers = iso_forest.predict(X_imputed)

# Step 3 - standardisation: rescale each feature to zero mean, unit variance.
scaler = StandardScaler()
scaler.fit(X_imputed)
X_scaled = scaler.transform(X_imputed)


## Feature engineering

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled features with pairwise interaction terms so that a
# downstream model can pick up non-linear relationships between features.
# `interaction_only=True` restricts the expansion to products of distinct
# features (no squared terms).
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
)
X_poly = poly.fit(X_scaled).transform(X_scaled)

## Model development and optimization

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Baseline: a logistic regression with library-default settings, fitted on
# the training split. `fit` returns the estimator itself, so the fitted
# model is bound directly to `log_reg`.
log_reg = LogisticRegression().fit(X_train, y_train)

# Candidate models intended to improve on the baseline. Each one is only
# instantiated here, with library-default hyperparameters.

# Decision Tree
dt_clf = DecisionTreeClassifier()

# Random Forest
rf_clf = RandomForestClassifier()

# Gradient Boosting
gb_clf = GradientBoostingClassifier()

# Support Vector Machine
svm_clf = SVC()

# Neural Network (multi-layer perceptron)
mlp_clf = MLPClassifier()

# Exhaustive hyperparameter search for the Random Forest.
# The grid has 3 x 4 = 12 candidate settings, each scored with 5-fold
# cross-validation on the training split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
}
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# 10-fold stratified splitter meant for robust model evaluation.
# NOTE(review): `skf` is not passed to any estimator or scorer in the code
# shown here - confirm where it is supposed to be used.
skf = StratifiedKFold(n_splits=10)


# Evaluating the tuned model on the test split.
# `grid_search` forwards predict / predict_proba to the best estimator found
# during the search.

# Hard class predictions drive the threshold-based metrics.
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC AUC measures how well the model *ranks* positives above negatives, so
# it needs continuous scores. Feeding it the 0/1 labels in `y_pred` would
# collapse the ROC curve to a single operating point and misreport the AUC.
# Column 1 of predict_proba is the probability of the positive class (binary
# targets are assumed, as the default precision/recall/F1 calls above
# already require).
y_score = grid_search.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_score)

## Implementation and deployment

In [None]:
from azureml.core import Workspace, Model
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig

# Connect to the Duke ADF_ML workspace using the locally stored workspace
# configuration.
ws = Workspace.from_config()

# Register the serialized pipeline with the workspace.
# NOTE(review): 'credit_risk_pipeline.pkl' is not written anywhere in the
# code shown here - confirm the file exists before running this cell.
model = Model.register(
    workspace=ws,
    model_path='credit_risk_pipeline.pkl',  # local path to the model file
    model_name='credit_risk_pipeline',      # name recorded in the registry
)

# Inference configuration: which script scores requests and which
# environment supplies its dependencies.
# NOTE(review): `myenv` is not defined in the code shown here - it must be
# created before this cell runs.
inference_config = InferenceConfig(
    entry_script='score.py',  # script that contains the scoring logic
    environment=myenv,        # environment containing the dependencies
)

# Deployment configuration: an Azure Container Instance sized at 1 CPU core
# and 1 GB of memory.
aci_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
)

# Deploy the registered model as the 'credit-risk-service' web service,
# combining the inference and ACI configurations defined earlier.
service = Model.deploy(
    workspace=ws,
    name='credit-risk-service',
    models=[model],
    inference_config=inference_config,
    deployment_config=aci_config,
)

# Block until the deployment finishes, streaming progress output meanwhile.
service.wait_for_deployment(show_output=True)