In [12]:
import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Route every run started below to the 'Test' experiment; MLflow creates the
# experiment if it does not exist yet. The returned Experiment is the cell output.
mlflow.set_experiment(experiment_name='Test')

2024/08/16 10:10:07 INFO mlflow.tracking.fluent: Experiment with name 'Test' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/ashleyalexjacob/Data_Science/training/cicd_ml/mlruns/341001679218459739', creation_time=1723785007424, experiment_id='341001679218459739', last_update_time=1723785007424, lifecycle_stage='active', name='Test', tags={}>

In [4]:
# The CSV is read with ';' as the field separator, so it must be passed explicitly.
bank_csv_path = "data/bank.csv"
df = pd.read_csv(bank_csv_path, sep=";")

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [6]:
# Keep only the rows whose target column 'y' equals 'yes', then show the first one.
is_yes = df['y'].eq('yes')
yes = df[is_yes]
yes.head(n=1)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
13,20,student,single,secondary,no,502,no,no,cellular,30,apr,261,1,-1,0,unknown,yes


In [7]:
# Serialise the first 'yes' row to a JSON string (column name -> value).
first_yes_row = yes.iloc[0]
first_yes_row.to_json()

'{"age":20,"job":"student","marital":"single","education":"secondary","default":"no","balance":502,"housing":"no","loan":"no","contact":"cellular","day":30,"month":"apr","duration":261,"campaign":1,"pdays":-1,"previous":0,"poutcome":"unknown","y":"yes"}'

In [8]:
# Separate the target column 'y' from the remaining feature columns.
target_col = "y"
y = df[target_col]
X = df.drop(columns=[target_col])

In [9]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
object_dtype = ['object']
categorical_cols = X.select_dtypes(include=object_dtype).columns
numerical_cols = X.select_dtypes(exclude=object_dtype).columns

In [14]:
# Train the preprocessing + random-forest pipeline inside a single MLflow run
# so the evaluation metrics and the fitted pipeline are recorded together.
with mlflow.start_run():
    # Numerical columns pass through unchanged; categorical columns are one-hot
    # encoded, and categories not seen during fit are ignored at predict time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ]
    )
    model = RandomForestClassifier(random_state=42)
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Compute each metric once and reuse it for both printing and logging.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    precision = precision_score(y_test, y_pred, pos_label='yes')
    recall = recall_score(y_test, y_pred, pos_label='yes')

    # log_metric takes numeric values, so log floats rather than their string
    # representations.
    mlflow.log_metric("accuracy", float(accuracy))
    mlflow.log_metric("precision", float(precision))
    mlflow.log_metric("recall", float(recall))
    print(classification_report(y_test, y_pred))

    # Keep a local joblib copy as well; model_filename is read again by the
    # later cell that reloads the pipeline from disk.
    model_filename = 'model_v1.0.pkl'
    joblib.dump(pipeline, model_filename)
    print(f"Model saved to {model_filename}")

    # Store the fitted pipeline as an artifact of this run.
    mlflow.sklearn.log_model(pipeline, "pipeline_random_forest")

Accuracy: 0.8968312453942521
              precision    recall  f1-score   support

          no       0.91      0.98      0.94      1205
         yes       0.61      0.22      0.32       152

    accuracy                           0.90      1357
   macro avg       0.76      0.60      0.63      1357
weighted avg       0.88      0.90      0.87      1357

Model saved to model_v1.0.pkl




In [15]:
import mlflow.sklearn

# Run ID captured from an earlier execution of this notebook. It only resolves
# against the tracking store that produced it, so it is kept purely as a
# fallback for sessions in which no run has been logged yet.
PINNED_RUN_ID = "ec6f197bc6e44348a0653256bb622bcf"

# Prefer the most recent run of this Python process so that re-running the
# notebook loads the pipeline that was just trained instead of a stale one.
last_run = mlflow.last_active_run()
run_id = last_run.info.run_id if last_run is not None else PINNED_RUN_ID

# Load model
model = mlflow.sklearn.load_model(f"runs:/{run_id}/pipeline_random_forest")

# Use the loaded model
predictions = model.predict(X_test)

In [16]:
# Display the array of labels predicted for X_test by the model reloaded from MLflow.
predictions

array(['no', 'no', 'no', ..., 'no', 'no', 'no'], dtype=object)

In [15]:
import joblib

# Train the model (continuation from the previous code)
pipeline.fit(X_train, y_train)

# Save the model to a file. This step had been deleted from the cell, leaving
# only the comment and a stale "Model saved" output; model_filename is the
# name assigned in the training cell.
joblib.dump(pipeline, model_filename)
print(f"Model saved to {model_filename}")


Model saved to model_v1.0.pkl


In [16]:
# Restore the fitted pipeline that was written to disk with joblib.
loaded_model = joblib.load(model_filename)

# One hand-written record carrying the same feature columns the pipeline was
# fitted on (the target column 'y' is not part of the input).
single_input = {
    "age": 35, "job": "services", "marital": "single", "education": "secondary",
    "default": "no", "balance": 1500, "housing": "yes", "loan": "no",
    "contact": "cellular", "day": 15, "month": "jul", "duration": 180,
    "campaign": 2, "pdays": -1, "previous": 0, "poutcome": "unknown",
}

# The pipeline selects columns by name, so wrap the record in a one-row DataFrame.
single_input_df = pd.DataFrame(data=[single_input])

# predict returns an array with one label per input row.
single_prediction = loaded_model.predict(single_input_df)

# Report the label for the only row.
print(f"Prediction for the input data: {single_prediction[0]}")

Prediction for the input data: no
