# Basic Usage of ML Project Starter

This notebook demonstrates the basic usage of our modular ML project structure using a simple classification example with the Iris dataset.

## 1. Setup and Imports

In [None]:
# Import required libraries
import sys
sys.path.append('..')
from src.preprocess import DataPreprocessor
from src.model import MLModel
from src.evaluate import ModelEvaluator
from utils.helpers import validate_dataframe, safe_file_path, remove_outliers

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

## 2. Load and Prepare Data

In [None]:
# Load Iris dataset
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data['target'] = iris.target

# Validate the dataset
required_cols = iris.feature_names + ['target']
numeric_cols = iris.feature_names
is_valid, message = validate_dataframe(data, required_cols, numeric_cols)
print('Data validation:', message)

print('Dataset shape:', data.shape)
print('\nFeature names:', iris.feature_names)
print('\nFirst few rows:')
print(data.head())

Data validation: Validation successful
Dataset shape: (150, 5)

Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

First few rows:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


## 3. Preprocess Data

In [3]:
# Check for and remove outliers
data_clean = remove_outliers(data, iris.feature_names, n_std=3.0)
print('Original data shape:', data.shape)
print('Data shape after outlier removal:', data_clean.shape)

# Split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Initialize preprocessor
preprocessor = DataPreprocessor()

# Preprocess training data
X_train, y_train = preprocessor.preprocess_data(train_data, target_column='target')

# Preprocess test data
X_test, y_test = preprocessor.preprocess_data(test_data, target_column='target')

print('Training set shape:', X_train.shape)
print('Test set shape:', X_test.shape)

Original data shape: (150, 5)
Data shape after outlier removal: (149, 5)
Training set shape: (120, 4)
Test set shape: (30, 4)


  df_clean = df_clean[z_scores < n_std]
  df_clean = df_clean[z_scores < n_std]


## 4. Train Model

In [4]:
# Initialize and train model
model = MLModel(model_params={
    'n_estimators': 100,
    'max_depth': 5,
    'random_state': 42
})

model.train(X_train, y_train)

## 5. Make Predictions and Evaluate

In [5]:
# Make predictions
predictions = model.predict(X_test)

# Evaluate model
evaluator = ModelEvaluator()
results = evaluator.evaluate_model(y_test, predictions)

print('Model Accuracy:', results['accuracy'])
print('\nClassification Report:')
print(results['classification_report'])

Model Accuracy: 0.9666666666666667

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.90      1.00      0.95         9
           2       1.00      0.91      0.95        11

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



## 5. Save and Load Model

Demonstrate model persistence.

In [6]:
# Create a safe path for model saving\
model_path = safe_file_path('../data/models/iris_model.joblib')

# Save the model
model.save_model(model_path)
print(f'Model saved to: {model_path}')

# Load the model
new_model = MLModel()
new_model.load_model('../data/iris_model.joblib')

# Verify the loaded model works
new_predictions = new_model.predict(X_test)
print("Loaded Model Accuracy:", evaluator.evaluate_model(y_test, new_predictions)['accuracy'])

Model saved to: ../data/models/iris_model.joblib
Loaded Model Accuracy: 0.9666666666666667
