In [2]:
# 1.	Version Naming Convention:
# o	Use semantic versioning: model-v1.0, model-v2.1.
# o	Include context in the version (e.g., model-2025-01-v1).
# 2.	Storage:
# o	Save models with metadata using tools like:
# 	MLflow: Tracks experiments, models, and associated metrics.
# 	DVC: Tracks model files, datasets, and pipelines.
# 	Databases/File Systems: Store as .pkl or .h5 files.

from sklearn.ensemble import RandomForestClassifier
import joblib

model = RandomForestClassifier()
joblib.dump(model, "model-v1.pkl")


['model-v1.pkl']

In [3]:
# 3.	Metadata Tracking:
# o	Store metadata such as hyperparameters, metrics, and dataset versions.

metadata = {
    "version": "v1.0",
    "date": "2025-01-01",
    "hyperparameters": {"learning_rate": 0.01, "n_estimators": 100},
    "metrics": {"accuracy": 0.95, "f1_score": 0.92}
}


In [4]:
# Searches exhaustively over a range of specified hyperparameter values.

import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import load_iris

# Load datasets (train.csv, test.csv, etc.) into Pandas dataframes.

train = pd.read_csv("D:/Personal/Kifiya 10 Academy/Week 4/data/Data/rossmann-store-sales/train.csv")
# test = pd.read_csv("D:/Personal/Kifiya 10 Academy/Week 4/data/Data/rossmann-store-sales/test.csv")
test = pd.read_csv("D:/Personal/Kifiya 10 Academy/Week 4/data/Data/rossmann-store-sales/test.csv", dtype={'StateHoliday': str})

# Convert 'Date' column to datetime
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

# Extract useful features from 'Date'
train['Year'] = train['Date'].dt.year
train['Month'] = train['Date'].dt.month
train['Day'] = train['Date'].dt.day

test['Year'] = test['Date'].dt.year
test['Month'] = test['Date'].dt.month
test['Day'] = test['Date'].dt.day

# Encode categorical variables
train = pd.get_dummies(train, columns=['StateHoliday'], drop_first=True)
test = pd.get_dummies(test, columns=['StateHoliday'], drop_first=True)

# Define features and target variable
X = train.drop(columns=['Sales', 'Date'])
y = train['Sales']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(), params, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


  train = pd.read_csv("D:/Personal/Kifiya 10 Academy/Week 4/data/Data/rossmann-store-sales/train.csv")
Traceback (most recent call last):
  File "c:\Users\Almazt\AppData\Local\anaconda3\envs\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "c:\Users\Almazt\AppData\Local\anaconda3\envs\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 472, in __call__
    return estimator.score(*args, **kwargs)
  File "c:\Users\Almazt\AppData\Local\anaconda3\envs\myenv\lib\site-packages\sklearn\base.py", line 572, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "c:\Users\Almazt\AppData\Local\anaconda3\envs\myenv\lib\site-packages\sklearn\ensemble\_forest.py", line 904, in predict
    proba = self.predict_proba(X)
  File "c:\Users\Almazt\AppData\Local\anaconda3\envs\myenv\lib\site-packages\sklearn\ensemble\_forest.py", line 952, in predict_proba
    

KeyboardInterrupt: 

In [17]:
# Random Search:
# Randomly samples parameter combinations for a fixed number of iterations.

from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(RandomForestClassifier(), params, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)


Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 20}


In [18]:
# Bayesian Optimization:
# •	Models the performance function and selects the best parameters iteratively.
# •	Use frameworks like Optuna or Hyperopt.

# %pip install optuna
import optuna

def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 10, 50)
    
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    score = cross_val_score(model, X_train, y_train, cv=3).mean()
    
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("Best Parameters:", study.best_params)


[I 2025-01-02 11:39:12,769] A new study created in memory with name: no-name-a10477c7-8cbd-4428-8ae3-a0fef1daf9d1
[I 2025-01-02 11:39:13,029] Trial 0 finished with value: 0.9500000000000001 and parameters: {'n_estimators': 62, 'max_depth': 10}. Best is trial 0 with value: 0.9500000000000001.
[I 2025-01-02 11:39:13,685] Trial 1 finished with value: 0.9416666666666668 and parameters: {'n_estimators': 173, 'max_depth': 29}. Best is trial 0 with value: 0.9500000000000001.
[I 2025-01-02 11:39:14,429] Trial 2 finished with value: 0.9500000000000001 and parameters: {'n_estimators': 189, 'max_depth': 21}. Best is trial 0 with value: 0.9500000000000001.
[I 2025-01-02 11:39:14,779] Trial 3 finished with value: 0.9583333333333334 and parameters: {'n_estimators': 77, 'max_depth': 14}. Best is trial 3 with value: 0.9583333333333334.
[I 2025-01-02 11:39:15,254] Trial 4 finished with value: 0.9500000000000001 and parameters: {'n_estimators': 121, 'max_depth': 21}. Best is trial 3 with value: 0.958333

Best Parameters: {'n_estimators': 77, 'max_depth': 14}


In [19]:
# Types of Cross-Validation:
# 1.	K-Fold Cross-Validation:
# o	Splits data into k subsets and rotates testing folds.

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5)
print("Mean Accuracy:", scores.mean())

# 2.	Stratified K-Fold:
# o	Ensures class distribution is preserved in folds (useful for imbalanced datasets).

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    
# 3.	Time Series Split:
# o	Use for time-series data to preserve the temporal order.

from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    print("Train:", train_index, "Test:", test_index)
    
# 4.	Leave-One-Out Cross-Validation (LOOCV):
# o	Uses all data points except one for training.

from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
    print("Train:", train_index, "Test:", test_index)


Mean Accuracy: 0.96
Train: [ 10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45
  46  47  48  49  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149] Test: [  0   1   2   3   4   5   6   7   8   9  50  51  52  53  54  55  56  57
  58  59 100 101 102 103 104 105 106 107 108 109]
Train: [  0   1   2   3   4   5   6   7   8   9  20  21  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45
  46  47  48  49  50  51  52  53  54  55  56  57  58  59  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 