In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
import altair as alt

# Function to convert damage string to integer
def convert_damage(damage):
    """
    Convert a damage string of the format 'XdY+Z' to an average damage value.
    
    Parameters:
    damage (str): Damage string (e.g., '2d6+3')

    Returns:
    float: Average damage value
    """
    match = re.match(r'(\d+)d(\d+)(?:\+(\d+))?', damage)
    if match:
        num_dice = int(match.group(1))
        dice_size = int(match.group(2))
        bonus = int(match.group(3)) if match.group(3) else 0
        avg_damage = num_dice * (dice_size + 1) / 2 + bonus
        return avg_damage
    return np.nan

# Load and preprocess data
data = pd.read_csv('static/data/monsters_data.csv')
data = data.drop(['Timestamp'], axis=1)
data = data.dropna()
data['Rarity'] = data['Rarity'].astype('category').cat.codes
data['Damage'] = data['Damage'].apply(convert_damage)

# Ensure no negative values in the Damage column
data['Damage'] = data['Damage'].clip(lower=0)  # Clip negative values to zero

# Scale Damage using MinMaxScaler
scaler = MinMaxScaler()
data['Damage'] = scaler.fit_transform(data[['Damage']])

# Convert scaled Damage to integers
data['Damage'] = (data['Damage'] * 100).round().astype(int)

# Define target and features
target = 'Rarity'
X = data[['Health', 'Energy', 'Sanity', 'Damage']]
y = data[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

X_train_updated = X_train_poly
X_test_updated = X_test_poly

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define parameter grids for tuning
xgboost_param_grid = {
    'xgboost__n_estimators': [100, 200, 300],
    'xgboost__max_depth': [3, 5, 7],
    'xgboost__learning_rate': [0.01, 0.1, 0.3],
    'xgboost__subsample': [0.7, 0.8, 0.9],
    'xgboost__colsample_bytree': [0.7, 0.8, 0.9],
    'xgboost__reg_alpha': [0.1, 1.0],
    'xgboost__reg_lambda': [1.0, 2.0]
}

rf_param_grid = {
    'randomforest__n_estimators': [100, 200, 300, 400],
    'randomforest__max_depth': [3, 5, 7, 10],
    'randomforest__min_samples_split': [2, 5, 10],
    'randomforest__min_samples_leaf': [1, 2, 4]
}

# Create pipelines for models
pipelines = {
    'xgboost': Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('xgboost', xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss'))
    ]),
    'randomforest': Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('randomforest', RandomForestClassifier(class_weight=class_weight_dict))
    ])
}

# Initialize results list
results = []
best_estimators = {}

# Perform grid search for XGBoost
print("Training XGBoost...")
grid_search_xgboost = GridSearchCV(pipelines['xgboost'], xgboost_param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search_xgboost.fit(X_train_updated, y_train)
best_estimators['xgboost'] = grid_search_xgboost.best_estimator_

# Evaluate XGBoost
y_pred = best_estimators['xgboost'].predict(X_test_updated)
accuracy = best_estimators['xgboost'].score(X_test_updated, y_test)
report = classification_report(y_test, y_pred)
results.append(('XGBoost', accuracy, report))

# Perform grid search for RandomForest
print("Training RandomForest...")
grid_search_rf = GridSearchCV(pipelines['randomforest'], rf_param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search_rf.fit(X_train_updated, y_train)
best_estimators['randomforest'] = grid_search_rf.best_estimator_

# Evaluate RandomForest
y_pred = best_estimators['randomforest'].predict(X_test_updated)
accuracy = best_estimators['randomforest'].score(X_test_updated, y_test)
report = classification_report(y_test, y_pred)
results.append(('RandomForest', accuracy, report))

# Define models for stacking classifier
estimators_for_stacking = [
    ('xgboost', best_estimators['xgboost']),
    ('randomforest', best_estimators['randomforest'])
]

# Check if there are at least two models in the stacking classifier
if len(estimators_for_stacking) >= 2:
    print("Training Stacking Classifier...")
    try:
        stacking_clf = StackingClassifier(
            estimators=estimators_for_stacking,
            final_estimator=LogisticRegression()
        )
        
        # Train Stacking Classifier
        stacking_clf.fit(X_train_updated, y_train)
        y_pred = stacking_clf.predict(X_test_updated)
        accuracy = stacking_clf.score(X_test_updated, y_test)
        report = classification_report(y_test, y_pred)
        results.append(('Stacking Classifier', accuracy, report))
    except Exception as e:
        print("Error training Stacking Classifier:", e)
else:
    print("Not enough models to train Stacking Classifier")

# Print results
for name, accuracy, report in results:
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(report)
    print("--------------------------------------------------")

# Print a sample of the Damage column to verify no negative values
print(data['Damage'].sample(10))


Training XGBoost...
Training RandomForest...
Training Stacking Classifier...
Model: XGBoost
Accuracy: 0.74
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       266
           1       0.77      0.78      0.77       189
           2       0.62      0.59      0.61       146
           3       0.53      0.55      0.54        95
           4       0.62      0.57      0.60        79
           5       0.50      0.32      0.39        25

    accuracy                           0.74       800
   macro avg       0.66      0.63      0.64       800
weighted avg       0.73      0.74      0.73       800

--------------------------------------------------
Model: RandomForest
Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       266
           1       0.77      0.80      0.79       189
           2       0.68      0.63      0.65       146
 

In [2]:
chart = alt.Chart(data).mark_bar().encode(
    x=alt.X('Damage:Q', bin=True),
    y='count()',
    color='Rarity:N'
).properties(
    title='Distribution of Damage and Rarity'
)

chart.show()

In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
import xgboost as xgb

# Load and preprocess data
data = pd.read_csv('static/data/monsters_data.csv')
data = data.drop(['Timestamp'], axis=1)
data = data.dropna()
data['Rarity'] = data['Rarity'].astype('category').cat.codes

# Define target and features
target = 'Rarity'
X = data[['Health', 'Energy', 'Sanity']]
y = data[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define parameter grids for tuning
xgboost_param_grid = {
    'xgboost__n_estimators': [100, 200, 300],
    'xgboost__max_depth': [3, 5, 7],
    'xgboost__learning_rate': [0.01, 0.1, 0.3],
    'xgboost__subsample': [0.7, 0.8, 0.9],
    'xgboost__colsample_bytree': [0.7, 0.8, 0.9],
    'xgboost__reg_alpha': [0.1, 1.0],
    'xgboost__reg_lambda': [1.0, 2.0]
}

rf_param_grid = {
    'randomforest__n_estimators': [100, 200, 300, 400],
    'randomforest__max_depth': [3, 5, 7, 10],
    'randomforest__min_samples_split': [2, 5, 10],
    'randomforest__min_samples_leaf': [1, 2, 4]
}

# Create pipelines for models
pipelines = {
    'xgboost': Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('xgboost', xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss'))
    ]),
    'randomforest': Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('randomforest', RandomForestClassifier(class_weight=class_weight_dict))
    ])
}

# Initialize results list
results = []
best_estimators = {}

# Perform grid search for XGBoost
print("Training XGBoost...")
grid_search_xgboost = GridSearchCV(pipelines['xgboost'], xgboost_param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search_xgboost.fit(X_train_poly, y_train)
best_estimators['xgboost'] = grid_search_xgboost.best_estimator_

# Evaluate XGBoost
y_pred = best_estimators['xgboost'].predict(X_test_poly)
accuracy = best_estimators['xgboost'].score(X_test_poly, y_test)
report = classification_report(y_test, y_pred)
results.append(('XGBoost', accuracy, report))

# Perform grid search for RandomForest
print("Training RandomForest...")
grid_search_rf = GridSearchCV(pipelines['randomforest'], rf_param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search_rf.fit(X_train_poly, y_train)
best_estimators['randomforest'] = grid_search_rf.best_estimator_

# Evaluate RandomForest
y_pred = best_estimators['randomforest'].predict(X_test_poly)
accuracy = best_estimators['randomforest'].score(X_test_poly, y_test)
report = classification_report(y_test, y_pred)
results.append(('RandomForest', accuracy, report))

# Define models for stacking classifier
estimators_for_stacking = [
    ('xgboost', best_estimators['xgboost']),
    ('randomforest', best_estimators['randomforest'])
]

# Check if there are at least two models in the stacking classifier
if len(estimators_for_stacking) >= 2:
    print("Training Stacking Classifier...")
    try:
        stacking_clf = StackingClassifier(
            estimators=estimators_for_stacking,
            final_estimator=LogisticRegression()
        )
        
        # Train Stacking Classifier
        stacking_clf.fit(X_train_poly, y_train)
        y_pred = stacking_clf.predict(X_test_poly)
        accuracy = stacking_clf.score(X_test_poly, y_test)
        report = classification_report(y_test, y_pred)
        results.append(('Stacking Classifier', accuracy, report))
    except Exception as e:
        print("Error training Stacking Classifier:", e)
else:
    print("Not enough models to train Stacking Classifier")

# Print results
for name, accuracy, report in results:
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(report)
    print("--------------------------------------------------")


Training XGBoost...
Training RandomForest...
Training Stacking Classifier...
Model: XGBoost
Accuracy: 0.62
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.89      0.84       266
           1       0.58      0.57      0.58       189
           2       0.49      0.47      0.48       146
           3       0.43      0.42      0.43        95
           4       0.59      0.44      0.51        79
           5       0.44      0.32      0.37        25

    accuracy                           0.62       800
   macro avg       0.55      0.52      0.53       800
weighted avg       0.61      0.62      0.61       800

--------------------------------------------------
Model: RandomForest
Accuracy: 0.61
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       266
           1       0.55      0.50      0.52       189
           2       0.51      0.54      0.53       146
 

In [3]:
data.head()

Unnamed: 0,Name,Type,Level,Rarity,Damage,Health,Energy,Sanity
0,Imp,Demonic,8,0,8d2+3,16.77,15.9,16.34
1,Sapphire Drake,Dragon,10,1,10d4+1,41.4,39.79,40.18
2,Ice Elemental,Elemental,4,4,4d10,43.19,38.07,39.98
3,Quasit,Demonic,18,1,18d4+2,72.72,72.95,71.42
4,Wyvern,Dragon,7,3,7d8+1,56.7,58.64,54.01


In [6]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

# Function to convert damage string to integer
def convert_damage(damage):
    match = re.match(r'(\d+)d(\d+)(?:\+(\d+))?', damage)
    if match:
        num_dice = int(match.group(1))
        dice_size = int(match.group(2))
        bonus = int(match.group(3)) if match.group(3) else 0
        avg_damage = num_dice * (dice_size + 1) / 2 + bonus
        return avg_damage
    return np.nan

# Load and preprocess data
data = pd.read_csv('static/data/monsters_data.csv')
data = data.drop(['Timestamp'], axis=1)  # Remove timestamp column
data = data.dropna()
data['Rarity'] = data['Rarity'].astype('category').cat.codes
data['Damage'] = data['Damage'].apply(convert_damage)

# Ensure no negative values in the Damage column
data['Damage'] = data['Damage'].clip(lower=0)  # Clip negative values to zero

# Scale Damage using MinMaxScaler
scaler = MinMaxScaler()
data['Damage'] = scaler.fit_transform(data[['Damage']])

# Define target and features
target = 'Rarity'
X = data[['Health', 'Energy', 'Sanity', 'Damage']]
y = data[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

X_train_updated = X_train_poly
X_test_updated = X_test_poly

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define parameter grids for tuning
xgboost_param_grid = {
    'xgboost__n_estimators': [100, 200, 300],
    'xgboost__max_depth': [3, 5, 7],
    'xgboost__learning_rate': [0.01, 0.1, 0.3],
    'xgboost__subsample': [0.7, 0.8, 0.9],
    'xgboost__colsample_bytree': [0.7, 0.8, 0.9],
    'xgboost__reg_alpha': [0.1, 1.0],
    'xgboost__reg_lambda': [1.0, 2.0]
}

rf_param_grid = {
    'randomforest__n_estimators': [100, 200, 300, 400],
    'randomforest__max_depth': [3, 5, 7, 10],
    'randomforest__min_samples_split': [2, 5, 10],
    'randomforest__min_samples_leaf': [1, 2, 4]
}

# Create pipelines for models
pipelines = {
    'xgboost': Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('xgboost', xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss'))
    ]),
    'randomforest': Pipeline([
        ('minmaxscaler', MinMaxScaler()),
        ('randomforest', RandomForestClassifier(class_weight=class_weight_dict))
    ])
}

# Initialize results list
results = []
best_estimators = {}

# Perform grid search for XGBoost
print("Training XGBoost...")
grid_search_xgboost = GridSearchCV(pipelines['xgboost'], xgboost_param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search_xgboost.fit(X_train_updated, y_train)
best_estimators['xgboost'] = grid_search_xgboost.best_estimator_

# Evaluate XGBoost
y_pred = best_estimators['xgboost'].predict(X_test_updated)
accuracy = best_estimators['xgboost'].score(X_test_updated, y_test)
report = classification_report(y_test, y_pred)
results.append(('XGBoost', accuracy, report))

# Perform grid search for RandomForest
print("Training RandomForest...")
grid_search_rf = GridSearchCV(pipelines['randomforest'], rf_param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search_rf.fit(X_train_updated, y_train)
best_estimators['randomforest'] = grid_search_rf.best_estimator_

# Evaluate RandomForest
y_pred = best_estimators['randomforest'].predict(X_test_updated)
accuracy = best_estimators['randomforest'].score(X_test_updated, y_test)
report = classification_report(y_test, y_pred)
results.append(('RandomForest', accuracy, report))

# Define models for stacking classifier
estimators_for_stacking = [
    ('xgboost', best_estimators['xgboost']),
    ('randomforest', best_estimators['randomforest'])
]

# Check if there are at least two models in the stacking classifier
if len(estimators_for_stacking) >= 2:
    print("Training Stacking Classifier...")
    try:
        stacking_clf = StackingClassifier(
            estimators=estimators_for_stacking,
            final_estimator=LogisticRegression()
        )
        
        # Train Stacking Classifier
        stacking_clf.fit(X_train_updated, y_train)
        y_pred = stacking_clf.predict(X_test_updated)
        accuracy = stacking_clf.score(X_test_updated, y_test)
        report = classification_report(y_test, y_pred)
        results.append(('Stacking Classifier', accuracy, report))
    except Exception as e:
        print("Error training Stacking Classifier:", e)
else:
    print("Not enough models to train Stacking Classifier")

# Print results
for name, accuracy, report in results:
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(report)
    print("--------------------------------------------------")

# Print a sample of the Damage column to verify no negative values
print(data['Damage'].sample(10))

Training XGBoost...
Training RandomForest...
Training Stacking Classifier...
Model: XGBoost
Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       266
           1       0.81      0.84      0.83       189
           2       0.68      0.65      0.67       146
           3       0.63      0.66      0.65        95
           4       0.70      0.56      0.62        79
           5       0.48      0.40      0.43        25

    accuracy                           0.79       800
   macro avg       0.70      0.68      0.69       800
weighted avg       0.78      0.79      0.78       800

--------------------------------------------------
Model: RandomForest
Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94       266
           1       0.79      0.78      0.78       189
           2       0.65      0.64      0.65       146
 

In [124]:
'''
Project Summary:
In this project, I developed a comprehensive machine learning workflow, encompassing data preprocessing, model training, tuning, and API integration. The preprocessing steps involved cleaning, scaling, and feature engineering to prepare the dataset. I trained and tuned three models: XGBoost, Random Forest, and a Stacking Classifier, utilizing grid search for hyperparameter optimization. The final deliverable included a Machine Learning Interface Class, encapsulating the model, handling data input, making predictions, and supporting model serialization through joblib. Additionally, the class includes an info method, facilitating seamless integration with an API. This robust approach meets all project requirements and ensures a reliable and efficient machine learning solution ready for deployment.

Best Model:
The Stacking Classifier, with an accuracy of 0.79, emerged as the top-performing model. This model combines predictions from two strong classifiers, XGBoost and Random Forest, using a Logistic Regression model as the final estimator to boost overall performance. By leveraging the complementary strengths of XGBoost and Random Forest, the Stacking Classifier captures complex patterns within the data, significantly improving predictive accuracy. The model demonstrated particularly strong performance in predicting class 0 (the most common class), with high precision and recall, making it well-suited for this problem. Overall, the Stacking Classifier outperformed the other models tested in this project, offering reliable and accurate predictions.
    '''

'\nProject Summary:\n    In this project, I developed a comprehensive machine learning workflow, including model training, tuning, and integration into an API. Initially, I focused on preprocessing the data, which involved cleaning, scaling, and feature engineering. I trained and tuned three models: XGBoost, Random Forest, and a Stacking Classifier, using grid search for hyperparameter tuning. The Stacking Classifier model achieved the best performance with an accuracy of 0.79. I then created a Machine Learning Interface Class that encapsulates the model, handles data, makes predictions, and supports model serialization using joblib. Additionally, the class provides an info method to integrate the model with an API. This comprehensive approach ensures a robust and efficient machine learning solution, meeting all the project requirements and allowing for seamless integration and deployment.\n    \n    Best Model:\n    '