## Chapter 3 - Exploring Classical Machine Learning
This chapter covers regression and classification techniques using Scikit-learn, demonstrating how classical models like linear regression and decision trees are built, interpreted, and evaluated.

**Note:** Run the following cell to define constants related to datasets

In [None]:
# Root URL under which every dataset CSV is published
BASE_URL = "https://opensourceai-book.github.io/code/datasets/"

# CSV file names of the individual datasets
POWERS_FILE = "superheroes_powers.csv"
INFO_POWERS_FILE = "superheroes_info_powers.csv"
INFO_POWERS2_FILE = "superheroes_info_powers2.csv"
PLOTS_FILE = "superheroes_story_plots.csv"

# Full download URLs: BASE_URL already ends with a slash, so the file name
# is appended directly without an extra separator
SUPERHEROES_POWERS_URL = BASE_URL + POWERS_FILE
SUPERHEROES_INFO_POWERS_URL = BASE_URL + INFO_POWERS_FILE
SUPERHEROES_INFO_POWERS2_URL = BASE_URL + INFO_POWERS2_FILE
SUPERHEROES_INFO_PLOTS_URL = BASE_URL + PLOTS_FILE

### Listing 3-1: Linear Regression of Superhero Height and Weight
Performs linear regression to predict height from weight, using filtered superhero data and evaluating with the coefficient, intercept, and mean squared error. It then plots the actual versus predicted heights, visualizing the linear regression model's performance and highlighting how well the model fits the superhero data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step 1: Data Preprocessing
# Read the superhero info/powers table
df = pd.read_csv(SUPERHEROES_INFO_POWERS_URL)

# Keep only characters whose race is "Human" or "Cyborg"
allowed_races = ['Human', 'Cyborg']
df = df[df['Race'].isin(allowed_races)]

# Discard rows where 'Height' or 'Weight' is blank (NaN)
df = df.dropna(subset=['Height', 'Weight'])

# Simple outlier removal: keep Weight within [30, 400] and Height within
# [100, 350] (both bounds inclusive)
df = df[df['Weight'].between(30, 400)]
df = df[df['Height'].between(100, 350)]

# Regression inputs: Weight is the single feature (kept 2-D, as
# scikit-learn expects), Height is the target
X = df[['Weight']].to_numpy()  # Independent variable (Weight)
y = df['Height'].to_numpy()    # Dependent variable (Height)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: Build and Train the Model
model = LinearRegression().fit(X_train, y_train)

# Step 3: Make Predictions and Evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Coefficient: {model.coef_[0]}")
print(f"Intercept: {model.intercept_}")
print(f"Mean Squared Error: {mse:.2f}")

Run the following code cell to plot the results

In [None]:
# Step 4: Plot the Results
# Test-set points in blue, the fitted regression line in red
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='blue', label='Actual Heights')
ax.plot(X_test, y_pred, color='red', label='Predicted Heights')
ax.set_xlabel("Weight")
ax.set_ylabel("Height")
ax.set_title("Linear Regression: Height vs. Weight (Human & Cyborg)")
ax.legend()
plt.show()

### Listing 3-2: Training Classification Models for Superhero Races
Trains logistic regression and decision tree models to classify superheroes into races (Human, Mutant, Cyborg) using powers data, then outputs accuracy and performance metrics. It then visualizes the decision tree using a simplified, color-coded representation to help understand how the model makes predictions based on superhero attributes.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the superhero powers dataset
df_powers = pd.read_csv(SUPERHEROES_POWERS_URL)

# Load the additional superhero info for race data
info_df = pd.read_csv(SUPERHEROES_INFO_POWERS_URL)

# Merge powers data with race information (inner join on the hero name,
# which is called 'hero_names' in one table and 'name' in the other)
df = df_powers.merge(info_df[['name', 'Race']],
                     left_on='hero_names', right_on='name')

# Filter to only the most common races: Human, Mutant, Cyborg
df = df[df['Race'].isin(['Human', 'Mutant', 'Cyborg'])]
df = df.dropna()  # Drop any rows with missing data

# Convert Race to numerical values
df['Race'] = df['Race'].map({'Human': 0, 'Mutant': 1, 'Cyborg': 2})

# Select all powers as features: every column except the two name columns
# and the label, cast to integers
X = df.drop(columns=['hero_names', 'name', 'Race']).astype(int).values
y = df['Race'].values  # Labels: Human (0), Mutant (1), Cyborg (2)

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic Regression Model
# max_iter is raised so the solver has enough iterations to converge
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
log_predictions = log_model.predict(X_test)
log_accuracy = accuracy_score(y_test, log_predictions)
log_report = classification_report(y_test, log_predictions)

# Decision Tree Model
# A fixed random_state makes the fitted tree reproducible: scikit-learn
# permutes the candidate features at every split, so without a seed ties
# between equally good splits are broken differently on each run and the
# metrics and the plotted tree would change between runs.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
tree_predictions = tree_model.predict(X_test)
tree_accuracy = accuracy_score(y_test, tree_predictions)
tree_report = classification_report(y_test, tree_predictions)

# Output results
print("Logistic Regression Accuracy:", log_accuracy)
print("Logistic Regression Report:\n", log_report)
print("Decision Tree Accuracy:", tree_accuracy)
print("Decision Tree Report:\n", tree_report)

Run the following code cell to plot the decision tree

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Colors for the races, indexed by encoded label:
# 0 = Human (blue), 1 = Mutant (red), 2 = Cyborg (yellow)
colors = ['blue', 'red', 'yellow']

# Plot the decision tree using colors only
plt.figure(figsize=(12, 8))
ax = plt.gca()


def plot_node(node, depth, x, width):
    """Recursively draw the subtree rooted at ``node`` on the axes ``ax``.

    Internal nodes are drawn as two line segments to their children; leaf
    nodes are drawn as a small square colored by the predicted race.

    Args:
        node: Index of the node in ``tree_model.tree_`` (0 is the root).
        depth: Vertical position of the node (root = 0, grows downward
            once the y-axis is inverted).
        x: Horizontal position of the node.
        width: Horizontal offset used to place the two children; it is
            halved at each level so subtrees get narrower with depth.
    """
    tree = tree_model.tree_
    left_child = tree.children_left[node]
    right_child = tree.children_right[node]

    # For a leaf both child entries hold the same sentinel value, so the
    # two only differ for internal (split) nodes.
    if left_child != right_child:
        # Calculate positions of child nodes
        left_x = x - width / 2
        right_x = x + width / 2
        next_width = width / 2

        # Plot lines to child nodes
        ax.plot([x, left_x], [depth, depth + 1], 'k-')
        ax.plot([x, right_x], [depth, depth + 1], 'k-')

        # Recursively plot child nodes
        plot_node(left_child, depth + 1, left_x, next_width)
        plot_node(right_child, depth + 1, right_x, next_width)
    else:
        # Plot a colored square for leaf nodes. argmax gives a position in
        # classes_; translate it to the encoded race label so the color is
        # still right if a class is missing from the training split.
        class_index = tree.value[node].argmax()
        predicted_class = int(tree_model.classes_[class_index])
        ax.add_patch(mpatches.Rectangle((x - 0.1, depth - 0.1), 0.2, 0.2,
                                        color=colors[predicted_class]))


# Start plotting from the root node: scikit-learn stores the root at index 0,
# and it is placed at depth 0 so it ends up at the top of the figure.
plot_node(0, 0, 0.5, 1.0)

# Remove axes and add legend
ax.axis('off')
legend_patches = [mpatches.Patch(color=color, label=label) for color, label in
                  zip(colors, ['Human', 'Mutant', 'Cyborg'])]
plt.legend(handles=legend_patches, loc='upper right')
plt.title("Simplified Decision Tree Visualization")
plt.gca().invert_yaxis()  # Invert the y-axis so the root is at the top
plt.show()

### Listing 3-3: Dimensionality Reduction with PCA Power Score
This code uses PCA for dimensionality reduction, enhancing the superheroes_info_powers dataset with a more nuanced Power Score, offering a comprehensive measure of superheroes' abilities.

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the datasets
df_info_powers = pd.read_csv(SUPERHEROES_INFO_POWERS_URL)
df_powers = pd.read_csv(SUPERHEROES_POWERS_URL)

# Merge datasets on hero names (inner join: only heroes present in both)
df = df_info_powers.merge(df_powers, left_on='name', right_on='hero_names')

# Drop columns that are not power-related; what remains is cast to integers
non_power_cols = ['name', 'Gender', 'Race', 'Height', 'Publisher',
                  'Alignment', 'Weight', 'OPR', 'SDR', 'hero_names']
X = df.drop(columns=non_power_cols).astype(int)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA to reduce to one principal component
pca = PCA(n_components=1)
pca_power_score = pca.fit_transform(X_scaled)

# Round PCA power scores to one decimal point and flatten the (n, 1) result
# to a 1-D array so it maps cleanly onto a single column. The ndarray
# methods are used instead of np.round because this cell does not import
# numpy and must not rely on another cell having done so.
pca_power_score = pca_power_score.round(1).ravel()

# Add the PCA Power Score to the merged dataset
df['PCA_Power_Score'] = pca_power_score

# Match back only relevant columns to original info_powers dataset
# (left join: heroes without a powers row get NaN as their score)
df_info_powers = df_info_powers.merge(df[['name', 'PCA_Power_Score']],
                                      on='name', how='left')

# Save updated dataset to a new CSV file
# df_info_powers.to_csv(INFO_POWERS2_FILE, index=False)

# Preview the new dataset's features
print(df_info_powers)

### Listing 3-4: K-Means Clustering of Superheroes by PCA Power Score and Alignment
Applies K-Means Clustering on PCA Power Score and encoded Alignment to group superheroes, adding variation for clearer visualization of clusters using Matplotlib.

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the updated dataset with PCA Power Score
df_info_powers = pd.read_csv(SUPERHEROES_INFO_POWERS2_URL)

# Encode the 'Alignment' feature (0 for bad, 1 for good, 0.5 for neutral);
# any other value becomes NaN and is dropped below
alignment_map = {'bad': 0, 'good': 1, 'neutral': 0.5}
df_info_powers['Alignment_Encoded'] = df_info_powers['Alignment'].map(
    alignment_map
)

# Select PCA Power Score and Alignment (encoded)
df_cluster = df_info_powers[['PCA_Power_Score', 'Alignment_Encoded']].dropna()

# Apply K-Means Clustering with 3 clusters.
# n_init is set explicitly so the result does not depend on the default of
# the installed scikit-learn version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df_cluster['Cluster'] = kmeans.fit_predict(df_cluster)

# Add small random variation to 'Alignment_Encoded' values so points that
# share an alignment do not sit on one horizontal line (display only; the
# clustering above used the unjittered values)
variation = np.random.uniform(-0.1, 0.1, size=df_cluster.shape[0])
df_cluster['Alignment_Variation'] = df_cluster['Alignment_Encoded'] + variation

# Plot the clusters with adjusted scale and custom colors
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    df_cluster['PCA_Power_Score'], df_cluster['Alignment_Variation'],
    c=df_cluster['Cluster'], cmap='coolwarm', alpha=0.6
)
plt.xlabel('PCA Power Score')
plt.ylabel('Alignment')
plt.title('Clustering Using PCA Power Score with Alignment')
# The color encodes the cluster label, which takes the values 0, 1 and 2
plt.colorbar(scatter, label='Cluster', ticks=[0, 1, 2])
plt.yticks([0, 0.5, 1], ['Bad', 'Neutral', 'Good'])
plt.show()

### Listing 3-5: Cosine Similarity Calculation for Superheroes Relationships
Calculates the cosine similarity between superheroes, identifying the most and least similar pairs based on their powers. The results highlight relationships between characters.

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Read the dataset that already contains the PCA Power Score column
df_info_powers = pd.read_csv(SUPERHEROES_INFO_POWERS2_URL)

# Columns compared between heroes; rows missing any of them are skipped
features = ['PCA_Power_Score', 'OPR', 'SDR']
df_filtered = df_info_powers.dropna(subset=features)

# Feature matrix and the hero name belonging to each of its rows
X = df_filtered[features].to_numpy()
hero_names = df_filtered['name'].to_numpy()

# Pairwise cosine similarity between every two heroes
cosine_sim_matrix = cosine_similarity(X)

# Label both axes of the matrix with the hero names
similarities = pd.DataFrame(
    cosine_sim_matrix,
    index=hero_names,
    columns=hero_names,
)

# Long format: one row per (index, variable, value) pair, then drop the
# pairs where a hero is compared with a hero of the same name
flattened = similarities.reset_index().melt(id_vars='index')
same_hero = flattened['index'] == flattened['variable']
flattened = flattened[~same_hero]

# Rows holding the highest and the lowest similarity score
scores = flattened['value']
most_similar = flattened.loc[scores.idxmax()]
least_similar = flattened.loc[scores.idxmin()]

# Most and Least Similar Heroes
print("Most Similar Heroes:")
print(f"Hero 1: {most_similar['index']}")
print(f"Hero 2: {most_similar['variable']}")
print(f"Similarity Score: {most_similar['value']}\n")

print("Least Similar Heroes:")
print(f"Hero 1: {least_similar['index']}")
print(f"Hero 2: {least_similar['variable']}")
print(f"Similarity Score: {least_similar['value']}")

### Listing 3-6: Training Step 1: Loading and Encoding
Combines multiple traits into a single textual representation that summarizes each superhero's profile, and encodes gender from text to numerical values, preparing the superhero profiles for analysis during training.

In [None]:
# Load the required libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the dataset and keep only rows that have a Gender value
df = pd.read_csv(SUPERHEROES_INFO_POWERS2_URL)
df = df.dropna(subset=['Gender'])

# (label, column, placeholder for missing values) for every trait that goes
# into the description, in output order. Text traits fall back to "Unknown";
# numeric traits fall back to 0 and are converted to strings.
profile_fields = [
    ("Name: ", 'name', "Unknown"),
    ("Race: ", 'Race', "Unknown"),
    ("Height: ", 'Height', 0),
    ("Weight: ", 'Weight', 0),
    ("Offense Boost: ", 'OPR', 0),
    ("Defense Boost: ", 'SDR', 0),
    ("PCA Power Score: ", 'PCA_Power_Score', 0),
    ("Alignment: ", 'Alignment', "Unknown"),
]

# Build "Label: value" pieces and chain them with ", " into one text column
description = None
for label, column, placeholder in profile_fields:
    values = df[column].fillna(placeholder)
    if not isinstance(placeholder, str):
        values = values.astype(str)
    piece = label + values
    description = piece if description is None else description + ", " + piece
df['Description'] = description

# Turn the Gender labels into integer codes
le_gender = LabelEncoder()
df['GenderEncoded'] = le_gender.fit_transform(df['Gender'])

# Show the first ten rows of the new columns
df[['Description', 'Gender', 'GenderEncoded']].head(10)

### Listing 3-7: Training Step 2 - Testing the Baseline Accuracy
Calculates baseline accuracy by predicting the most frequent class in the training data, providing a benchmark for evaluating model performance.

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Transform textual features into numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['Description'])

# Encode the Gender field into numerical format. LabelEncoder numbers the
# labels in sorted order (e.g., Female -> 0, Male -> 1).
le_gender = LabelEncoder()
y = le_gender.fit_transform(df['Gender'])

# Split the dataset into 70% training and 30% testing subsets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create a baseline prediction using the most frequent class from training
# data. y_train holds non-negative integer codes, so bincount counts the
# occurrences of each code and argmax picks the most common one (taking the
# first training label instead would depend on the shuffle, not on the
# class frequencies).
most_frequent_class = np.bincount(y_train).argmax()
baseline_pred = [most_frequent_class] * len(y_test)

# Calculate the baseline accuracy by comparing predictions to actual test labels
baseline_accuracy = accuracy_score(y_test, baseline_pred)
print(f"Baseline Accuracy (Before Training): {baseline_accuracy:.2f}")

### Listing 3-8: Step 3 - Training a Naive Bayes Model
Trains a Naive Bayes classifier using TF-IDF features, evaluates predictions on test data, and reports the model's accuracy. It ensures effective text classification.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score
from sklearn.metrics import recall_score, f1_score

# Fit a multinomial Naive Bayes classifier on the baseline TF-IDF features
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict the labels of the held-out test set
train_pred = nb_model.predict(X_test)

# Accuracy on the test set
train_accuracy = accuracy_score(y_test, train_pred)

# Precision, recall and F1, each averaged over the classes weighted by
# how many test samples each class has
precision, recall, f1 = (
    metric(y_test, train_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the headline metrics
print(f"Accuracy After Training: {train_accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Per-class breakdown, labelled with the original gender names
print("\nClassification Report:")
print(classification_report(y_test, train_pred, target_names=le_gender.classes_))

### Listing 3-9: Step 4 - Fine Tuning
Refines TF-IDF parameters, preprocesses descriptions, and retrains the Naive Bayes model for improved performance.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Field labels that appear in every description and therefore carry no
# information; they are stripped before vectorizing
structural_terms = (
    r"(Name:|Race:|Height:|Weight:|Offense Boost:|Defense Boost:|"
    r"PCA Power Score:|Alignment:)"
)
df['Processed_Description'] = df['Description'].replace(
    structural_terms, "", regex=True
).str.strip()

# TF-IDF settings tuned for these short, structured descriptions
tfidf_vectorizer_tuned = TfidfVectorizer(
    ngram_range=(1, 1),  # Unigrams only
    min_df=1,            # Keep terms even if they occur in a single document
    max_df=0.8,          # Drop terms found in more than 80% of documents
    stop_words=None,     # No stop-word list
    max_features=200     # Keep at most 200 terms
)

# Vectorize the cleaned descriptions
X_tuned = tfidf_vectorizer_tuned.fit_transform(df['Processed_Description'])

# 70% / 30% train/test split with the same seed as the baseline split
X_train_tuned, X_test_tuned, y_train, y_test = train_test_split(
    X_tuned, y, test_size=0.3, random_state=42
)

# Retrain Naive Bayes on the tuned TF-IDF features
nb_model = MultinomialNB().fit(X_train_tuned, y_train)

# Predict the test set with the retrained model
fine_tuned_pred = nb_model.predict(X_test_tuned)

# Report accuracy and the per-class breakdown
fine_tuned_accuracy = accuracy_score(y_test, fine_tuned_pred)
print(f"Accuracy After Fine-Tuning: {fine_tuned_accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, fine_tuned_pred, target_names=le_gender.classes_))


### Listing 3-10: Step 5 - Visualizing a Confusion Matrix for Model Evaluation
Generates and visualizes the confusion matrix, showing the breakdown of correct and incorrect predictions for each class in the dataset.

In [None]:
# Build the confusion matrix of the fine-tuned model and draw it
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Counts of test samples per (true label, predicted label) pair
cm_tuned = confusion_matrix(y_test, fine_tuned_pred)

# Label the axes with the original gender names and use a blue color scale
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm_tuned,
    display_labels=le_gender.classes_,
)
cm_display.plot(cmap="Blues")

# Title the figure and show it
plt.title("Confusion Matrix: Fine-Tuned TF-IDF Model")
plt.show()

### Listing 3-11: SHAP Explanation for Random Forest
SHAP explains predictions of superhero gender from a random forest classifier trained on structured features.

In [None]:
# Import required libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import shap

# Load dataset and clean missing values
df = pd.read_csv(SUPERHEROES_INFO_POWERS2_URL)
feature_cols = ['Height', 'Weight', 'PCA_Power_Score', 'OPR', 'SDR']
gender_codes = {'Male': 1, 'Female': 0}
df = df.dropna(subset=['Gender'] + feature_cols)

# Keep only the genders the mapping below can encode; any other value would
# be mapped to NaN and make model.fit() fail on the target.
df = df[df['Gender'].isin(list(gender_codes))]

# Select features and encode target variable
X = df[feature_cols]
y = df['Gender'].map(gender_codes)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)

# Train Random Forest classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Use SHAP to explain predictions
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Extract the SHAP values of the Male class (encoded as 1). Depending on the
# SHAP version, a classifier yields either a list with one
# (samples, features) array per class or a single
# (samples, features, classes) array; indexing the latter with [1] would
# select sample 1 instead of class 1.
if isinstance(shap_values, list):
    shap_values_male = shap_values[1]
elif shap_values.ndim == 3:
    shap_values_male = shap_values[:, :, 1]
else:
    # NOTE(review): a single 2-D array is used as-is — confirm which class
    # it refers to for the installed SHAP version.
    shap_values_male = shap_values

# Visualize SHAP summary plot for the Male class
shap.summary_plot(shap_values_male, X_test, feature_names=X.columns)

# Display sample SHAP contributions with feature values.
# X_test keeps the shuffled row labels of the original DataFrame, so the SHAP
# frame must carry the same index; otherwise concat(axis=1) aligns on labels
# and pairs feature rows with the wrong (or missing) SHAP rows.
features_df = pd.DataFrame(X_test, columns=X.columns)
shap_values_df = pd.DataFrame(shap_values_male, columns=X.columns,
                              index=X_test.index)
summary_df = pd.concat([features_df, shap_values_df.add_prefix('SHAP_')], axis=1)

# Print the first 10 rows for inspection
print("\nSample Data with SHAP Contributions:\n")
print(summary_df.head(10))