In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# %% [markdown]
# # 🌌 Dark Theme Machine Learning Visualizations
# *20+ Seaborn plots with dark theme for EDA and model analysis*

# %% [code]
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_regression, load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc
import warnings
warnings.filterwarnings('ignore')

# %% [code]
# Notebook-wide dark theme: start from matplotlib's dark base style, then layer
# seaborn's "darkgrid" on top with custom background, grid and text colours.
dark_rc = {
    'axes.facecolor': '#1e1e1e',
    'figure.facecolor': '#121212',
    'grid.color': '#2d2d2d',
    'axes.edgecolor': '#444444',
    'axes.labelcolor': '#e0e0e0',
    'xtick.color': '#cfcfcf',
    'ytick.color': '#cfcfcf',
    'text.color': '#e6e6e6',
}
plt.style.use('dark_background')
sns.set_style("darkgrid", rc=dark_rc)

# Default font size for every figure
plt.rcParams['font.size'] = 12

# %% [code]
# Build the four demo datasets that the plotting sections below draw from.

# --- Binary classification: 1000 rows, 4 features, readable class labels ---
X_clf, y_clf = make_classification(
    n_samples=1000,
    n_features=4,
    n_informative=3,
    n_redundant=1,
    n_clusters_per_class=1,
    random_state=42,
)
clf_df = pd.DataFrame(X_clf, columns=[f'Feature_{i}' for i in range(4)])
clf_df['Target'] = pd.Series(y_clf, index=clf_df.index).map({0: 'Class A', 1: 'Class B'})

# --- Regression: 500 rows, 3 features, noisy continuous target ---
X_reg, y_reg = make_regression(
    n_samples=500,
    n_features=3,
    noise=15,
    random_state=42,
)
reg_df = pd.DataFrame(X_reg, columns=[f'Feature_{i}' for i in range(3)])
reg_df['Target'] = y_reg

# --- Time series: 100 daily points of a random walk plus a random category ---
# The random walk is drawn before the categories so the seeded stream is
# consumed in a fixed order.
np.random.seed(42)
dates = pd.date_range(start='2020-01-01', periods=100, freq='D')
random_walk = np.cumsum(np.random.randn(100)) + 50
random_category = np.random.choice(['A', 'B', 'C'], 100)
ts_df = pd.DataFrame({
    'Date': dates,
    'Value': random_walk,
    'Category': random_category,
})

# --- Iris: measurements plus the species name as a categorical column ---
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['Species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

# %% [markdown]
# ## 1. Scatter Plot with Hue
# Visualize feature relationships colored by class

# %% [code]
# Scatter of the first two classification features, coloured by class label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=clf_df, x='Feature_0', y='Feature_1', hue='Target',
    palette='viridis', alpha=0.8, edgecolor='#333333', s=75,
    ax=ax,
)
ax.set_title('Feature Relationship by Class', fontsize=16, pad=20)
ax.legend(title='Target Class', facecolor='#2d2d2d')
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 2. Distribution Plot (Hist + KDE)
# Compare feature distributions between classes

# %% [code]
# Histogram with KDE overlay of Feature_2, one colour per class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=clf_df,
    x='Feature_2',
    hue='Target',
    kde=True,
    bins=30,
    palette='rocket',
    alpha=0.6,
    edgecolor='#333333',
    ax=ax,
)
ax.set_title('Feature Distribution with Density Estimate', fontsize=16, pad=20)

# histplot builds its hue legend itself; the drawn bars carry no labels, so a
# bare plt.legend(...) would find no handles and replace the legend with an
# empty box. Restyle the legend seaborn already created instead.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('Class')
    hue_legend.get_frame().set_facecolor('#2d2d2d')

fig.tight_layout()
plt.show()

# %% [markdown]
# ## 3. Box Plot with Outliers
# Analyze feature distribution across categories

# %% [code]
# Box plot of petal length per iris species; outliers are drawn as red circles.
outlier_style = dict(marker='o', markersize=6, markerfacecolor='#ff6b6b')

fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(
    data=iris_df, x='Species', y='petal length (cm)',
    palette='mako', width=0.6, flierprops=outlier_style,
    ax=ax,
)
ax.set_title('Petal Length Distribution by Species', fontsize=16, pad=20)
ax.set_ylabel('Petal Length (cm)', labelpad=10)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 4. Violin Plot with Split Hue
# Compare distributions with density estimation

# %% [code]
# Violin plot of Feature_3 per class with quartile lines inside each violin.
fig, ax = plt.subplots(figsize=(12, 7))
sns.violinplot(
    data=clf_df, x='Target', y='Feature_3', hue='Target',
    split=True, inner='quartiles', palette='icefire', width=0.8,
    ax=ax,
)
ax.set_title('Feature Distribution with Density Estimation', fontsize=16, pad=20)
# The hue repeats the x variable, so an empty frameless legend hides the redundant key.
ax.legend([], [], frameon=False)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 5. Heatmap (Correlation Matrix)
# Identify feature relationships

# %% [code]
# Correlation matrix of the four iris measurements. The upper triangle
# (diagonal included) is masked so each pair is shown only once.
corr = iris_df.drop(columns='Species').corr()
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr, mask=mask,
    annot=True, fmt=".2f", annot_kws={'color': '#ffffff'},
    cmap='coolwarm', center=0,
    square=True, cbar_kws={'shrink': 0.8},
    ax=ax,
)
ax.set_title('Feature Correlation Matrix', fontsize=16, pad=20)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 6. Pair Plot with KDE Diagonals
# Multivariate relationships: pairwise scatter plots with per-species density curves on the diagonal

# %% [code]
# Pairwise scatter plots of the iris measurements, coloured by species, with
# filled KDE curves on the diagonal.
scatter_style = {'alpha': 0.7, 'edgecolor': '#333333'}
density_style = {'fill': True, 'alpha': 0.3}

pair_grid = sns.pairplot(
    iris_df,
    hue='Species',
    palette='viridis',
    diag_kind='kde',
    plot_kws=scatter_style,
    diag_kws=density_style,
    height=2.5,
)
# The title is placed slightly above the grid (y > 1).
pair_grid.fig.suptitle('Feature Relationships by Species', y=1.02, fontsize=16)
pair_grid.fig.tight_layout()
plt.show()

# %% [markdown]
# ## 7. Count Plot
# Class distribution visualization

# %% [code]
# Bar chart of the number of samples per class, with each count written at the
# top of its bar.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=clf_df, x='Target', palette='crest', edgecolor='#444444')

for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid_x = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_top)}',
        (bar_mid_x, bar_top),
        ha='center', va='center',
        xytext=(0, 5),              # nudge the label 5 points upward
        textcoords='offset points',
        color='#e0e0e0',
    )

ax.set_title('Class Distribution', fontsize=16, pad=20)
ax.set_ylabel('Count', labelpad=10)
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 8. Regression Plot
# Feature-target relationship with confidence interval

# %% [code]
# Scatter of Feature_0 against the regression target with a fitted line and
# its 95% confidence band.
point_style = {'alpha': 0.6, 'edgecolor': '#333333', 's': 60}
fit_line_style = {'color': '#ff7043', 'lw': 2.5}

fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=reg_df, x='Feature_0', y='Target',
    color='#64b5f6', ci=95,
    scatter_kws=point_style, line_kws=fit_line_style,
    ax=ax,
)
ax.set_title('Feature vs Target with Confidence Interval', fontsize=16, pad=20)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 9. Residual Plot
# Model error analysis

# %% [code]
# Fit a one-feature linear model, store its predictions and residuals on
# reg_df, then plot residuals against predictions with a LOWESS trend line.
from sklearn.linear_model import LinearRegression

single_feature = reg_df[['Feature_0']]
model = LinearRegression().fit(single_feature, reg_df['Target'])
reg_df['Predictions'] = model.predict(single_feature)
reg_df['Residuals'] = reg_df['Target'] - reg_df['Predictions']

fig, ax = plt.subplots(figsize=(10, 6))
sns.residplot(
    data=reg_df, x='Predictions', y='Residuals',
    lowess=True,
    line_kws={'color': '#ff7043', 'lw': 2},
    scatter_kws={'alpha': 0.7, 'edgecolor': '#444444'},
    ax=ax,
)
# Dashed reference line at zero residual
ax.axhline(y=0, color='#e0e0e0', linestyle='--', alpha=0.5)
ax.set_title('Residual Analysis', fontsize=16, pad=20)
ax.set_xlabel('Predicted Values', labelpad=10)
ax.set_ylabel('Residuals', labelpad=10)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 10. Confusion Matrix
# Classification performance visualization

# %% [code]
# Hold out 30% of the classification data, train a random forest on the rest
# and show its confusion matrix on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X_clf, y_clf, test_size=0.3, random_state=42
)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['Actual A', 'Actual B'],
    columns=['Predicted A', 'Predicted B'],
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_df,
    annot=True, fmt='d', annot_kws={'size': 14, 'color': '#ffffff'},
    cmap='Blues', cbar=False,
    linewidths=0.5, linecolor='#2d2d2d',
    ax=ax,
)
ax.set_title('Confusion Matrix', fontsize=16, pad=20)
ax.set_xlabel('Predicted Label', labelpad=10)
ax.set_ylabel('True Label', labelpad=10)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 11. ROC Curve
# Classifier performance across thresholds

# %% [code]
# ROC curve of the random forest on the held-out split, using the predicted
# probability of the positive class (column 1).
y_prob = rf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='#7e57c2', lw=2.5,
        label=f'ROC Curve (AUC = {roc_auc:.2f})')
# Diagonal = chance-level classifier
ax.plot([0, 1], [0, 1], color='#e0e0e0', lw=1.5, linestyle='--', alpha=0.7)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', labelpad=10)
ax.set_ylabel('True Positive Rate', labelpad=10)
ax.set_title('Receiver Operating Characteristic', fontsize=16, pad=20)
ax.legend(loc="lower right", facecolor='#2d2d2d')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 12. Cluster Map
# Hierarchical clustering of correlations

# %% [code]
# Hierarchically clustered heatmap of the iris correlation matrix (`corr`).
#
# clustermap is a figure-level function: it creates its own figure, sized by
# its `figsize` argument. No plt.figure() call precedes it, because that would
# only leave an extra, empty figure in the cell output.
cluster_grid = sns.clustermap(
    corr,
    cmap='coolwarm',
    center=0,
    annot=True,
    fmt=".2f",
    annot_kws={'size': 10, 'color': '#ffffff'},
    figsize=(10, 8),
    cbar_pos=(0.02, 0.8, 0.05, 0.18)
)
# Attach the title to the clustermap's own figure rather than to whichever
# figure happens to be current.
cluster_grid.fig.suptitle('Hierarchical Clustered Correlation Matrix', y=1.02, fontsize=16)
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 13. Swarm + Box Plot
# Detailed distribution with individual points

# %% [code]
# Sepal width per species: a dark box plot (outliers hidden) as backdrop with
# every individual observation drawn on top as a swarm.
shared_mapping = dict(data=iris_df, x='Species', y='sepal width (cm)')

fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(
    **shared_mapping,
    color='#2d2d2d', width=0.4, showfliers=False,
    zorder=1,
    ax=ax,
)
sns.swarmplot(
    **shared_mapping,
    size=5, edgecolor='#333333', linewidth=0.5, palette='viridis',
    zorder=2,
    ax=ax,
)
ax.set_title('Sepal Width Distribution with Individual Points', fontsize=16, pad=20)
ax.set_ylabel('Sepal Width (cm)', labelpad=10)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 14. Point Plot with Error Bars
# Mean estimates with confidence intervals

# %% [code]
# Mean of Feature_1 per class with 95% confidence-interval error bars,
# drawn as diamond markers joined by a line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.pointplot(
    data=clf_df, x='Target', y='Feature_1',
    ci=95, join=True,
    color='#ff9800', markers='D', scale=1.2,
    errwidth=1.5, capsize=0.1,
    ax=ax,
)
ax.set_title('Mean Feature Value with 95% CI', fontsize=16, pad=20)
ax.set_ylabel('Mean Feature_1 Value', labelpad=10)
ax.grid(alpha=0.2)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 15. Joint Distribution Plot
# Bivariate relationship with marginal distributions

# %% [code]
# Joint view of Feature_1 against the regression target: scatter in the centre,
# histogram with KDE on each margin.
g = sns.JointGrid(
    data=reg_df,
    x='Feature_1',
    y='Target',
    height=8,
    ratio=4,
    space=0.2
)
# `s` is the matplotlib marker area. The seaborn `size` argument is a semantic
# mapping (a data variable that drives marker size), so passing a scalar to it
# does not set the marker size. `palette` is omitted because there is no `hue`
# variable for it to colour.
g.plot_joint(
    sns.scatterplot,
    alpha=0.6,
    edgecolor='#333333',
    s=40
)
g.plot_marginals(
    sns.histplot,
    kde=True,
    color='#4fc3f7',
    alpha=0.7,
    edgecolor='#333333'
)
g.ax_joint.set_title('Joint Distribution of Feature and Target', fontsize=16, pad=20)
g.ax_joint.grid(alpha=0.2)
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 16. Time Series with Shaded Area
# Temporal trends with uncertainty bands

# %% [code]
# 7-day rolling mean and standard deviation of the series. The first six rows
# are NaN because the window is not yet full.
ts_df['Rolling_Mean'] = ts_df['Value'].rolling(window=7).mean()
ts_df['Rolling_Std'] = ts_df['Value'].rolling(window=7).std()

# Band of one rolling standard deviation either side of the rolling mean
band_lower = ts_df['Rolling_Mean'] - ts_df['Rolling_Std']
band_upper = ts_df['Rolling_Mean'] + ts_df['Rolling_Std']

plt.figure(figsize=(14, 7))
plt.plot(ts_df['Date'], ts_df['Value'],
         color='#29b6f6', alpha=0.7, linewidth=2, label='Daily Value')
plt.plot(ts_df['Date'], ts_df['Rolling_Mean'],
         color='#ff7043', linewidth=3, label='7-Day Rolling Mean')
plt.fill_between(
    ts_df['Date'],
    band_lower,
    band_upper,
    color='#5c6bc0',
    alpha=0.3,
    label='±1 Std Dev'  # was mis-encoded as 'Â±1 Std Dev'
)
plt.title('Time Series with Rolling Statistics', fontsize=16, pad=20)
plt.xlabel('Date', labelpad=10)
plt.ylabel('Value', labelpad=10)
plt.legend(facecolor='#2d2d2d')
plt.grid(alpha=0.2)
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 17. Categorical Scatter Plot
# Relationship between categories and numerical feature

# %% [code]
# Jittered strip plot of the time-series values grouped and coloured by category.
fig, ax = plt.subplots(figsize=(12, 7))
sns.stripplot(
    data=ts_df, x='Category', y='Value', hue='Category',
    palette='deep', size=6, alpha=0.8, jitter=True,
    edgecolor='#333333', linewidth=0.5,
    ax=ax,
)
ax.set_title('Value Distribution by Category', fontsize=16, pad=20)
ax.set_ylabel('Value', labelpad=10)
ax.legend(title='Category', facecolor='#2d2d2d')
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 18. KDE Plot with Multiple Distributions
# Smooth density estimation for multiple groups

# %% [code]
# Filled density curves of sepal length, one per species. Each curve is
# normalised on its own (common_norm=False).
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(
    data=iris_df,
    x='sepal length (cm)',
    hue='Species',
    fill=True,
    common_norm=False,
    palette='viridis',
    alpha=0.6,
    linewidth=2,
    edgecolor='#333333',
    ax=ax,
)
ax.set_title('Sepal Length Density by Species', fontsize=16, pad=20)
ax.set_xlabel('Sepal Length (cm)', labelpad=10)

# kdeplot builds its hue legend itself; the drawn curves carry no labels, so a
# bare plt.legend(...) would find no handles and replace the legend with an
# empty box. Restyle the legend seaborn already created instead.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('Species')
    hue_legend.get_frame().set_facecolor('#2d2d2d')

ax.grid(alpha=0.2)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 19. Bar Plot with Error Bars
# Group comparisons with ±1 standard deviation error bars

# %% [code]
# Mean and standard deviation of petal length per species (one row per species).
grouped = iris_df.groupby('Species').agg(
    mean=('petal length (cm)', 'mean'),
    std=('petal length (cm)', 'std')
).reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=grouped,
    x='Species',
    y='mean',
    color='#4fc3f7',
    edgecolor='#333333',
    alpha=0.8,
    ax=ax,
)
# Draw the ±1 std error bars explicitly. Forwarding `yerr` through seaborn's
# keyword arguments relies on undocumented pass-through to matplotlib, and
# seaborn's own `capsize` applies only to the error bars seaborn computes, not
# to `yerr` bars. Bars of a categorical axis sit at positions 0..n-1, in the
# same order as the rows of `grouped`.
ax.errorbar(
    x=np.arange(len(grouped)),
    y=grouped['mean'],
    yerr=grouped['std'],
    fmt='none',
    ecolor='#e0e0e0',
    elinewidth=1.5,
    capsize=6,
)
# was mis-encoded as 'Â±1 Std Dev'
ax.set_title('Mean Petal Length by Species (±1 Std Dev)', fontsize=16, pad=20)
ax.set_ylabel('Mean Petal Length (cm)', labelpad=10)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

# %% [markdown]
# ## 20. Pair Grid with Custom Plots
# Advanced multi-plot visualization

# %% [code]
# Pair grid with a different plot type per region: scatter above the diagonal,
# filled KDE below it, histogram with KDE on it.
g = sns.PairGrid(
    iris_df, hue='Species', palette='viridis',
    diag_sharey=False, height=2.5, aspect=1.1,
)

upper_style = dict(alpha=0.7, edgecolor='#333333')
lower_style = dict(fill=True, alpha=0.5, common_norm=False)
diag_style = dict(kde=True, alpha=0.6, edgecolor='#333333')

g.map_upper(sns.scatterplot, **upper_style)
g.map_lower(sns.kdeplot, **lower_style)
g.map_diag(sns.histplot, **diag_style)

g.add_legend(title='Species', facecolor='#2d2d2d')
g.fig.suptitle('Custom Pair Grid Visualization', y=1.02, fontsize=16)
g.fig.tight_layout()
plt.show()

# %% [markdown]
# ## 21. Radar Chart (Spider Plot)
# Multivariate comparison across features

# %% [code]
# Per-class mean of every feature, reshaped to long form
# (one row per class/feature pair) for the radar chart.
features = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
class_means = clf_df.groupby('Target')[features].mean().reset_index()
radar_df = pd.melt(class_means, id_vars='Target', var_name='Feature', value_name='Mean')

# One spoke per feature, evenly spaced around the circle. The first angle is
# repeated at the end so each polygon closes.
categories = features
N = len(categories)
angles = [k / float(N) * 2 * np.pi for k in range(N)]
angles.append(angles[0])

plt.figure(figsize=(10, 8))
ax = plt.subplot(111, polar=True)
plt.xticks(angles[:-1], categories, color='#e0e0e0', size=12)

# Zero-radius baseline
zero_ring = np.zeros_like(angles)
ax.plot(zero_ring, color='#2d2d2d', linewidth=1, linestyle='solid')
ax.fill(zero_ring, '#2d2d2d', alpha=0.1)

# One closed polygon per class
for target in radar_df['Target'].unique():
    is_target = radar_df['Target'] == target
    values = radar_df.loc[is_target, 'Mean'].tolist()
    values.append(values[0])
    ax.plot(angles, values, linewidth=2.5, linestyle='solid',
            label=target, marker='o', markersize=8)
    ax.fill(angles, values, alpha=0.15)

plt.title('Mean Feature Values by Class', fontsize=16, pad=30)
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), facecolor='#2d2d2d')
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 22. Random Forest Feature Importance
# Model interpretability visualization

# %% [code]
# Feature importances of a random forest trained on the full classification
# dataset.
#
# A separate model is fitted here instead of calling rf.fit(...) again: `rf`
# is the model the confusion-matrix and ROC cells evaluated on a held-out
# split, and refitting it in place on all data would make those cells report
# training-set performance if they were re-run afterwards.
rf_full = RandomForestClassifier(random_state=42).fit(X_clf, y_clf)
importances = pd.Series(
    rf_full.feature_importances_,
    index=[f'Feature_{i}' for i in range(4)]
)
importances = importances.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(
    x=importances.values,
    y=importances.index,
    palette='rocket',
    edgecolor='#333333',
    orient='h'
)
plt.title('Random Forest Feature Importance', fontsize=16, pad=20)
plt.xlabel('Importance Score', labelpad=10)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# %% [code]
# (Only if continuing from fresh session — otherwise skip)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_breast_cancer, load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_curve, classification_report
import warnings
warnings.filterwarnings('ignore')

# Re-apply the dark theme: matplotlib dark base style, seaborn "darkgrid" with
# colour overrides, and a default font size of 12.
theme_overrides = dict([
    ('axes.facecolor', '#1e1e1e'),
    ('figure.facecolor', '#121212'),
    ('grid.color', '#2d2d2d'),
    ('axes.edgecolor', '#444444'),
    ('axes.labelcolor', '#e0e0e0'),
    ('xtick.color', '#cfcfcf'),
    ('ytick.color', '#cfcfcf'),
    ('text.color', '#e6e6e6'),
])
plt.style.use('dark_background')
sns.set_style("darkgrid", rc=theme_overrides)
plt.rcParams['font.size'] = 12

In [None]:
# Precision-recall curve for a random forest on the breast cancer dataset.
from sklearn.model_selection import cross_val_predict

data = load_breast_cancer()
X, y = data.data, data.target
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Score every sample with a model that did not see it during training
# (5-fold out-of-fold probabilities). Predicting on the same rows the forest
# was fitted on would give a near-perfect, uninformative curve.
y_scores = cross_val_predict(rf, X, y, cv=5, method='predict_proba')[:, 1]

# cross_val_predict works on clones, so fit `rf` itself on the full data for
# the later cells that inspect the fitted model.
rf.fit(X, y)

precision, recall, _ = precision_recall_curve(y, y_scores)
plt.figure(figsize=(10, 6))
plt.plot(recall, precision, color='#4db6ac', linewidth=2.5, label='Precision-Recall Curve')
plt.xlabel('Recall', labelpad=10)
plt.ylabel('Precision', labelpad=10)
plt.title('Precision-Recall Curve', fontsize=16, pad=20)
plt.grid(alpha=0.3)
plt.legend(facecolor='#2d2d2d')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve: 5-fold CV accuracy of a random forest at ten training-set
# sizes from 10% to 100% of the available training data.
#
# The estimator itself is seeded. learning_curve's own random_state is only
# used when shuffle=True, so without a seed on the forest the scores would
# change on every run.
train_sizes, train_scores, val_scores = learning_curve(
    RandomForestClassifier(random_state=42), X, y, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10), random_state=42
)

# Mean and spread across the CV folds (axis 1) for each training size
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, 'o-', color='#64b5f6', label='Training Score')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='#64b5f6')
plt.plot(train_sizes, val_mean, 'o-', color='#ff7043', label='Validation Score')
plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.2, color='#ff7043')
plt.xlabel('Training Set Size', labelpad=10)
plt.ylabel('Accuracy', labelpad=10)
plt.title('Learning Curve', fontsize=16, pad=20)
plt.legend(facecolor='#2d2d2d')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import validation_curve

# Validation curve: 5-fold CV accuracy of a random forest as the number of
# trees grows from 10 to 190 in steps of 20.
param_range = np.arange(10, 201, 20)
# The forest is seeded so the curve is reproducible and differences between
# points reflect n_estimators rather than run-to-run randomness.
train_scores, val_scores = validation_curve(
    RandomForestClassifier(random_state=42), X, y, param_name='n_estimators',
    param_range=param_range, cv=5, n_jobs=-1
)

# Average across the CV folds (axis 1) for each parameter value
train_mean = np.mean(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(param_range, train_mean, 'o-', color='#64b5f6', label='Training Score')
plt.plot(param_range, val_mean, 'o-', color='#ff7043', label='Validation Score')
plt.xlabel('Number of Estimators', labelpad=10)
plt.ylabel('Accuracy', labelpad=10)
plt.title('Validation Curve: n_estimators', fontsize=16, pad=20)
plt.legend(facecolor='#2d2d2d')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
def ecdf(data):
    """Return the empirical cumulative distribution function of `data`.

    Args:
        data: One-dimensional sequence of values (anything `np.sort` and
            `len` accept).

    Returns:
        A pair `(x, y)` of arrays of equal length: `x` holds the values in
        ascending order and `y` the cumulative fractions 1/n, 2/n, ..., 1.
    """
    n_obs = len(data)
    sorted_values = np.sort(data)
    cumulative_fraction = np.arange(1, n_obs + 1) / n_obs
    return sorted_values, cumulative_fraction

# Plot the ECDF of the regression target as unconnected dots.
x_ecdf, y_ecdf = ecdf(reg_df['Target'])

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    x_ecdf, y_ecdf,
    marker='.', linestyle='none',
    color='#4fc3f7', alpha=0.8, markersize=4,
)
ax.set_xlabel('Target Value', labelpad=10)
ax.set_ylabel('Cumulative Probability', labelpad=10)
ax.set_title('Empirical Cumulative Distribution Function (ECDF)', fontsize=16, pad=20)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Hexagonal-bin density of Feature_0 against the regression target, with
# log-scaled bin colours.
fig, ax = plt.subplots(figsize=(10, 8))
hex_bins = ax.hexbin(
    reg_df['Feature_0'], reg_df['Target'],
    gridsize=30, cmap='viridis', bins='log',
)
fig.colorbar(hex_bins, ax=ax, label='log10(N)')
ax.set_xlabel('Feature_0', labelpad=10)
ax.set_ylabel('Target', labelpad=10)
ax.set_title('2D Density with Hexbin', fontsize=16, pad=20)
fig.tight_layout()
plt.show()

In [None]:
from matplotlib import cm

# Ridgeline-style plot: one thin row per iris measurement, each showing the
# filled density of that measurement per species.

# Long form: one row per (species, measurement, value)
iris_melt = iris_df.melt(id_vars='Species', var_name='Measurement', value_name='Value')

g = sns.FacetGrid(
    iris_melt,
    row='Measurement', hue='Species',
    aspect=5, height=1.2, palette='viridis',
)
g.map(sns.kdeplot, 'Value', fill=True, alpha=0.7, clip_on=False)
g.map(plt.axhline, y=0, lw=2, clip_on=False)   # baseline under every ridge

# Strip the default facet decorations; each row is labelled inside its axes.
g.set_titles('')
g.set(yticks=[], ylabel='')
g.despine(bottom=True)
measurement_names = iris_melt['Measurement'].unique()
for row_ax, row_label in zip(g.axes.flat, measurement_names):
    row_ax.text(0.02, 0.8, row_label, transform=row_ax.transAxes,
                fontweight='bold', color='#e0e0e0')

g.fig.suptitle('Ridgeline Plot: Feature Distributions by Species', y=1.02, fontsize=16)
g.fig.tight_layout()
plt.show()

In [None]:
# Bubble plot of 50 random points whose marker area and colour are random too.
# The four columns are drawn in a fixed order (x, y, size, color) from the
# seeded generator.
np.random.seed(42)
bubble_x = np.random.randn(50)
bubble_y = np.random.randn(50)
bubble_area = np.random.randint(100, 2000, 50)
bubble_shade = np.random.rand(50)
bubble_df = pd.DataFrame({
    'x': bubble_x,
    'y': bubble_y,
    'size': bubble_area,
    'color': bubble_shade,
})

fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    bubble_df['x'], bubble_df['y'],
    s=bubble_df['size'], c=bubble_df['color'],
    alpha=0.6, cmap='plasma',
    edgecolors='#333333', linewidth=0.5,
)
fig.colorbar(scatter, ax=ax, label='Color Intensity')
ax.set_xlabel('X Position', labelpad=10)
ax.set_ylabel('Y Position', labelpad=10)
ax.set_title('Bubble Plot with Variable Size & Color', fontsize=16, pad=20)
ax.grid(alpha=0.2)
fig.tight_layout()
plt.show()

In [None]:
from pandas.plotting import parallel_coordinates

# Parallel-coordinates view of five wine features, min-max scaled to [0, 1],
# on a 100-row sample coloured by class.
wine = load_wine()
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_df['Target'] = pd.Series(wine.target, index=wine_df.index).map(
    {0: 'Class 0', 1: 'Class 1', 2: 'Class 2'}
)

# Sample and normalize
sample = wine_df.sample(100, random_state=42)
features = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium']
norm_sample = sample[features + ['Target']].copy()
for col in features:
    col_min, col_max = norm_sample[col].min(), norm_sample[col].max()
    norm_sample[col] = (norm_sample[col] - col_min) / (col_max - col_min)

class_colors = ['#64b5f6', '#ff7043', '#4db6ac']
plt.figure(figsize=(12, 7))
parallel_coordinates(norm_sample, 'Target', color=class_colors, linewidth=1.2, alpha=0.7)
plt.title('Parallel Coordinates Plot (Normalized)', fontsize=16, pad=20)
plt.ylabel('Normalized Value', labelpad=10)
plt.xticks(rotation=30)
plt.grid(alpha=0.2)
plt.tight_layout()
plt.show()

In [None]:
from pandas.plotting import andrews_curves

# Andrews curves of the normalised wine sample, one colour per class.
fig, ax = plt.subplots(figsize=(12, 7))
andrews_curves(
    norm_sample, 'Target',
    color=['#64b5f6', '#ff7043', '#4db6ac'],
    alpha=0.7,
    ax=ax,
)
ax.set_title('Andrews Curves for Class Separation', fontsize=16, pad=20)
ax.grid(alpha=0.2)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.calibration import calibration_curve

# Calibration (reliability) curve: observed fraction of positives against the
# mean predicted probability in each of 10 probability bins.
fraction_of_positives, mean_predicted_value = calibration_curve(y, y_scores, n_bins=10)

plt.figure(figsize=(10, 6))
plt.plot(mean_predicted_value, fraction_of_positives, "s-", color='#4db6ac', label='Random Forest')
# Reference diagonal in a light colour. The previous "k:" (black dotted) line
# was practically invisible against the dark axes background.
plt.plot([0, 1], [0, 1], linestyle=':', color='#e0e0e0', label="Perfectly calibrated")
plt.xlabel('Mean Predicted Probability', labelpad=10)
plt.ylabel('Fraction of Positives', labelpad=10)
plt.title('Calibration Curve', fontsize=16, pad=20)
plt.legend(facecolor='#2d2d2d')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.inspection import PartialDependenceDisplay

# Partial dependence of the fitted forest on the first two input columns,
# drawn inside a single bounding axes.
features = [0, 1]  # column indices of the first two features
pdp_line_style = {'color': '#ff7043', 'linewidth': 2.5}

fig, bounding_ax = plt.subplots(figsize=(10, 6))
PartialDependenceDisplay.from_estimator(
    rf, X, features,
    grid_resolution=50,
    ax=bounding_ax,
    line_kw=pdp_line_style,
)
plt.title('Partial Dependence Plot', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted forest (10 shuffles per feature),
# keeping the ten features with the largest mean importance.
perm_importance = permutation_importance(rf, X, y, n_repeats=10, random_state=42)
perm_table = pd.DataFrame({
    'feature': data.feature_names,
    'importance': perm_importance.importances_mean,
})
perm_df = perm_table.sort_values('importance', ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=perm_df, x='importance', y='feature',
    palette='rocket_r', orient='h',
    ax=ax,
)
ax.set_title('Permutation Feature Importance (Top 10)', fontsize=16, pad=20)
ax.set_xlabel('Mean Importance Decrease', labelpad=10)
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Simulated SHAP-style summary: random normal values scaled per column by the
# first four feature importances of the fitted forest. These are NOT real
# SHAP values.
np.random.seed(42)
shap_values = np.random.randn(100, 4) * np.array(rf.feature_importances_[:4])

# One symmetric colour scale shared by all four columns. Without explicit
# vmin/vmax each scatter call normalises its own colours, so equal colours
# would mean different values in different columns and the colourbar would
# describe only the last column drawn.
color_limit = np.abs(shap_values).max()

fig, ax = plt.subplots(figsize=(10, 6))
for i in range(4):
    points = ax.scatter([i] * 100, shap_values[:, i],
                        c=shap_values[:, i],
                        cmap='coolwarm',
                        vmin=-color_limit,
                        vmax=color_limit,
                        alpha=0.7,
                        s=30,
                        edgecolor='#333333',
                        linewidth=0.3)
ax.axhline(0, color='#e0e0e0', linestyle='--', alpha=0.5)
ax.set_xticks(range(4))
ax.set_xticklabels([f'Feature_{i}' for i in range(4)], rotation=30)
ax.set_ylabel('Impact on Prediction', labelpad=10)
ax.set_title('Simulated SHAP Summary Plot', fontsize=16, pad=20)
# Every scatter shares the same norm, so any of them is a valid mappable.
fig.colorbar(points, ax=ax, label='SHAP Value')
fig.tight_layout()
plt.show()

In [None]:
from scipy import stats

# Q-Q plot of the regression target against a normal distribution.
fig, ax = plt.subplots(figsize=(8, 8))
stats.probplot(reg_df['Target'], dist="norm", plot=ax)
ax.set_title('Q-Q Plot: Target vs Normal Distribution', fontsize=16, pad=20)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Cumulative gain: rank samples from highest to lowest predicted score and
# track the share of all positives captured within the top fraction.
df_gain = pd.DataFrame({'y_true': y, 'y_score': y_scores})
df_gain = df_gain.sort_values('y_score', ascending=False).reset_index(drop=True)
df_gain['cumulative_positive'] = df_gain['y_true'].cumsum()
df_gain['percentile'] = np.arange(1, len(df_gain) + 1) / len(df_gain)

# The last cumulative count equals the total number of positives.
captured_share = df_gain['cumulative_positive'] / df_gain['cumulative_positive'].max()

plt.figure(figsize=(10, 6))
plt.plot(df_gain['percentile'], captured_share,
         color='#4db6ac', linewidth=2.5, label='Model')
# Random-ranking baseline in a light colour. The previous 'k--' (black dashed)
# line was practically invisible against the dark axes background.
plt.plot([0, 1], [0, 1], linestyle='--', color='#e0e0e0', alpha=0.7, label='Random')
plt.xlabel('Percentage of Population', labelpad=10)
plt.ylabel('Cumulative % of Positive Cases', labelpad=10)
plt.title('Cumulative Gain Chart', fontsize=16, pad=20)
plt.legend(facecolor='#2d2d2d')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Lift = positive rate among the top-k ranked samples divided by the overall
# positive rate.
#
# k must be the number of samples seen so far (1, 2, ..., n). Dividing by the
# 0-based index instead divides by zero on the first row and overstates the
# rate on every later row.
samples_seen = np.arange(1, len(df_gain) + 1)
df_gain['lift'] = (df_gain['cumulative_positive'] / samples_seen) / (df_gain['y_true'].mean())

plt.figure(figsize=(10, 6))
plt.plot(df_gain['percentile'], df_gain['lift'], color='#ff7043', linewidth=2.5)
plt.xlabel('Percentage of Population', labelpad=10)
plt.ylabel('Lift', labelpad=10)
plt.title('Lift Chart', fontsize=16, pad=20)
# Lift of 1 = no better than random ranking
plt.axhline(y=1, color='#e0e0e0', linestyle='--', alpha=0.7)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of 10-fold cross-validation accuracy for the random forest,
# with the mean score marked by a dashed line.
# (This cell previously appeared twice, verbatim; the copy reran the same
# 10-fold CV and redrew the same figure, so the two are merged into one.)
cv_scores = cross_val_score(rf, X, y, cv=10)
plt.figure(figsize=(10, 6))
sns.histplot(cv_scores, kde=True, bins=8, color='#64b5f6', alpha=0.7, edgecolor='#333333')
plt.axvline(cv_scores.mean(), color='#ff7043', linestyle='--', linewidth=2, label=f'Mean: {cv_scores.mean():.3f}')
plt.xlabel('CV Accuracy Score', labelpad=10)
plt.ylabel('Frequency', labelpad=10)
plt.title('Cross-Validation Score Distribution', fontsize=16, pad=20)
plt.legend(facecolor='#2d2d2d')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette analysis of a 3-cluster KMeans fit on the iris measurements.
X_clust = iris_df.drop(columns='Species')
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X_clust)

silhouette_avg = silhouette_score(X_clust, cluster_labels)
sample_silhouette_values = silhouette_samples(X_clust, cluster_labels)

fig, ax = plt.subplots(figsize=(10, 7))
cluster_palette = sns.color_palette("viridis", 3)

# Stack one horizontal band per cluster, leaving a gap of 10 rows between
# bands. Within a band the samples are sorted by silhouette value.
y_lower = 10
for i, color in enumerate(cluster_palette):
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    ax.fill_betweenx(
        np.arange(y_lower, y_upper),
        0, ith_cluster_silhouette_values,
        facecolor=color, edgecolor=color, alpha=0.7,
    )
    # Cluster number, placed left of zero at the band's vertical middle
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 10

ax.axvline(x=silhouette_avg, color="#ff7043", linestyle="--", label=f'Avg Silhouette: {silhouette_avg:.3f}')
ax.set_xlabel('Silhouette Coefficient Values', labelpad=10)
ax.set_ylabel('Cluster Label', labelpad=10)
ax.set_title('Silhouette Analysis for K=3', fontsize=16, pad=20)
ax.legend(facecolor='#2d2d2d')
fig.tight_layout()
plt.show()

In [None]:
# Manhattan-style plot of simulated (random) p-values, one per breast-cancer
# feature, each assigned to a random pseudo-chromosome group.
np.random.seed(42)
n_features = len(data.feature_names)
pvals = 10 ** (-np.random.uniform(0, 5, n_features))
neg_log_pvals = -np.log10(pvals)
manhattan_df = pd.DataFrame({
    'feature': data.feature_names,
    'pval': neg_log_pvals,          # note: this column stores -log10(p)
    'chrom': np.random.choice(['Chr1', 'Chr2', 'Chr3'], len(pvals)),
})

fig, ax = plt.subplots(figsize=(14, 6))
colors = ['#64b5f6', '#ff7043', '#4db6ac']
for i, chrom in enumerate(manhattan_df['chrom'].unique()):
    idx = manhattan_df['chrom'] == chrom
    row_positions = np.where(idx)[0]   # x = row position of each feature
    ax.scatter(row_positions, manhattan_df.loc[idx, 'pval'],
               color=colors[i], label=chrom, alpha=0.8, s=40)

ax.axhline(-np.log10(0.05), color='#ff5252', linestyle='--', label='Significance Threshold (0.05)')
ax.set_xlabel('Features', labelpad=10)
ax.set_ylabel('-log10(p-value)', labelpad=10)
ax.set_title('Manhattan Plot (Simulated Feature Significance)', fontsize=16, pad=20)
ax.legend(facecolor='#2d2d2d')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Waterfall chart of a simulated prediction: start from a base value and add
# one random contribution per feature.
base = 0.5
contributions = np.random.randn(5) * 0.1
features = [f'Feature_{i}' for i in range(5)]
# values[0] is the base; values[i] is the running total after feature i-1.
values = [base] + list(np.cumsum(contributions) + base)
changes = [0] + list(contributions)

plt.figure(figsize=(12, 7))
# Blue for positive contributions, orange for non-positive ones
colors = ['#64b5f6' if x > 0 else '#ff7043' for x in changes]

# Bar i-1 starts at the running total before feature i-1 and spans its change.
for i in range(1, len(values)):
    plt.bar(i-1, changes[i], bottom=values[i-1],
            color=colors[i], edgecolor='#333333', width=0.6)
    plt.text(i-1, values[i-1] + changes[i]/2, f'{changes[i]:+.2f}',
             ha='center', va='center', color='white', fontweight='bold')

# Running total after each feature, aligned with that feature's bar. Plotting
# all of `values` (base included) against 0..5 would shift the line one
# position to the right of the bars and run past the last tick.
plt.plot(range(len(features)), values[1:], 'o-', color='#e0e0e0', markersize=8)
plt.axhline(base, color='#aaaaaa', linestyle='--', alpha=0.5)
plt.xticks(range(len(features)), features, rotation=45)
plt.ylabel('Prediction Value', labelpad=10)
plt.title('Waterfall Chart: Prediction Breakdown', fontsize=16, pad=20)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()