In [8]:
# Install the `kneed` package; the imports cell below pulls KneeLocator from it.
# NOTE(review): the version is unpinned and `!pip` runs in a shell that may not be the
# kernel's environment -- consider `%pip install kneed==<version>` instead; confirm.
!pip install kneed
# NOTE(review): this prints after the install command returns; nothing is imported here.
print('Successfully Imported')

Defaulting to user installation because normal site-packages is not writeable
Collecting kneed
  Downloading kneed-0.8.5-py3-none-any.whl.metadata (5.5 kB)
Downloading kneed-0.8.5-py3-none-any.whl (10 kB)
Installing collected packages: kneed
Successfully installed kneed-0.8.5
Successfully Imported


In [31]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from kneed import KneeLocator
from matplotlib.patches import Patch
from matplotlib.ticker import MaxNLocator
from wordcloud import WordCloud

In [4]:
#Made with ChatGPT
def plot_two_countplots(
    df1,
    df2,
    col="Rating",
    labels=("Test Classes", "Nontest Classes"),
    figsize=(12, 5)
):
    """
    Draw two seaborn countplots of the same column next to each other.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Data for the left and the right panel, respectively.
    col : str, default "Rating"
        Column whose values are counted along the x-axis of both panels.
    labels : tuple of str, default ("Test Classes", "Nontest Classes")
        Panel titles: ``labels[0]`` for the left plot, ``labels[1]`` for the right.
    figsize : tuple, default (12, 5)
        Size of the overall figure (width, height).

    Notes
    -----
    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    _, panels = plt.subplots(1, 2, figsize=figsize)

    # Both panels follow the same recipe: position 0 is the left plot, 1 the right.
    for position, frame in enumerate((df1, df2)):
        panel = panels[position]
        sns.countplot(x=col, data=frame, ax=panel)
        panel.set_title(labels[position])
        panel.set_xlabel(col)
        panel.set_ylabel("Count")

    plt.tight_layout()
    plt.show()


In [13]:
def plot_scree(evr_source, title="Scree Plot", figsize=(8, 5)):
    """
    Plot a scree plot with cumulative variance and automatic elbow detection using kneed.

    Parameters
    ----------
    evr_source : fitted decomposition object or array-like
        - Object exposing ``explained_variance_ratio_`` (e.g. a fitted
          sklearn.decomposition.PCA): that attribute is used directly.
        - 1D array-like: treated as the explained-variance ratios themselves.
        - 2D array-like: treated as PC scores (n_samples x n_components);
          the ratios are computed as var(scores, axis=0) / sum(var).
    title : str
        Plot title.
    figsize : tuple
        Figure size (inches).

    Returns
    -------
    elbow_pc : int
        1-based index of the detected elbow component.
    elbow_cum_var : float
        Cumulative explained-variance ratio at that elbow.

    Raises
    ------
    ValueError
        If the input is neither of the supported kinds, contains no
        components, or is a 2D score matrix with no variance at all.
    RuntimeError
        If KneeLocator does not report an elbow.
    """
    # 1) Extract or compute the explained-variance ratios.
    # Duck-type on the fitted attribute instead of isinstance(..., PCA): the
    # function then needs no PCA class in scope and accepts any decomposition
    # object that publishes the same attribute.
    fitted_evr = getattr(evr_source, "explained_variance_ratio_", None)
    if fitted_evr is not None:
        evr = np.asarray(fitted_evr, dtype=float)
    else:
        arr = np.asarray(evr_source)
        if arr.ndim == 1:
            evr = arr.astype(float)
        elif arr.ndim == 2:
            var = np.var(arr, axis=0, ddof=0)
            total_var = np.sum(var)
            # `not > 0` also rejects NaN (e.g. a score matrix with zero rows).
            if not total_var > 0:
                raise ValueError(
                    "2D PC-scores array has zero total variance; cannot compute ratios"
                )
            evr = var / total_var
        else:
            raise ValueError(
                "evr_source must be PCA, 1D explained-variance array, or 2D PC-scores array"
            )

    if evr.size == 0:
        raise ValueError("evr_source contains no components")

    n = len(evr)
    x = np.arange(1, n + 1)  # 1-based component numbers

    # 2) Elbow detection using kneed
    kneedle = KneeLocator(x, evr, curve="convex", direction="decreasing")
    if kneedle.knee is None:
        raise RuntimeError("KneeLocator failed to detect an elbow")
    # The knee is one of the x values (a numpy scalar); return plain Python
    # types as documented.
    elbow_pc = int(kneedle.knee)

    # 3) Compute cumulative variance
    cum_evr = np.cumsum(evr)
    elbow_cum_var = float(cum_evr[elbow_pc - 1])

    # 4) Plot
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(x, evr, marker='o', linestyle='-', label="Explained Variance")
    ax.plot(x, cum_evr, marker='s', linestyle='--', label="Cumulative Variance")
    ax.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=20))
    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Ratio")
    ax.set_title(title)
    ax.grid(alpha=0.3)
    ax.set_ylim(0, 1.05)

    # 5) Annotate the elbow on the cumulative curve; the label goes above the
    # point when it sits in the lower half of the plot and below it otherwise.
    offset = (0, 20) if elbow_cum_var < 0.5 else (0, -40)
    ax.annotate(
        f"Elbow: PC {elbow_pc}\n{elbow_cum_var:.2%} cumulative",
        xy=(elbow_pc, elbow_cum_var),
        xytext=offset,
        textcoords='offset points',
        ha='center',
        va='bottom' if offset[1] > 0 else 'top',
        arrowprops=dict(arrowstyle='->', lw=1)
    )

    # 6) Final touches
    ax.legend(loc='best')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return elbow_pc, elbow_cum_var


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def biplot(orig, pcs, feature_names=None, pc_indices=(0, 1), top_arrows=10,
           arrow_scale=None, figsize=(8, 6), title=None, predictions=None):
    """
    Create a PCA biplot showing samples in PC space and arrows for original features.

    Parameters
    ----------
    orig : array-like or DataFrame, shape (n_samples, n_features)
        The original features (e.g., token counts or standardized variables).
    pcs : array-like or DataFrame, shape (n_samples, n_components)
        The principal component scores for each sample.
    feature_names : list of str, length n_features, optional
        Names of the original features. If None and orig is a DataFrame, uses
        orig.columns; otherwise names are generated as "Var0", "Var1", ...
    pc_indices : tuple(int, int), default (0, 1)
        Indices of the two principal components to plot (0-based).
    top_arrows : int, default 10
        Number of features (arrows) to display by descending loading magnitude.
        Clamped to the range [0, n_features]; 0 draws no arrows.
    arrow_scale : float or None, default None
        Scaling factor for loadings arrows. If None, the longest arrow is
        scaled to 80% of the larger of the two score ranges.
    figsize : tuple, default (8, 6)
        Figure size.
    title : str, optional
        Title of the plot.
    predictions : array-like, shape (n_samples,), optional
        Class labels or continuous values for coloring points.

    Returns
    -------
    fig, ax : matplotlib Figure and Axes
        The figure and axes objects containing the biplot.
    """
    # Convert inputs to numpy arrays
    X = orig.values if hasattr(orig, 'values') else np.asarray(orig)
    Z = pcs.values if hasattr(pcs, 'values') else np.asarray(pcs)

    # Select PCs
    pc_x, pc_y = pc_indices
    scores = Z[:, [pc_x, pc_y]]

    # Feature names
    if feature_names is None:
        if hasattr(orig, 'columns'):
            feature_names = list(orig.columns)
        else:
            feature_names = [f"Var{i}" for i in range(X.shape[1])]

    # Center original features
    Xc = X - np.mean(X, axis=0)

    # Compute loadings: covariance between features and PC scores
    loadings = (Xc.T @ scores) / (Xc.shape[0] - 1)

    # Compute magnitude of loadings for the two PCs
    magnitude = np.sqrt(loadings[:, 0]**2 + loadings[:, 1]**2)

    # Select top features by magnitude (ascending order, strongest last).
    # Slicing from an explicit start index avoids the `[-0:]` pitfall, where
    # asking for zero arrows would select every feature.
    n_features = magnitude.shape[0]
    n_arrows = max(0, min(int(top_arrows), n_features))
    top_idx = np.argsort(magnitude)[n_features - n_arrows:]

    # The score span sizes the arrow heads, so it is needed whether or not
    # the caller supplies an explicit arrow_scale.
    score_span = np.max(np.max(scores, axis=0) - np.min(scores, axis=0))
    head_size = 0.02 * score_span

    # Auto-scale arrows
    if arrow_scale is None:
        max_magnitude = np.max(magnitude)
        # All-zero loadings leave nothing to scale; avoid dividing by zero.
        scale_factor = 0.8 * score_span / max_magnitude if max_magnitude > 0 else 1.0
    else:
        scale_factor = arrow_scale

    # Create plot
    fig, ax = plt.subplots(figsize=figsize)

    # Plot points with seaborn for hue support
    if predictions is not None:
        sns.scatterplot(x=scores[:, 0], y=scores[:, 1], hue=predictions,
                        palette='deep', ax=ax, alpha=0.7, legend='full')
    else:
        ax.scatter(scores[:, 0], scores[:, 1], alpha=0.7)

    # Draw arrows for top features, each labelled slightly beyond its tip
    for i in top_idx:
        tip_x = loadings[i, 0] * scale_factor
        tip_y = loadings[i, 1] * scale_factor
        ax.arrow(0, 0,
                 tip_x,
                 tip_y,
                 head_width=head_size,
                 head_length=head_size,
                 length_includes_head=True, color='r')
        ax.text(tip_x * 1.05,
                tip_y * 1.05,
                feature_names[i], color='r', ha='center', va='center')

    ax.set_xlabel(f"PC{pc_x + 1}")
    ax.set_ylabel(f"PC{pc_y + 1}")
    if title:
        ax.set_title(title)

    ax.axhline(0, color='grey', linewidth=0.8)
    ax.axvline(0, color='grey', linewidth=0.8)
    plt.tight_layout()

    return fig, ax


In [None]:
from matplotlib import colors as mcolors

def wordcloud_pc(orig, pcs, feature_names=None, pc_index=0,
                 max_words=100, width=800, height=400,
                 scale=1, colormap_positive='Reds', colormap_negative='Blues',
                 background_color='white', title=None):
    """
    Generate and display a wordcloud for a specified principal component.

    Word sizes are proportional to the magnitude of each feature's loading on the PC,
    and colors indicate the loading direction (sign): 'darkgreen' for positive
    loadings, 'darkred' for zero or negative ones.

    Parameters
    ----------
    orig : array-like or DataFrame, shape (n_samples, n_features)
        The original features.
    pcs : array-like or DataFrame, shape (n_samples, n_components)
        The principal component scores for each sample.
    feature_names : sequence of str, optional
        Names of the original features (any sequence: list, tuple, numpy array,
        pandas Index). If None and orig is a DataFrame, uses orig.columns;
        otherwise names are generated as "Var0", "Var1", ...
    pc_index : int, default 0
        0-based index of the component to visualise.
    max_words : int, default 100
        Maximum number of words kept, by descending loading magnitude.
    width, height : int
        Canvas size passed to WordCloud; the figure is sized width/100 x height/100.
    scale : float, default 1
        Passed through to WordCloud.
    colormap_positive, colormap_negative : str
        Currently unused; accepted for backward compatibility. Colors are the
        two fixed names described above.
    background_color : str, default 'white'
        Passed through to WordCloud.
    title : str, optional
        Title of the plot.

    Returns
    -------
    wc, fig, ax
        The WordCloud object plus the matplotlib Figure and Axes showing it.
    """
    # Convert to numpy
    X = orig.values if hasattr(orig, 'values') else np.asarray(orig)
    Z = pcs.values if hasattr(pcs, 'values') else np.asarray(pcs)

    # Features
    if feature_names is None:
        if hasattr(orig, 'columns'):
            feature_names = list(orig.columns)
        else:
            feature_names = [f"Var{i}" for i in range(X.shape[1])]
    else:
        # Normalise to a list so arrays / Index objects behave like lists below.
        feature_names = list(feature_names)

    # Center original data
    Xc = X - np.mean(X, axis=0)

    # Compute loadings for the component
    scores_pc = Z[:, pc_index]
    loadings = (Xc.T @ scores_pc) / (Xc.shape[0] - 1)

    # Signed loading per word. One dict serves both the sizing and the
    # coloring, so the two can never disagree and the color lookup is a
    # constant-time access instead of a list scan per word.
    loading_by_word = {feature_names[i]: float(value)
                       for i, value in enumerate(loadings)}

    # Build frequency dictionary (magnitude -> word size), limited to top N
    ranked = sorted(loading_by_word.items(), key=lambda item: abs(item[1]), reverse=True)
    top = {word: abs(value) for word, value in ranked[:max_words]}

    # Color function based on sign
    def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        return 'darkred' if loading_by_word[word] <= 0 else 'darkgreen'

    # max_words is forwarded explicitly: WordCloud applies its own word limit,
    # which would otherwise silently cap larger requests.
    wc = WordCloud(width=width, height=height, scale=scale,
                   background_color=background_color,
                   prefer_horizontal=1.0,
                   max_words=max_words,
                   color_func=color_func)
    wc.generate_from_frequencies(top)

    # Plot
    fig, ax = plt.subplots(figsize=(width/100, height/100))
    ax.imshow(wc, interpolation='bilinear')
    ax.axis('off')
    if title:
        ax.set_title(title)
    plt.tight_layout()

    return wc, fig, ax


In [None]:
def plot_pc_loadings_final(orig, pcs, feature_names=None, pc_index=0,
                          top_n=20, horizontal=True,
                          pos_color='#2ca02c', neg_color='#d62728',
                          title=None, figsize=(10, 8)):
    """
    Bar chart of the strongest feature loadings on one principal component.

    - One solid color per bar (pos_color / neg_color by loading sign)
    - Length = absolute loading magnitude
    - Each bar is annotated with its signed loading value

    Parameters
    ----------
    orig : array-like or DataFrame, shape (n_samples, n_features)
        The original features.
    pcs : array-like or DataFrame, shape (n_samples, n_components)
        The principal component scores for each sample.
    feature_names : list of str, optional
        Names of the original features. If None and orig is a DataFrame, uses
        orig.columns; otherwise names are generated as "Feature 0", ...
    pc_index : int, default 0
        0-based index of the component whose loadings are plotted.
    top_n : int, default 20
        Number of features shown, by descending absolute loading. Clamped to
        the range [0, n_features].
    horizontal : bool, default True
        Horizontal bars with the strongest feature on top if True, vertical
        bars otherwise.
    pos_color, neg_color : str
        Bar colors for non-negative and negative loadings.
    title : str, optional
        Plot title. Defaults to "PC<k> Loadings (Top <number of bars shown>)".
    figsize : tuple, default (10, 8)
        Figure size.

    Returns
    -------
    fig, ax : matplotlib Figure and Axes
    """
    # Convert to arrays
    X = np.asarray(orig)
    Z = np.asarray(pcs)

    # Get feature names
    if feature_names is None:
        if hasattr(orig, 'columns'):
            feature_names = orig.columns.tolist()
        else:
            feature_names = [f"Feature {i}" for i in range(X.shape[1])]

    # Compute loadings (covariance method)
    X_centered = X - np.mean(X, axis=0)
    loadings = (X_centered.T @ Z[:, pc_index]) / (X.shape[0] - 1)

    # Sort features by absolute loading magnitude; clamp the requested count
    # so a negative top_n cannot turn into a "drop the last k" slice.
    n_shown = max(0, min(int(top_n), len(loadings)))
    sorted_indices = np.argsort(np.abs(loadings))[::-1][:n_shown]
    sorted_names = [feature_names[i] for i in sorted_indices]
    sorted_loadings = loadings[sorted_indices]
    abs_loadings = np.abs(sorted_loadings)

    # Shared by both orientations
    positions = np.arange(len(sorted_names))
    colors = [pos_color if val >= 0 else neg_color for val in sorted_loadings]

    # Create plot
    fig, ax = plt.subplots(figsize=figsize)

    if horizontal:
        ax.barh(positions, abs_loadings, color=colors, height=0.8)

        # Annotate the signed value just past the end of each bar
        for i, (val, abs_val) in enumerate(zip(sorted_loadings, abs_loadings)):
            ax.text(abs_val, i, f" {val:.3f}",
                    va='center', ha='left', fontsize=9)

        ax.set_yticks(positions)
        ax.set_yticklabels(sorted_names)
        ax.invert_yaxis()  # Highest magnitude at top
        ax.set_xlabel("Loading Magnitude (Absolute Value)")
    else:
        ax.bar(positions, abs_loadings, color=colors, width=0.8)

        # Annotate the signed value on top of each bar
        for i, (val, abs_val) in enumerate(zip(sorted_loadings, abs_loadings)):
            ax.text(i, abs_val, f"{val:.3f}",
                    ha='center', va='bottom', fontsize=9, rotation=90)

        ax.set_xticks(positions)
        ax.set_xticklabels(sorted_names, rotation=90)
        ax.set_ylabel("Loading Magnitude (Absolute Value)")

    # Report the number of bars actually drawn, which can be smaller than top_n.
    ax.set_title(title or f"PC{pc_index+1} Loadings (Top {len(sorted_names)})")

    # Legend: proxy patches, because bar length no longer carries the sign
    legend_elements = [
        Patch(facecolor=pos_color, label="Positive Loading"),
        Patch(facecolor=neg_color, label="Negative Loading")
    ]
    ax.legend(handles=legend_elements, loc='lower right')

    plt.tight_layout()
    return fig, ax