In [2]:
%run '/home/christianl/Zhang-Lab/Zhang Lab Code/Boilerplate/Fig_config_utilities.py'

<class 'numpy.ndarray'> (3187, 16101)
<class 'numpy.ndarray'> (3187, 16101)


In [4]:
"""
Heavy-Tailed Residual Analysis for Gene Expression Prediction

This script identifies catastrophically wrong predictions (heavy tails) in gene expression
models and investigates whether certain gene properties (variance, mean expression) 
correlate with prediction errors.

Author: Bioinformatics Analysis
Purpose: Diagnose heavy-tailed residuals in deep learning gene expression models
"""

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [7]:
# =============================================================================
# MAIN ANALYSIS CLASS
# =============================================================================

class HeavyTailAnalyzer:
    """
    Analyzes heavy-tailed residuals in gene expression predictions.
    
    Tests the hypothesis: "Most predictions are good, but a few genes have 
    catastrophically wrong predictions, creating heavy tails in the residual 
    distribution."
    """
    
    def __init__(self, residuals, gene_ids, expression_matrix, model_name="Model"):
        """
        Initialize the analyzer with residuals and gene information.
        
        Parameters
        ----------
        residuals : array-like, shape (n_predictions,)
            Pre-calculated residuals: y_true - y_pred, flattened to 1D
            
        gene_ids : array-like, shape (n_predictions,)
            Gene identifier for each prediction (same length as residuals)
            Example: ['GENE1', 'GENE1', 'GENE1', 'GENE2', 'GENE2', ...]
            
        expression_matrix : pd.DataFrame, shape (n_genes, n_samples)
            Expression values with genes as rows and samples as columns
            Used to calculate gene-level statistics (mean, variance, CV)
            
        model_name : str
            Name of the model being analyzed (for reporting)
        
        Example
        -------
        >>> residuals = y_true.ravel() - y_pred.ravel()
        >>> gene_ids = np.repeat(gene_names, n_samples)
        >>> analyzer = HeavyTailAnalyzer(residuals, gene_ids, expr_matrix, "KG-RNN")
        """
        self.residuals = np.array(residuals).ravel()
        self.gene_ids = np.array(gene_ids).ravel()
        self.model_name = model_name
        
        # Validate inputs
        if len(self.residuals) != len(self.gene_ids):
            raise ValueError(
                f"Residuals ({len(self.residuals)}) and gene_ids ({len(self.gene_ids)}) "
                "must have the same length"
            )
        
        # Store basic info
        self.n_predictions = len(self.residuals)
        self.n_unique_genes = len(np.unique(self.gene_ids))
        
        print(f"\n{'='*70}")
        print(f"INITIALIZING ANALYZER: {self.model_name}")
        print(f"{'='*70}")
        print(f"Total predictions: {self.n_predictions:,}")
        print(f"Unique genes: {self.n_unique_genes:,}")
        print(f"Mean residual: {np.mean(self.residuals):.6f}")
        print(f"Std residual: {np.std(self.residuals):.4f}")
        print(f"Kurtosis: {stats.kurtosis(self.residuals):.2f} "
              f"({'heavy tails' if stats.kurtosis(self.residuals) > 3 else 'normal'})")
        
        # Create the main dataframe
        self._build_dataframe(expression_matrix)
    
    
    def _build_dataframe(self, expression_matrix):
        """Build the internal dataframe with residuals and gene properties."""
        # Start with residuals
        self.data = pd.DataFrame({
            'gene_id': self.gene_ids,
            'residual': self.residuals,
            'abs_residual': np.abs(self.residuals)
        })
        
        # Add gene-level statistics from expression matrix
        gene_stats = self._calculate_gene_statistics(expression_matrix)
        self.data = self.data.merge(gene_stats, on='gene_id', how='left')
        
        # Create gene-level aggregates
        self.gene_summary = self._aggregate_by_gene()
    
    
    def _calculate_gene_statistics(self, expression_matrix):
        """
        Calculate mean, variance, and coefficient of variation for each gene.
        
        Returns DataFrame with columns: gene_id, mean_expr, variance, cv
        """
        print(f"\nCalculating gene-level statistics...")
        
        gene_stats = pd.DataFrame({
            'gene_id': expression_matrix.index,
            'mean_expr': expression_matrix.mean(axis=1),
            'variance': expression_matrix.var(axis=1),
            'cv': expression_matrix.std(axis=1) / (expression_matrix.mean(axis=1) + 1e-10)
        })
        
        print(f"  Mean expression range: [{gene_stats['mean_expr'].min():.2f}, "
              f"{gene_stats['mean_expr'].max():.2f}]")
        print(f"  Variance range: [{gene_stats['variance'].min():.2f}, "
              f"{gene_stats['variance'].max():.2f}]")
        
        return gene_stats
    
    
    def _aggregate_by_gene(self):
        """Aggregate residuals at the gene level."""
        gene_agg = self.data.groupby('gene_id').agg({
            'residual': ['mean', 'std'],
            'abs_residual': 'mean',
            'mean_expr': 'first',
            'variance': 'first',
            'cv': 'first'
        })
        
        # Flatten column names
        gene_agg.columns = ['_'.join(col).strip('_') for col in gene_agg.columns]
        gene_agg = gene_agg.reset_index()
        
        # Count predictions per gene
        counts = self.data.groupby('gene_id').size().reset_index(name='n_predictions')
        gene_agg = gene_agg.merge(counts, on='gene_id')
        
        return gene_agg
    
    
    # =========================================================================
    # TAIL IDENTIFICATION
    # =========================================================================
    
    def identify_heavy_tails(self, threshold_sigma=2.5):
        """
        Identify residuals in the heavy tails of the distribution.
        
        Definition: Tail residuals are those beyond ±threshold_sigma standard 
        deviations from the mean.
        
        Parameters
        ----------
        threshold_sigma : float, default=2.5
            Number of standard deviations to define tail boundary
            Common values: 2.0 (95%), 2.5 (98.8%), 3.0 (99.7%)
        
        Returns
        -------
        pd.DataFrame
            Subset of predictions in the tails, sorted by absolute residual
        """
        print(f"\n{'='*70}")
        print(f"IDENTIFYING HEAVY TAILS (±{threshold_sigma}σ)")
        print(f"{'='*70}")
        
        # Calculate tail boundaries
        mean_residual = np.mean(self.residuals)
        std_residual = np.std(self.residuals)
        lower_bound = mean_residual - threshold_sigma * std_residual
        upper_bound = mean_residual + threshold_sigma * std_residual
        
        print(f"Mean residual: {mean_residual:.6f}")
        print(f"Std residual: {std_residual:.4f}")
        print(f"Lower tail boundary: {lower_bound:.4f}")
        print(f"Upper tail boundary: {upper_bound:.4f}")
        
        # Flag tail residuals
        self.data['is_tail'] = (
            (self.data['residual'] < lower_bound) | 
            (self.data['residual'] > upper_bound)
        )
        
        # Extract tail data
        tail_data = self.data[self.data['is_tail']].copy()
        tail_data = tail_data.sort_values('abs_residual', ascending=False)
        
        # Report statistics
        n_tail = len(tail_data)
        pct_tail = 100 * n_tail / self.n_predictions
        n_genes_in_tail = tail_data['gene_id'].nunique()
        
        print(f"\nRESULTS:")
        print(f"  Tail predictions: {n_tail:,} ({pct_tail:.2f}% of total)")
        print(f"  Body predictions: {self.n_predictions - n_tail:,} "
              f"({100 - pct_tail:.2f}% of total)")
        print(f"  Unique genes affected: {n_genes_in_tail} "
              f"({100*n_genes_in_tail/self.n_unique_genes:.1f}% of genes)")
        
        # Show worst predictions
        print(f"\n  Top 10 worst predictions:")
        worst_10 = tail_data.nlargest(10, 'abs_residual')
        for idx, row in worst_10.iterrows():
            print(f"    {row['gene_id']}: residual = {row['residual']:.4f}, "
                  f"variance = {row['variance']:.2f}")
        
        self.tail_data = tail_data
        self.threshold_sigma = threshold_sigma
        
        return tail_data
    
    
    def compare_tail_vs_body(self):
        """
        Compare gene properties between tail and body residuals.
        
        Tests whether genes with catastrophic predictions (tails) have 
        different characteristics than genes with good predictions (body).
        
        Uses Mann-Whitney U test (non-parametric) to compare:
        - Mean expression level
        - Expression variance
        - Coefficient of variation
        
        Returns
        -------
        dict
            Dictionary with test results and summary statistics
        """
        if 'is_tail' not in self.data.columns:
            raise ValueError("Must run identify_heavy_tails() first")
        
        print(f"\n{'='*70}")
        print(f"COMPARING TAIL vs BODY PROPERTIES")
        print(f"{'='*70}")
        
        # Split data
        tail = self.data[self.data['is_tail']]
        body = self.data[~self.data['is_tail']]
        
        results = {}
        
        # --- Compare residual magnitudes ---
        print(f"\n1. RESIDUAL MAGNITUDES")
        print(f"   {'Tail':8s} | {'Body':8s} | Difference")
        print(f"   {'-'*8} | {'-'*8} | {'-'*10}")
        
        tail_mean_abs = tail['abs_residual'].mean()
        body_mean_abs = body['abs_residual'].mean()
        print(f"   {tail_mean_abs:8.4f} | {body_mean_abs:8.4f} | "
              f"{tail_mean_abs/body_mean_abs:.2f}x larger")
        
        results['abs_residual_tail'] = tail_mean_abs
        results['abs_residual_body'] = body_mean_abs
        
        # --- Compare gene properties ---
        properties = [
            ('mean_expr', 'Mean Expression'),
            ('variance', 'Variance'),
            ('cv', 'Coefficient of Variation')
        ]
        
        for col, label in properties:
            print(f"\n2. {label.upper()}")
            print(f"   {'Metric':20s} | {'Tail':10s} | {'Body':10s}")
            print(f"   {'-'*20} | {'-'*10} | {'-'*10}")
            
            # Summary statistics
            tail_mean = tail[col].mean()
            tail_median = tail[col].median()
            body_mean = body[col].mean()
            body_median = body[col].median()
            
            print(f"   {'Mean':20s} | {tail_mean:10.4f} | {body_mean:10.4f}")
            print(f"   {'Median':20s} | {tail_median:10.4f} | {body_median:10.4f}")
            
            # Statistical test
            tail_values = tail[col].dropna()
            body_values = body[col].dropna()
            
            if len(tail_values) > 0 and len(body_values) > 0:
                statistic, p_value = stats.mannwhitneyu(
                    tail_values, body_values, alternative='two-sided'
                )
                
                # Interpret p-value
                if p_value < 0.001:
                    sig = "***"
                    interp = "HIGHLY SIGNIFICANT"
                elif p_value < 0.01:
                    sig = "**"
                    interp = "SIGNIFICANT"
                elif p_value < 0.05:
                    sig = "*"
                    interp = "SIGNIFICANT"
                else:
                    sig = "ns"
                    interp = "NOT SIGNIFICANT"
                
                print(f"   {'Mann-Whitney U':20s} | p = {p_value:.4e} {sig}")
                print(f"   {'Interpretation':20s} | {interp}")
                
                results[f'{col}_p_value'] = p_value
                results[f'{col}_significant'] = p_value < 0.05
            else:
                print(f"   Insufficient data for statistical test")
        
        return results
    
    
    # =========================================================================
    # PERFORMANCE STRATIFICATION
    # =========================================================================
    
    def stratify_performance(self, y_true_by_gene, y_pred_by_gene, 
                            property='variance', n_bins=5):
        """
        Stratify model performance by gene property.
        
        This tests whether prediction accuracy correlates with gene characteristics.
        For example: "Do high-variance genes have worse R² scores?"
        
        Parameters
        ----------
        y_true_by_gene : dict
            Dictionary mapping gene_id -> array of true values
            Example: {'GENE1': array([5.2, 6.1, ...]), 'GENE2': array([...])}
            
        y_pred_by_gene : dict
            Dictionary mapping gene_id -> array of predicted values
            Same structure as y_true_by_gene
            
        property : str, default='variance'
            Gene property to bin by: 'variance', 'mean_expr', or 'cv'
            
        n_bins : int, default=5
            Number of quantile bins (e.g., 5 = quintiles)
        
        Returns
        -------
        pd.DataFrame
            Performance metrics for each bin
        """
        print(f"\n{'='*70}")
        print(f"STRATIFYING PERFORMANCE BY GENE {property.upper()}")
        print(f"{'='*70}")
        
        # Validate property
        valid_properties = ['variance', 'mean_expr', 'cv']
        if property not in valid_properties:
            raise ValueError(f"property must be one of {valid_properties}")
        
        # Calculate per-gene R² and metrics
        print(f"\nCalculating per-gene metrics...")
        gene_metrics = self._calculate_gene_metrics(y_true_by_gene, y_pred_by_gene)
        
        # Merge with gene summary
        analysis_df = self.gene_summary.merge(gene_metrics, on='gene_id', how='inner')
        
        # Create bins
        print(f"Creating {n_bins} quantile bins based on {property}...")
        try:
            analysis_df['bin'] = pd.qcut(
                analysis_df[property], 
                q=n_bins,
                labels=[f'Q{i+1}' for i in range(n_bins)],
                duplicates='drop'
            )
        except ValueError as e:
            print(f"  Warning: Could not create {n_bins} unique bins. Using fewer bins.")
            analysis_df['bin'] = pd.qcut(
                analysis_df[property], 
                q=n_bins,
                duplicates='drop'
            )
        
        # Aggregate by bin
        bin_summary = self._summarize_bins(analysis_df, property)
        
        # Test correlation
        self._test_correlation(analysis_df, property)
        
        # Store results
        self.stratification_results = {
            'property': property,
            'n_bins': n_bins,
            'bin_summary': bin_summary,
            'full_data': analysis_df
        }
        
        return bin_summary
    
    
    def _calculate_gene_metrics(self, y_true_by_gene, y_pred_by_gene):
        """Calculate R², MSE, and MAE for each gene."""
        metrics_list = []
        
        for gene_id in self.gene_summary['gene_id']:
            if gene_id not in y_true_by_gene or gene_id not in y_pred_by_gene:
                continue
            
            y_true = np.array(y_true_by_gene[gene_id])
            y_pred = np.array(y_pred_by_gene[gene_id])
            
            # Need at least 2 samples for R²
            if len(y_true) < 2:
                continue
            
            # Calculate metrics
            r2 = r2_score(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            mae = mean_absolute_error(y_true, y_pred)
            
            metrics_list.append({
                'gene_id': gene_id,
                'r2': r2,
                'mse': mse,
                'mae': mae
            })
        
        print(f"  Calculated metrics for {len(metrics_list)} genes")
        return pd.DataFrame(metrics_list)
    
    
    def _summarize_bins(self, df, property):
        """Summarize performance metrics by bin."""
        summary = df.groupby('bin').agg({
            'r2': ['mean', 'std', 'median', 'min', 'max'],
            'mse': ['mean', 'median'],
            'mae': ['mean', 'median'],
            property: ['mean', 'min', 'max'],
            'gene_id': 'count'
        }).reset_index()
        
        # Flatten column names
        summary.columns = ['_'.join(col).strip('_') for col in summary.columns]
        summary.rename(columns={'gene_id_count': 'n_genes'}, inplace=True)
        
        # Print formatted table
        print(f"\n{'='*70}")
        print(f"PERFORMANCE BY {property.upper()} QUANTILE")
        print(f"{'='*70}\n")
        
        print(f"{'Bin':6s} | {'N Genes':8s} | {property:12s} | "
              f"{'R² Mean':8s} | {'R² Median':10s}")
        print(f"{'-'*6} | {'-'*8} | {'-'*12} | {'-'*8} | {'-'*10}")
        
        for _, row in summary.iterrows():
            bin_name = row['bin']
            n_genes = int(row['n_genes'])
            prop_mean = row[f'{property}_mean']
            r2_mean = row['r2_mean']
            r2_median = row['r2_median']
            
            print(f"{bin_name:6s} | {n_genes:8,d} | {prop_mean:12.4f} | "
                  f"{r2_mean:8.4f} | {r2_median:10.4f}")
        
        return summary
    
    
    def _test_correlation(self, df, property):
        """Test correlation between gene property and R²."""
        print(f"\n{'='*70}")
        print(f"CORRELATION: {property.upper()} vs R²")
        print(f"{'='*70}")
        
        # Remove NaN values
        valid_df = df.dropna(subset=[property, 'r2'])
        
        if len(valid_df) < 3:
            print("Insufficient data for correlation test")
            return
        
        # Spearman (rank-based, robust to outliers)
        spearman_r, spearman_p = spearmanr(valid_df[property], valid_df['r2'])
        
        # Pearson (linear correlation)
        pearson_r, pearson_p = pearsonr(valid_df[property], valid_df['r2'])
        
        print(f"\nSpearman Correlation (rank-based):")
        print(f"  ρ = {spearman_r:.4f}")
        print(f"  p-value = {spearman_p:.4e}")
        print(f"  Interpretation: {self._interpret_correlation(spearman_r, spearman_p)}")
        
        print(f"\nPearson Correlation (linear):")
        print(f"  r = {pearson_r:.4f}")
        print(f"  p-value = {pearson_p:.4e}")
        print(f"  Interpretation: {self._interpret_correlation(pearson_r, pearson_p)}")
    
    
    @staticmethod
    def _interpret_correlation(r, p):
        """Interpret correlation coefficient and p-value."""
        # Significance
        if p < 0.001:
            sig = "highly significant"
        elif p < 0.01:
            sig = "significant"
        elif p < 0.05:
            sig = "marginally significant"
        else:
            sig = "not significant"
        
        # Strength and direction
        abs_r = abs(r)
        if abs_r > 0.7:
            strength = "strong"
        elif abs_r > 0.4:
            strength = "moderate"
        elif abs_r > 0.2:
            strength = "weak"
        else:
            strength = "very weak"
        
        direction = "positive" if r > 0 else "negative"
        
        return f"{strength} {direction} correlation, {sig}"
    
    
    # =========================================================================
    # EXPORT & REPORTING
    # =========================================================================
    
    def export_results(self, output_dir='.', prefix='analysis'):
        """
        Export analysis results to CSV files.
        
        Parameters
        ----------
        output_dir : str, default='.'
            Directory to save files
        prefix : str, default='analysis'
            Prefix for output filenames
        
        Creates Files
        -------------
        {prefix}_{model}_samples.csv : All predictions with tail flags
        {prefix}_{model}_genes.csv : Gene-level summary statistics
        {prefix}_{model}_worst50.csv : Top 50 worst predictions
        """
        import os
        os.makedirs(output_dir, exist_ok=True)
        
        model_safe = self.model_name.replace(' ', '_').replace('/', '-')
        
        # 1. Sample-level data
        sample_file = os.path.join(output_dir, f'{prefix}_{model_safe}_samples.csv')
        self.data.to_csv(sample_file, index=False)
        print(f"\n✓ Saved: {sample_file}")
        print(f"  ({len(self.data):,} predictions)")
        
        # 2. Gene-level summary
        gene_file = os.path.join(output_dir, f'{prefix}_{model_safe}_genes.csv')
        self.gene_summary.to_csv(gene_file, index=False)
        print(f"✓ Saved: {gene_file}")
        print(f"  ({len(self.gene_summary):,} genes)")
        
        # 3. Worst predictions
        worst_file = os.path.join(output_dir, f'{prefix}_{model_safe}_worst50.csv')
        worst_50 = self.data.nlargest(50, 'abs_residual')
        worst_50.to_csv(worst_file, index=False)
        print(f"✓ Saved: {worst_file}")
        print(f"  (Top 50 catastrophic predictions)")
        
        print(f"\nAll results exported to: {output_dir}/")

In [None]:
# =============================================================================
# USAGE EXAMPLES
# =============================================================================

def example_usage():
    """
    Demonstrates how to use the HeavyTailAnalyzer class.
    """
    print(__doc__)
    print("\n" + "="*70)
    print("EXAMPLE USAGE")
    print("="*70)
    
    print("""
    # -------------------------------------------------------------------------
    # STEP 1: Prepare your data
    # -------------------------------------------------------------------------
    
    # You should have:
    # - y_true: shape (n_samples, n_genes), e.g., (500, 1000)
    # - y_pred: shape (n_samples, n_genes)
    # - expr_matrix: shape (n_genes, n_samples), genes × samples
    # - gene_names: list of gene identifiers
    
    import numpy as np
    import pandas as pd
    from tail_analysis_clean import HeavyTailAnalyzer
    
    # Flatten predictions to 1D
    y_true_flat = y_true.ravel()  # (500 * 1000,) = (500000,)
    y_pred_flat = y_pred_rnn.ravel()
    
    # Calculate residuals
    residuals = y_true_flat - y_pred_flat
    
    # Create gene ID array (repeat each gene for all samples)
    gene_ids = np.repeat(gene_names, n_samples)  # (500000,)
    # Result: ['GENE1', 'GENE1', ..., 'GENE2', 'GENE2', ..., 'GENE1000', ...]
    
    
    # -------------------------------------------------------------------------
    # STEP 2: Initialize analyzer
    # -------------------------------------------------------------------------
    
    analyzer = HeavyTailAnalyzer(
        residuals=residuals,
        gene_ids=gene_ids,
        expression_matrix=expr_matrix,  # genes × samples
        model_name="KG-RNN"
    )
    
    
    # -------------------------------------------------------------------------
    # STEP 3: Identify heavy tails
    # -------------------------------------------------------------------------
    
    tail_data = analyzer.identify_heavy_tails(threshold_sigma=2.5)
    # Shows: number of tail predictions, affected genes, worst cases
    
    
    # -------------------------------------------------------------------------
    # STEP 4: Compare tail vs body properties
    # -------------------------------------------------------------------------
    
    comparison = analyzer.compare_tail_vs_body()
    # Tests: Do tail genes have different mean/variance/CV?
    
    
    # -------------------------------------------------------------------------
    # STEP 5: Stratify performance by gene properties
    # -------------------------------------------------------------------------
    
    # First, create per-gene dictionaries
    y_true_by_gene = {}
    y_pred_by_gene = {}
    
    for i, gene in enumerate(gene_names):
        y_true_by_gene[gene] = y_true[:, i]  # all samples for this gene
        y_pred_by_gene[gene] = y_pred_rnn[:, i]
    
    # Test if variance correlates with R²
    bin_summary = analyzer.stratify_performance(
        y_true_by_gene=y_true_by_gene,
        y_pred_by_gene=y_pred_by_gene,
        property='variance',  # or 'mean_expr' or 'cv'
        n_bins=5
    )
    
    # Try other properties
    analyzer.stratify_performance(
        y_true_by_gene, y_pred_by_gene, property='mean_expr', n_bins=5
    )
    
    
    # -------------------------------------------------------------------------
    # STEP 6: Export results
    # -------------------------------------------------------------------------
    
    analyzer.export_results(output_dir='results', prefix='kgrnn_analysis')
    # Creates: kgrnn_analysis_KG-RNN_samples.csv
    #          kgrnn_analysis_KG-RNN_genes.csv
    #          kgrnn_analysis_KG-RNN_worst50.csv
    
    
    # -------------------------------------------------------------------------
    # COMPARING MULTIPLE MODELS
    # -------------------------------------------------------------------------
    
    # Analyze each model
    analyzers = {}
    
    for model_name, predictions in [('MLR', y_pred_mlr), 
                                    ('XGBRF', y_pred_xgb),
                                    ('KG-RNN', y_pred_rnn)]:
        residuals = y_true.ravel() - predictions.ravel()
        
        analyzer = HeavyTailAnalyzer(
            residuals, gene_ids, expr_matrix, model_name
        )
        
        analyzer.identify_heavy_tails(threshold_sigma=2.5)
        analyzer.compare_tail_vs_body()
        analyzer.stratify_performance(y_true_by_gene, 
                                     {g: predictions[:, i] 
                                      for i, g in enumerate(gene_names)},
                                     property='variance')
        analyzer.export_results(output_dir='results', prefix=model_name.lower())
        
        analyzers[model_name] = analyzer
    """)


In [8]:
# loading in packages and custom analysis class
import numpy as np
import pandas as pd
from tail_analysis_clean import HeavyTailAnalyzer

ModuleNotFoundError: No module named 'tail_analysis_clean'

In [9]:
# data loading and flattening for each model 
y_true = y_test_centered.ravel()
mlr_y_pred = mlr_y_pred.ravel()
xgbrf_y_pred = xgbrf_y_pred.ravel()

# generating prediction residuals for each model
mlr_residuals = y_true - mlr_y_pred
xgbrf_residuals = y_true - xgbrf_y_pred

gene_names = list(y_train_centered.columns)  # ['FO538757.2', 'NOC2L', 'ISG15', ...]
gene_ids = np.tile(gene_names, 3187)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
mlr_analyzer = HeavyTailAnalyzer(
        residuals=mlr_residuals,
        gene_ids=gene_ids,
        expression_matrix=y_train_centered.T, # transposed 
        model_name="MLR"
    )