# Association between host gene expression and behavior
Assess to what extent gene expression in the adrenal glands (Adr) and visceral adipose tissue (VAT) predict host behavior.

## Set up

### Import packages

In [8]:
import argparse
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_predict, permutation_test_score
import numpy as np
import matplotlib
import seaborn as sns
import os

matplotlib.use('Agg')

### Set up argument parsing

In [9]:
def parse_args():
    parser = argparse.ArgumentParser(description="Run Random Forest to predict behavior from transcriptome data.")
    parser.add_argument("--output_dir", required=True, help="Directory to save output files")
    parser.add_argument("--phenotype_vars", type=str, nargs="+", help="Behavioral variables")
    parser.add_argument("--phenotype_file", type=str, required=True, help="Path to Behavioral data file")
    parser.add_argument("--tissue", type=str, required=True, choices=["Adr", "VAT"], help="Tissue type for transcriptome data (Adr or VAT)")
    parser.add_argument("--transcriptome_file", type=str, required=True, help="Path to transcriptome data file (Adr or VAT)")
    return parser.parse_args()

# Parse arguments
args = parse_args()

usage: ipykernel_launcher.py [-h] --output_dir OUTPUT_DIR
                             [--phenotype_vars PHENOTYPE_VARS [PHENOTYPE_VARS ...]]
                             --phenotype_file PHENOTYPE_FILE --tissue
                             {Adr,VAT} --transcriptome_file TRANSCRIPTOME_FILE
ipykernel_launcher.py: error: the following arguments are required: --output_dir, --phenotype_file, --tissue, --transcriptome_file


SystemExit: 2

In [30]:
# Simulate arguments for troubleshooting

class Args:
    output_dir = "output/Adr"
    phenotype_vars = ["Center_occupancy", "Grooming_duration", "Social_preference"]
    phenotype_file = "Behavior_data.txt"
    tissue = "Adr"
    transcriptome_file = "Adr_normalized_counts.txt"

args = Args()

### Set up output directory

In [31]:
output_dir = os.path.join(args.output_dir, args.tissue)
os.makedirs(output_dir, exist_ok=True)

### Load data

In [32]:
# Load phenotype metadata
phenotype = pd.read_csv(args.phenotype_file, sep="\t")

# Load selected tissue's transcriptome data
transcriptome = pd.read_csv(args.transcriptome_file, sep="\t", index_col=0).T.reset_index().rename(columns={"index": "Mouse_ID"})

In [33]:
# Data overview

# Display first 5 rows
display(phenotype.head())
display(transcriptome.head())

# Check data types and missing values
print(phenotype.info())
print(transcriptome.info())

Unnamed: 0,Mouse_ID,Sample_ID,Sex,DOB,Dame,Sire,Parents,Litter,Weaned,Experiement_Start,...,TMT,Treatment,Pair_mouse,Cage_ID,Treatment_Timepoint,Stress,Center_occupancy,Grooming_duration,Social_preference,Weight
0,2028,2028_T5,Female,01/01/2022,,,NA_NA,NA_NA_44562,31/01/2022,15/02/2022,...,H2O,Pair_H2O,2029.0,2028_2029,Pair_H2O_T5,Control,344.533333,1,0.093117,15.04
1,2029,2029_T5,Female,01/01/2022,,,NA_NA,NA_NA_44562,31/01/2022,15/02/2022,...,H2O,Pair_H2O,2028.0,2028_2029,Pair_H2O_T5,Control,121.866667,65,0.446575,16.09
2,594,594_T5,Male,01/01/2022,550.0,560.0,550_560,550_560_44562,24/01/2022,15/02/2022,...,H2O,Pair_H2O,595.0,594_595,Pair_H2O_T5,Control,195.866667,14,0.895172,20.35
3,595,595_T5,Male,01/01/2022,550.0,560.0,550_560,550_560_44562,24/01/2022,15/02/2022,...,H2O,Pair_H2O,594.0,594_595,Pair_H2O_T5,Control,201.933333,88,0.474453,20.31
4,596,596_T5,Female,01/01/2022,550.0,560.0,550_560,550_560_44562,24/01/2022,15/02/2022,...,TMT,Pair_TMT,597.0,596_597,Pair_TMT_T5,Stressor,216.566667,17,0.87976,16.18


Gene,Mouse_ID,Xkr4,Gm53491,Rp1,Sox17,Gm22307,Gm46082,Gm29874,Mrpl15,Lypla1,...,TrnC,COX1,ATP6,ND3,ND4,TrnL2,ND5,ND6,CYTB,TrnP
0,594,2.052061,2.052061,2.052061,16.416488,0.0,0.0,0.0,252.403503,69.770074,...,8.208244,6917.497632,0.0,2.052061,413379.480243,0.0,9870.413411,10525.02087,33748.195209,449.401359
1,595,3.275186,3.930223,0.655037,19.651114,0.0,0.655037,0.0,194.546026,60.263415,...,0.0,6446.875378,0.0,0.0,586866.755805,0.0,12818.421487,24771.538933,98703.61405,126.422165
2,596,5.852659,5.164111,0.0,14.803784,0.0,0.0,0.0,152.857678,61.969329,...,0.344274,8238.822253,0.688548,11.361044,145138.020516,0.0,30920.285112,34343.401976,138186.438931,104.315037
3,597,8.006223,3.002334,0.0,6.004667,0.0,0.0,0.0,274.213135,86.066897,...,8.006223,8677.744881,0.0,0.0,218419.76939,1.000778,12487.70622,11304.786781,24252.85082,955.742863
4,598,7.682467,2.743738,0.0,3.292486,0.0,0.0,0.0,151.454341,38.96108,...,1.646243,7661.065409,0.0,0.0,635012.38265,0.0,14747.04332,18490.050781,42315.025843,694.165729


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Mouse_ID             47 non-null     int64  
 1   Sample_ID            47 non-null     object 
 2   Sex                  47 non-null     object 
 3   DOB                  47 non-null     object 
 4   Dame                 45 non-null     object 
 5   Sire                 45 non-null     float64
 6   Parents              47 non-null     object 
 7   Litter               47 non-null     object 
 8   Weaned               47 non-null     object 
 9   Experiement_Start    47 non-null     object 
 10  Starting_Age         47 non-null     int64  
 11  Days_P_Wean          47 non-null     int64  
 12  Cohort               47 non-null     object 
 13  Timepoint            47 non-null     object 
 14  Sampling_date        47 non-null     object 
 15  Sampling_Age         47 non-null     int64

In [34]:
# Convert transcriptome values to numeric (force non-numeric values to NaN and handle appropriately)
transcriptome.iloc[:, 1:] = transcriptome.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

# Select relevant columns from phenotype metadata
phenotype = phenotype[["Mouse_ID", "Treatment"] + args.phenotype_vars]  # Dynamic selection of phenotype variables

In [35]:
# Convert Mouse_ID to string in both dataframes before merging
phenotype["Mouse_ID"] = phenotype["Mouse_ID"].astype(str)
transcriptome["Mouse_ID"] = transcriptome["Mouse_ID"].astype(str)

# Merge phenotype and transcriptome data on Mouse_ID
merged = pd.merge(phenotype, transcriptome, on="Mouse_ID")

display(merged.head())

Unnamed: 0,Mouse_ID,Treatment,Center_occupancy,Grooming_duration,Social_preference,Xkr4,Gm53491,Rp1,Sox17,Gm22307,...,TrnC,COX1,ATP6,ND3,ND4,TrnL2,ND5,ND6,CYTB,TrnP
0,2028,Pair_H2O,344.533333,1,0.093117,3.862889,3.090312,0.772578,12.361246,0.0,...,0.772578,8639.738516,0.0,2.317734,362905.328813,0.0,16425.005888,25810.282064,66233.102292,505.265938
1,594,Pair_H2O,195.866667,14,0.895172,2.052061,2.052061,2.052061,16.416488,0.0,...,8.208244,6917.497632,0.0,2.052061,413379.480243,0.0,9870.413411,10525.02087,33748.195209,449.401359
2,595,Pair_H2O,201.933333,88,0.474453,3.275186,3.930223,0.655037,19.651114,0.0,...,0.0,6446.875378,0.0,0.0,586866.755805,0.0,12818.421487,24771.538933,98703.61405,126.422165
3,596,Pair_TMT,216.566667,17,0.87976,5.852659,5.164111,0.0,14.803784,0.0,...,0.344274,8238.822253,0.688548,11.361044,145138.020516,0.0,30920.285112,34343.401976,138186.438931,104.315037
4,597,Pair_TMT,118.633333,107,0.380531,8.006223,3.002334,0.0,6.004667,0.0,...,8.006223,8677.744881,0.0,0.0,218419.76939,1.000778,12487.70622,11304.786781,24252.85082,955.742863


## Random Forest

In [36]:
# Define the phenotype variables to analyze
phenotype_vars = args.phenotype_vars

# Initialize results dataframe
results_df = pd.DataFrame(columns=["Phenotype", "Permutation Test R² Score", "P-value"])
importance_df = pd.DataFrame(columns=["Phenotype", "Gene", "Importance"])

# Define cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [37]:
# Loop through each phenotype variable
for phenotype_var in phenotype_vars:
    print(f"Processing {phenotype_var} ({args.tissue})...")

    # Define the target variable
    y = merged[phenotype_var]

    # One-hot encode Treatment as a covariate
    covariates = pd.get_dummies(merged["Treatment"], drop_first=True)

    covariates = covariates.to_numpy()  # Convert to numpy array
    y = y.to_numpy()                    # Ensure y is also a numpy array


    # Residualize the phenotype variable against Treatment
    cov_model = LinearRegression()
    cov_model.fit(covariates, y)
    y_pred_cov = cov_model.predict(covariates)
    y_res = y - y_pred_cov  # Residualized target variable

    # Define transcriptomic features (excluding phenotype metadata)
    phenotype_cols = {"Mouse_ID", "Treatment"} | set(phenotype_vars)
    transcriptome_features = [col for col in merged.columns if col not in phenotype_cols]
    X_transcriptome = merged[transcriptome_features]

    # Define the Random Forest model
    rf_model = RandomForestRegressor(n_estimators=20, random_state=42, n_jobs=-1)

    # Get cross-validated predictions instead of using training predictions
    y_pred_cv = cross_val_predict(rf_model, X_transcriptome, y_res, cv=cv)

    # Perform permutation test using cross-validation
    score, permutation_scores, pvalue = permutation_test_score(
        rf_model, X_transcriptome, y_res, cv=cv, n_permutations=1000, scoring="r2", random_state=42
    )

    # Store results
    results_df = pd.concat(
        [results_df, pd.DataFrame({"Phenotype": [phenotype_var], "Permutation Test R² Score": [score], "P-value": [pvalue]})], 
        ignore_index=True
    )

    ## Feature Importance
    rf_model.fit(X_transcriptome, y_res)

    # Store results
    importance_df = pd.concat(
        [importance_df, pd.DataFrame({
        "Gene": X_transcriptome.columns,
        "Importance": rf_model.feature_importances_
    }).sort_values(by="Importance", ascending=False)]
    )

    ### Predicted vs. Actual Plot (Cross-Validation)
    slope, intercept, r_value, p_value, std_err = linregress(y_pred_cv, y_res)

    plt.figure(figsize=(4, 4))
    sns.regplot(x=y_pred_cv, y=y_res, scatter_kws={"alpha": 0.6, "color": "blue"}, line_kws={"color": "red"})
    plt.text(min(y_pred_cv) + (max(y_pred_cv) - min(y_pred_cv)) * 0.05, max(y_res) - (max(y_res) - min(y_res)) * 0.1,
             s=f"p = {p_value:.3g}", fontsize=8, color="black")
    plt.xlabel(f"Predicted Residualized {phenotype_var} ({args.tissue})", fontsize=8)
    plt.ylabel(f"Actual Residualized {phenotype_var} ({args.tissue})", fontsize=8)
    plt.title(f"Predicted vs. Actual - {phenotype_var} ({args.tissue})", fontsize=8)
    plt.savefig(os.path.join(output_dir, f"Predicted_vs_actual_{phenotype_var}_{args.tissue}.pdf"), format="pdf")
    plt.show()

    ### Feature Importance Plot
    plt.figure(figsize=(6, 4))
    sns.barplot(data=importance_df.head(10), x="Importance", y="Gene", palette="viridis")
    plt.xlabel("Feature Importance")
    plt.ylabel("Genes")
    plt.title(f"Top 10 Important Genes for Prediction ({args.tissue})")
    plt.savefig(os.path.join(output_dir, f"Feature_importance_{phenotype_var}_{args.tissue}.pdf"), format="pdf")
    plt.show()

Processing Center_occupancy (Adr)...


  results_df = pd.concat(
  importance_df = pd.concat(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=importance_df.head(10), x="Importance", y="Gene", palette="viridis")


Processing Grooming_duration (Adr)...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=importance_df.head(10), x="Importance", y="Gene", palette="viridis")


Processing Social_preference (Adr)...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=importance_df.head(10), x="Importance", y="Gene", palette="viridis")


In [38]:
# Save final results to CSV
results_df.to_csv(os.path.join(output_dir, f"RF_permutation_test_results_{args.tissue}.csv"), index=False)
importance_df.to_csv(os.path.join(output_dir, f"RF_important_features_{args.tissue}.csv"), index=False)