In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import mygene

# Initialize mygene object for querying
mg = mygene.MyGeneInfo()

# List of DUX4 target genes
dux4_target_genes = ["PRAMEF1", "PRAMEF12", "PRAMEF2", "RFPL1", "RFPL2", "RFPL4B", "TRIM43", "ZSCAN4"]

# Query mygene.info to convert gene names to Ensembl IDs
gene_info = mg.querymany(dux4_target_genes, scopes='symbol', fields='ensembl.gene', species='human')

# Display the result of the conversion
for info in gene_info:
    print(f"Gene: {info['query']} -> Ensembl ID: {info.get('ensembl', {}).get('gene', 'Not found')}")


def variance_thresholding(gene_matrix, threshold=0.5):
    """Select genes with variance above the given threshold."""
    variances = gene_matrix.var(axis=0)
    selected_genes = variances[variances > threshold].index
    return gene_matrix[selected_genes]

def highly_variable_genes(gene_matrix, n_genes=2000):
    """Select top N highly variable genes."""
    mean_expression = gene_matrix.mean(axis=0)
    variances = gene_matrix.var(axis=0)
    normalized_variances = variances / mean_expression
    top_genes = normalized_variances.nlargest(n_genes).index
    return gene_matrix[top_genes]

def select_biologically_relevant_genes(gene_matrix, relevant_genes):
    """Select biologically relevant genes."""
    return gene_matrix[relevant_genes]

# Load gene matrix
gene_matrix = pd.read_csv('path_to_gene_matrix.csv', index_col=0)

# Standardize the data (optional but recommended)
scaler = StandardScaler()
gene_matrix_scaled = pd.DataFrame(scaler.fit_transform(gene_matrix), index=gene_matrix.index, columns=gene_matrix.columns)

# Apply feature selection method(s)
selected_genes = variance_thresholding(gene_matrix_scaled, threshold=1.0)
# Or use highly_variable_genes method
# selected_genes = highly_variable_genes(gene_matrix_scaled, n_genes=2000)
# Or use biologically relevant genes method
# relevant_genes = ['Gene1', 'Gene2', 'Gene3', ...]  # Add your relevant genes
# selected_genes = select_biologically_relevant_genes(gene_matrix_scaled, relevant_genes)

# Save selected genes to a new file
selected_genes.to_csv('selected_genes.csv')


Gene: PRAMEF1 -> Ensembl ID: ENSG00000116721
Gene: PRAMEF12 -> Ensembl ID: ENSG00000116726


AttributeError: 'list' object has no attribute 'get'