In [None]:
import os
import random
import shlex
import subprocess
import time
from pathlib import Path

import pandas as pd
import requests

def fetch_random_hogs(taxonomic_level, num_groups=10, pool_size=None, timeout=30):
	"""
	Fetch randomly chosen Hierarchical Orthologous Groups (HOGs) from the OMA API

	A single request lists up to `pool_size` candidate HOGs at the given level,
	`num_groups` of them are sampled at random, and the `members_url` of every
	sampled HOG is then downloaded (one request per HOG).

	Parameters:
	-----------
	taxonomic_level : str
		The taxonomic level to fetch HOGs for (e.g., 'Mammalia', 'Primates')
	num_groups : int
		Number of random HOGs to fetch
	pool_size : int, optional
		Number of candidate HOGs to list before sampling. Defaults to
		10 * num_groups; values below num_groups are raised to num_groups.
		When the pool is not larger than num_groups there is nothing to sample
		from and the listed HOGs are returned in listing order.
	timeout : float
		Seconds to wait for each HTTP request before giving up

	Returns:
	--------
	list with the decoded JSON payload of every members request that succeeded.
	Empty when num_groups <= 0 or when the listing request fails.
	"""
	# Base URL for the OMA REST API
	base_url = "https://omabrowser.org/api"

	if num_groups <= 0:
		return []

	# Listing exactly num_groups entries would leave nothing to sample from,
	# so ask for a larger candidate pool by default.
	# NOTE(review): the API may cap per_page; in that case fewer candidates
	# come back and the sampling below still works.
	if pool_size is None:
		pool_size = 10 * num_groups
	per_page = max(num_groups, pool_size)

	print(f"Fetching HOGs at taxonomic level: {taxonomic_level}")
	try:
		response = requests.get(
			f"{base_url}/hog/",
			params={"level": taxonomic_level, "per_page": per_page},
			timeout=timeout,
		)
	except requests.RequestException as exc:
		print(f"Error fetching HOGs: {exc}")
		return []

	if not response.ok:
		print(f"Error fetching HOGs: {response.status_code}")
		return []

	all_hogs = response.json()
	print(f"Found {len(all_hogs)} HOGs at level {taxonomic_level}")

	# Randomly select the required number of HOGs
	if len(all_hogs) <= num_groups:
		selected_hogs = all_hogs
	else:
		selected_hogs = random.sample(all_hogs, num_groups)

	# Fetch detailed information for each selected HOG
	hog_details = []
	total = len(selected_hogs)
	for position, hog in enumerate(selected_hogs, start=1):
		hog_id = hog['hog_id']
		print(f"Fetching details for HOG {hog_id} ({position}/{total})")
		try:
			detail_response = requests.get(hog['members_url'], timeout=timeout)
		except requests.RequestException as exc:
			# One unreachable HOG should not discard the ones already fetched
			print(f"Error fetching details for HOG {hog_id}: {exc}")
		else:
			if detail_response.ok:
				hog_details.append(detail_response.json())
			else:
				print(f"Error fetching details for HOG {hog_id}: {detail_response.status_code}")

		# Be nice to the API by adding a small delay between requests
		# (no need to wait after the last one)
		if position < total:
			time.sleep(0.5)

	return hog_details

# Example usage
taxonomic_level = "Metazoa"  # You can change this to any valid taxonomic level
num_groups = 5  # Number of random orthologous groups to fetch

# Fetch the data
hog_data = fetch_random_hogs(taxonomic_level, num_groups)

# Convert to a DataFrame for easier analysis
if hog_data:
	# Extract some key information from each HOG
	hog_summary = []
	for hog in hog_data:
		members = hog.get('members', [])
		species_codes = {member.get('species', {}).get('code', '') for member in members}
		hog_summary.append({
			# The HOG listing names this field 'hog_id', and looking up 'id'
			# left the column blank in earlier runs; presumably the members
			# payload uses 'hog_id' too. 'id' is kept as a fallback.
			'HOG ID': hog.get('hog_id', hog.get('id', '')),
			'Level': hog.get('level', ''),
			'Number of genes': len(members),
			'Number of species': len(species_codes),
		})

	hog_df = pd.DataFrame(hog_summary)
	print("\nSummary of fetched HOGs:")
	display(hog_df)
else:
	print("No HOG data retrieved")

Fetching HOGs at taxonomic level: Metazoa
Total HOGs found: 5
Found 5 HOGs at level Metazoa
[{'hog_id': 'HOG:E0789001', 'level': 'Metazoa', 'levels_url': 'https://omabrowser.org/api/hog/HOG:E0789001/?level=Metazoa', 'members_url': 'https://omabrowser.org/api/hog/HOG:E0789001/members/?level=Metazoa', 'roothog_id': 789001, 'completeness_score': 0.006000000052154064, 'description': 'Reverse transcriptase domain-containing protein', 'nr_genes': 4.0, 'similar_profile_hogs': 'https://omabrowser.org/api/hog/789001/similar_profile_hogs/'}, {'hog_id': 'HOG:E0789002', 'level': 'Metazoa', 'levels_url': 'https://omabrowser.org/api/hog/HOG:E0789002/?level=Metazoa', 'members_url': 'https://omabrowser.org/api/hog/HOG:E0789002/members/?level=Metazoa', 'roothog_id': 789002, 'completeness_score': 0.006000000052154064, 'description': '-', 'nr_genes': 2.0, 'similar_profile_hogs': 'https://omabrowser.org/api/hog/789002/similar_profile_hogs/'}, {'hog_id': 'HOG:E0789003', 'level': 'Metazoa', 'levels_url': 'h

Unnamed: 0,HOG ID,Level,Number of genes,Number of species
0,,Metazoa,4,2
1,,Metazoa,2,2
2,,Metazoa,23,19
3,,Metazoa,2,2
4,,Metazoa,20,5


In [None]:
def run_site_likelihood_analysis(hog_df):
    """
    Placeholder for the per-site likelihood analysis (not implemented yet).

    Parameters:
    -----------
    hog_df :
        Currently unused; accepted so callers can already pass their data.

    Returns:
    --------
    None. Only prints a progress message for now.
    """
    print("Running site likelihood analysis...")
    # Intended raxml command:
    #   --force --evaluate --msa your_alignment.phy --model GTR+G --tree fixed_tree.newick --site-lh
    # NOTE(review): confirm the exact flag spelling against the RAxML-NG docs before implementing.


# Backward-compatible alias: the function was originally published under this misspelled name.
run_site_lielihood_analysis = run_site_likelihood_analysis

def run_ft2treebuilder(pdb_file, model_path, mafftmat, submat, output_dir, raxml_path):
	"""Run ft2treebuilder for a single PDB family

	Creates `<output_dir>/<stem of pdb_file>` and invokes
	`python ../ft2treebuilder.py` with that directory as `--outdir`.

	Parameters:
	-----------
	pdb_file : str or Path
		Value passed to `--structures`; its stem names the family
	model_path, mafftmat, submat : str or Path
		Values passed to `--model`, `--mafftmat` and `--submat`
	output_dir : str or Path
		Parent directory in which the family directory is created
	raxml_path : str or Path
		Value passed to `--raxmlpath`

	Returns:
	--------
	(family_outdir, family_name) tuple

	Raises:
	-------
	subprocess.CalledProcessError if the script exits with a non-zero status
	"""
	family_name = Path(pdb_file).stem
	family_outdir = os.path.join(output_dir, family_name)
	os.makedirs(family_outdir, exist_ok=True)

	# Argument list without a shell: every value reaches the script as exactly
	# one argument, even when a path contains spaces, quotes or glob characters.
	cmd = [
		"python", "../ft2treebuilder.py",
		"--model", str(model_path),
		"--mafftmat", str(mafftmat),
		"--submat", str(submat),
		"--structures", str(pdb_file),
		"--outdir", family_outdir,
		"--raxmlpath", str(raxml_path),
	]
	rendered = " ".join(shlex.quote(arg) for arg in cmd)
	print(f"Running: {rendered}")
	subprocess.run(cmd, check=True)

	# Return the family output directory and name for ML tree generation
	return family_outdir, family_name

def run_mafft_alignment(family_outdir, family_name, raxml_path):
	"""Run MAFFT on a family's encoded.fasta and write aligned.fasta next to it

	NOTE(review): the previous docstring said the input holds amino acid
	sequences; the file actually read is `encoded.fasta` - confirm its content.

	Parameters:
	-----------
	family_outdir : str or Path
		Directory containing `encoded.fasta`; `aligned.fasta` is written here
	family_name : str
		Unused; kept so the signature matches the other per-family helpers
	raxml_path : str
		Unused; kept so the signature matches the other per-family helpers

	Returns:
	--------
	Path of the alignment file (`<family_outdir>/aligned.fasta`)

	Raises:
	-------
	subprocess.CalledProcessError if mafft exits with a non-zero status
	"""
	input_fasta = os.path.join(family_outdir, 'encoded.fasta')
	aligned_fasta = os.path.join(family_outdir, 'aligned.fasta')

	cmd = ["mafft", "--auto", "--thread", "2", input_fasta]
	rendered = " ".join(shlex.quote(arg) for arg in cmd)
	print(f"Running MAFFT alignment: {rendered} > {shlex.quote(aligned_fasta)}")

	# The alignment arrives on stdout; send it straight to the output file
	# instead of relying on a shell redirect.
	with open(aligned_fasta, 'w') as handle:
		subprocess.run(cmd, stdout=handle, check=True)
	return aligned_fasta
    
def run_ml_tree(family_outdir, family_name, raxml_path, model="GTR+G", threads=2, seed=12345):
	"""Run a standard maximum likelihood tree using RAxML-NG

	Uses `<family_outdir>/encoded.fasta` as the alignment; when that file is
	missing, falls back to the alphabetically first `*.fasta` file in the
	directory. Output files are written with the prefix `<family_outdir>/mltree`.

	NOTE(review): the default model GTR+G is a nucleotide model - confirm it
	matches the alphabet stored in encoded.fasta.

	Parameters:
	-----------
	family_outdir : str or Path
		Directory holding the alignment and receiving the RAxML-NG output
	family_name : str
		Family label, used only in the "not found" message
	raxml_path : str or Path
		RAxML-NG executable to run
	model : str
		Value passed to `--model`
	threads : int
		Value passed to `--threads`
	seed : int
		Value passed to `--seed`

	Returns:
	--------
	True when RAxML-NG was run, False when no fasta file was found

	Raises:
	-------
	subprocess.CalledProcessError if RAxML-NG exits with a non-zero status
	"""
	# Find the encoded fasta output
	encoded_fasta = os.path.join(family_outdir, 'encoded.fasta')
	if not os.path.exists(encoded_fasta):
		# Try to find it if not in expected location; sorted so the same file
		# is picked on every run regardless of directory listing order
		candidates = sorted(p for p in Path(family_outdir).glob('*.fasta') if p.is_file())
		if not candidates:
			print(f"Encoded fasta not found for {family_name}, skipping ML tree.")
			return False
		encoded_fasta = str(candidates[0])

	ml_tree_prefix = os.path.join(family_outdir, 'mltree')
	# Argument list without a shell, so paths with spaces stay intact
	ml_cmd = [
		str(raxml_path),
		"--msa", encoded_fasta,
		"--model", str(model),
		"--prefix", ml_tree_prefix,
		"--threads", str(threads),
		"--seed", str(seed),
		"--redo",
	]
	rendered = " ".join(shlex.quote(arg) for arg in ml_cmd)
	print(f"Running ML tree: {rendered}")
	subprocess.run(ml_cmd, check=True)
	return True



In [None]:
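# TODO: import the marker genes

# TODO: map the marker genes to UniProt


# TODO: create a supermatrix from each alignment method:
#   - MAFFT with ft2
#   - standard MAFFT
#   - FoldMason


# TODO: concatenate and run RAxML-NG for each supermatrix

# TODO: visualize the trees

# TODO: calculate likelihood scores for each alignment column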


