In [None]:
!pip install pybiomart

import pandas as pd
import pybiomart

# Load the combined disease table. Only the 'Filename' and 'Disease' columns
# are read below.
combined_df = pd.read_csv('path to combined_diseases.tsv', sep='\t')

# Tally how often each disease appears for every filename, skipping rows whose
# 'Disease' is the '-' placeholder. The mask and the list of unique filenames
# are each computed once instead of on every loop iteration.
has_disease = combined_df['Disease'] != '-'
filename_disease_counts = {
    filename: combined_df.loc[
        has_disease & (combined_df['Filename'] == filename), 'Disease'
    ].value_counts()
    for filename in combined_df['Filename'].unique()
}

# Same tallies as a list, in order of first appearance of each filename.
disease_counts = list(filename_disease_counts.values())

# Attach the per-filename tally (a value_counts Series) to every row of that
# filename.
combined_df['DiseaseCounts'] = combined_df['Filename'].map(filename_disease_counts)

# Keep one row per filename and rename the column to 'Gene name', the key the
# later merge with the gene table joins on.
dd_df = (
    combined_df
    .drop_duplicates(subset=['Filename'])
    .rename(columns={'Filename': 'Gene name'})
)



# Initialize the biomart server and dataset
server = pybiomart.Server(host='http://www.ensembl.org')
dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']

# Fetch name and genomic span for the genes in the dataset. The code below
# addresses the result by display-name headers ('Gene name',
# 'Chromosome/scaffold name', 'Gene start (bp)', 'Gene end (bp)').
genetable = dataset.query(
    attributes=['external_gene_name', 'chromosome_name', 'start_position', 'end_position']
)

# Inner join: keep only genes present in both the disease table and the gene table.
merged_df = pd.merge(dd_df, genetable, on='Gene name', how='inner')
merged_df = merged_df.drop(columns=['Disease'])

# Chromosomes to keep: autosomes 1-22 plus X and Y.
valid_chromosomes = [str(i) for i in range(1, 23)] + ['X', 'Y']

# Compare as strings so that chromosome names parsed as integers are not
# silently dropped by the membership test. `.copy()` makes the filtered frame
# independent, so the column assignment below cannot raise
# SettingWithCopyWarning.
chromosome_names = merged_df['Chromosome/scaffold name'].astype(str)
merged_df = merged_df[chromosome_names.isin(valid_chromosomes)].copy()

# Add 'chr' to the beginning of the chromosome names
merged_df['Chromosome/scaffold name'] = 'chr' + merged_df['Chromosome/scaffold name'].astype(str)

# Assemble the BED columns. BED intervals are 0-based and half-open, whereas
# Ensembl gene coordinates are 1-based and inclusive, so the start is shifted
# down by one while the end is kept as is.
bed_df = pd.DataFrame({
    'chr': merged_df['Chromosome/scaffold name'],
    'start': merged_df['Gene start (bp)'].astype(int) - 1,
    'end': merged_df['Gene end (bp)'].astype(int),
    'name': merged_df['Gene name'],
})


# Function to collapse dictionary values into a single string
def collapse_dict(d):
    """Join the key/value pairs of *d* into one ``key:value;key:value`` string.

    *d* can be any object whose ``items()`` yields ``(key, value)`` pairs.
    Pairs are emitted in iteration order; an empty *d* gives ``''``.
    """
    return ';'.join(f"{key}:{value}" for key, value in d.items())

# Function to calculate the sum of the counts from the dictionaries
def sum_counts(d):
    """Return the total of the counts stored in *d*.

    *d* may be an object exposing ``tolist()`` (for example a pandas Series),
    in which case its elements are summed, or a plain mapping, in which case
    its values are summed. An empty input gives ``0``.
    """
    # Prefer tolist() so array-like inputs are summed as native Python numbers.
    counts = d.tolist() if hasattr(d, 'tolist') else list(d.values())
    return sum(counts)
# Label every record as '<gene name>-<disease>:<count>;...'.
disease_summary = merged_df['DiseaseCounts'].apply(collapse_dict)
bed_df['name'] = merged_df['Gene name'] + '-' + disease_summary

# Score each record with the sum of its disease counts.
bed_df['score'] = merged_df['DiseaseCounts'].apply(sum_counts)

# Write the table as tab-separated text without header or index.
bed_df.to_csv('output.bed', sep='\t', index=False, header=False)
