In [9]:
import requests
from bs4 import BeautifulSoup
import re
import glob
import pandas as pd
import csv

# Name sent to KnockTF as the `tf_name` / `sample_tf_name` query value.
# NOTE(review): the recorded output of this cell is "Dataset IDs not found." for
# this value, while the table displayed further down lists TP53 rows -- the later
# outputs appear to come from an earlier run with a different gene; confirm.
gene_name = 'DPP10'
# Sent as the `sel_row_num` request parameter (number of top genes requested per dataset).
row_num = 100
# Sent as the `species` request parameter; values noted by the author:
# 'Mus musculus', 'Homo sapiens'.
species = 'Homo sapiens'

def find_dataset_ids_for_gene(gene_name, timeout=30):
    """Return the KnockTF dataset IDs linked from the search page of a gene.

    The KnockTF search-result page for ``gene_name`` is downloaded and every
    ``<a href=...>`` is scanned for identifiers of the form
    ``DataSet_<digits>_<digits>``.

    Parameters
    ----------
    gene_name : str
        Value sent as the ``tf_name`` query parameter. It is passed through
        ``params=`` so that special characters are URL-encoded.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of str
        Unique dataset IDs in sorted order (deterministic between runs).
        An empty list is returned when the request fails, the status code is
        not 200, or no ID is present in the page.
    """
    url = "http://www.licpathway.net/KnockTFv2/search/search_tf_result.php"

    try:
        response = requests.get(url, params={'tf_name': gene_name}, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures are reported the same way as a bad status:
        # callers already treat an empty list as "nothing to process".
        print(f"Failed to retrieve the webpage: {exc}")
        return []

    if response.status_code != 200:
        print("Failed to retrieve the webpage.")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # A set comprehension removes duplicates; sorting makes the order stable.
    dataset_ids = sorted({
        match
        for link in soup.find_all('a', href=True)
        for match in re.findall(r'DataSet_\d+_\d+', link['href'])
    })

    if not dataset_ids:
        print("Dataset IDs not found.")
        return []

    print("Dataset IDs found:", dataset_ids)
    return dataset_ids

# Look up every dataset of the configured gene, download its ranked target
# list, store one CSV per dataset, then combine the CSVs into `df`.
dataset_ids = find_dataset_ids_for_gene(gene_name)

# Endpoint returning the ranked genes of one dataset (loop-invariant).
url = 'http://www.licpathway.net/KnockTFv2/search/sample_result_figure_rank/figure_rank.php'
REQUEST_TIMEOUT = 60  # seconds; prevents a stalled connection from hanging the cell
NETWORK_COLUMNS = ['T(co)F', 'Target', 'log2FC']

for data_id in dataset_ids:
    params = {
        'species': species,
        'sample_tf_name': gene_name,
        'sample_id': data_id,
        'sel_row_num': row_num,  # Number of top genes
        'sort_meth': 'abs'     # Sorting method ('abs', 'up', 'down')
    }

    # Send the GET request
    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)

    # Abort the cell on an HTTP error. An exception is raised instead of
    # calling exit(), which would shut down the notebook kernel and discard
    # all in-memory state.
    if response.status_code != 200:
        raise RuntimeError(f"Error fetching data: {response.status_code}")
    data = response.json()

    # The payload is indexed as data[0]['figure_rank_50_name']; its first entry
    # supplies the TF name and the remaining entries are the target genes.
    ranking = data[0]['figure_rank_50_name'] if data else []
    if not ranking:
        print(f"No ranked genes returned for {data_id}; skipping.")
        continue

    tf_name = ranking[0]['name']

    # Header row followed by one (TF, target, log2FC) row per target gene.
    data_network = [NETWORK_COLUMNS]
    data_network.extend(
        [tf_name, element['name'], element['log2FC']] for element in ranking[1:]
    )

    # Write the data to a CSV file
    with open(f'data_{gene_name}_{data_id}.csv', 'w', newline='') as f_output:
        csv.writer(f_output).writerows(data_network)

# Combine every per-dataset CSV of this gene found in the working directory.
# NOTE(review): the glob also matches files left over from earlier runs.
# Files are read in sorted order so the combined row order is reproducible,
# and concatenated once instead of growing a DataFrame inside the loop.
frames = [pd.read_csv(file) for file in sorted(glob.glob(f'data_{gene_name}_*.csv'))]
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # Keep the expected columns so later cells (e.g. sorting by 'log2FC')
    # do not fail with a KeyError when nothing was downloaded.
    df = pd.DataFrame(columns=NETWORK_COLUMNS)


Dataset IDs not found.


In [3]:
# Order the combined table from highest to lowest log2FC and export it.
# Reassignment is used instead of inplace=True so the cell does not mutate a
# frame object that other cells may still reference; the original index labels
# are kept, as in the displayed table.
df = df.sort_values(by='log2FC', ascending=False)
df.to_csv(f'{gene_name}_data.csv', index=False)

In [4]:
# Display the combined table (pandas abbreviates long frames in the rendered output).
df


Unnamed: 0,T(co)F,Target,log2FC
0,TP53,MAGEA2,19.787970
101,TP53,FOSB,7.587465
13,TP53,C6orf47,7.063986
102,TP53,CD163L1,6.894818
103,TP53,EGR3,6.392317
...,...,...,...
4,TP53,CDKN1A,-8.224417
3,TP53,LIN7A,-8.265202
2,TP53,GABBR2,-9.145347
100,TP53,EDA2R,-9.321176


In [5]:
# Write the first 50 rows of df to a new CSV file. These are the 50 highest
# log2FC genes only because an earlier cell sorted df by 'log2FC' in descending
# order; this cell relies on that cell having been run first.
df.head(50).to_csv(f'KnockTF_{gene_name}_top50.csv', index=False)