In [None]:
# Release the previously loaded frame when the notebook is re-run in a live kernel.
# A bare `del df` raises NameError on a fresh kernel (nothing has defined `df` yet),
# which breaks Restart & Run All, so the name is removed only if it exists.
globals().pop("df", None)

In [244]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import json
import requests
import time


In [None]:
# Load the preprocessed table from the project-relative data folder.
df = pd.read_csv(filepath_or_buffer="data/preprocessed_data_01.csv")

In [None]:
# The first CSV column is loaded as "Unnamed: 0"; give it a real name and
# promote it to the row index (set_index removes the column in the same step).
df.rename(columns={"Unnamed: 0": "case_id"}, inplace=True)
df.set_index("case_id", inplace=True)
print("CASE_IDs are set as index")

In [None]:
# Display the row index to check that the case IDs are now the row labels.
df.index

In [None]:
# The first 12 characters of a case ID are treated as the submitter ID
# (the same prefix is what the GDC query below sends as "submitter_id").
submitter_id = [sample_id[:12] for sample_id in df.index]
# Distinct submitters; several rows may share one submitter ID.
submitter_ids = set(submitter_id)

### One submitter ID can have multiple samples, but all of its samples belong to the same primary site.
### One TCGA project can have tumor data from multiple primary sites.
### Primary site examples: [Kidney, Lung, Brain, Breast, etc.]

In [None]:
# Shared accumulator passed to generate_sample_tumor_site_data: every fetched record is
# appended here, so results gathered before an interrupted run are not lost.
# NOTE: re-running this cell discards everything collected so far.
backup_data = []

In [214]:
def generate_sample_tumor_site_data(transposed_rna_seq_df,starting_index,stopping_index,backup_data,
                                    request_timeout=30,pause_seconds=2):
    """
    Query the GDC cases endpoint for the primary site of a positional range of rows.

    For every row position between ``starting_index`` and ``stopping_index``
    (both inclusive) the first 12 characters of the row label are sent as the
    ``submitter_id`` filter, and the first returned hit is turned into a record
    ``{"case_id", "uuid", "tumour_site"}``. Spaces in the primary site are
    replaced with underscores.

    Args:
        transposed_rna_seq_df: Object whose ``.index`` yields the case-ID strings
            (a DataFrame indexed by case ID in this notebook).
        starting_index (int): First row position to query (inclusive).
        stopping_index (int): Last row position to query (inclusive).
        backup_data (list): Caller-owned list; each record is appended to it as
            soon as it is fetched, so partial progress survives an exception.
        request_timeout (float): Seconds to wait for the server before the
            request is abandoned. Without a timeout a stalled connection hangs
            the loop forever.
        pause_seconds (float): Delay after every request, to avoid hammering the API.

    Returns:
        list[dict]: The records fetched by this call, in row order. Rows for
        which the API returned no hit are skipped.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts or
            HTTP error statuses. Records fetched before the failure remain in
            ``backup_data``.
    """
    print(f"Fetching data from {starting_index} to {stopping_index}")
    output_list = []
    cases_endpt = 'https://api.gdc.cancer.gov/cases'
    fields = ','.join(["primary_site"])
    for i,case_id in enumerate(transposed_rna_seq_df.index):
        if i < starting_index:
            continue
        if i > stopping_index:
            break
        print("=="*50)
        print(f"Data querying for case_id {i}")
        # NOTE(review): GDC's documented form for "in" takes a list of values; the scalar
        # is kept because it is what this notebook has been sending - confirm.
        filters = {
            "op": "in",
            "content":{
                "field": "submitter_id",
                "value":case_id[:12]
                }
            }

        params = {
            "filters": json.dumps(filters),
            "fields": fields,
            "format": "JSON",
            "size": "100"
            }

        response = requests.get(cases_endpt, params=params, timeout=request_timeout)
        # Fail loudly on HTTP errors (e.g. throttling); otherwise an error payload
        # without "data" would be indistinguishable from "no such case" and the row
        # would be dropped silently.
        response.raise_for_status()
        hits = response.json().get("data", {}).get("hits", [])

        if hits:
            case_data = hits[0]
            # `or ""` also covers a hit whose primary_site is present but null.
            primary_site = (case_data.get("primary_site") or "").replace(" ", "_")
            output_data = {
                "case_id": case_id,
                "uuid": case_data.get("id", ""),
                "tumour_site": primary_site
            }
            print(f"Data found for case_id {i}")
            output_list.append(output_data)
            backup_data.append(output_data)
        else:
            print(f"No data returned for case_id: {case_id}")

        # Pause after every request, hit or miss, so misses do not bypass the rate limit.
        print("Going to sleep")
        time.sleep(pause_seconds)

    return output_list


In [200]:
# Total number of rows in df, i.e. the upper bound for the row positions passed to the fetch function.
len(df.index)

10471

## data processed : 2200

In [None]:
# Fetch the next range of row positions (both bounds are inclusive); every hit is appended to backup_data.
generate_sample_tumor_site_data(df,5099,6000,backup_data)

In [241]:
# Number of records accumulated in backup_data so far.
len(backup_data)

5097

In [None]:
# Write the records from list position `s` onwards to a label CSV keyed by case_id.
# NOTE(review): `e` only appears in the file name - the slice is open-ended - and positions
# in backup_data are not df row positions, because rows without a hit are never appended.
# Confirm that the range in the file name matches what is actually written.
s = 5000
e = 6000
output_dataframe = pd.DataFrame(backup_data[s:]).set_index("case_id")
output_dataframe.to_csv(f"data/label/sample_tumourtype_map_{s}_{e}.csv")

In [229]:
# Project-relative path, matching the "data/label/..." location the label CSVs are written to;
# a hardcoded /Users/... path only works on one machine.
label_df = pd.read_csv("data/label/sample_tumourtype_map_0_199.csv")

In [None]:
import pandas as pd
import os

def combine_csv_files(folder_path):
    """
    Read all CSV files in the specified folder and append them row-wise into a single DataFrame.

    Files are processed in sorted file-name order so the row order of the result is
    reproducible (os.listdir returns entries in arbitrary order). Rows are not
    de-duplicated: files with overlapping content contribute repeated rows.

    Args:
        folder_path (str): Path to the folder containing CSV files

    Returns:
        pandas.DataFrame: Combined DataFrame with a fresh 0..n-1 index. Empty when the
        folder holds no CSV files or none of them could be read.
    """
    # Sorted for a deterministic concatenation order across machines and runs
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

    # Check if any CSV files were found
    if not csv_files:
        print(f"No CSV files found in {folder_path}")
        return pd.DataFrame()

    # Collect the frames first and concatenate once; concatenating inside the loop
    # re-copies all previous rows on every iteration.
    frames = []
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        try:
            frames.append(pd.read_csv(file_path))
            print(f"Successfully read {file}")
        except Exception as e:
            # Best effort: report the unreadable file and keep going with the rest
            print(f"Error reading {file}: {e}")

    # pd.concat raises on an empty list, so fall back to an empty frame
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Report the number of files actually read, not the number found
    print(f"Combined {len(frames)} CSV files into a DataFrame with {combined_df.shape[0]} rows and {combined_df.shape[1]} columns.")
    return combined_df

# Example usage


In [246]:
# Project-relative folder, the same "data/label" location the per-range label CSVs are written to;
# a hardcoded /Users/... path only works on one machine.
combined_data = combine_csv_files("data/label")

Successfully read sample_tumourtype_map_1200_1399.csv
Successfully read sample_tumourtype_map_600_799.csv
Successfully read sample_tumourtype_map_2500_2999.csv
Successfully read sample_tumourtype_map_0_199.csv
Successfully read sample_tumourtype_map_3000_3999.csv
Successfully read sample_tumourtype_map_1000_1199.csv
Successfully read sample_tumourtype_map_800_999.csv
Successfully read sample_tumourtype_map_1800_1999.csv
Successfully read sample_tumourtype_map_2200_2499.csv
Successfully read sample_tumourtype_map_1600_1799.csv
Successfully read sample_tumourtype_map_2121_2199.csv
Successfully read sample_tumourtype_map_4000_4198.csv
Successfully read sample_tumourtype_map_4198_5000.csv
Successfully read sample_tumourtype_map_2000_2199.csv
Successfully read sample_tumourtype_map_1400_1599.csv
Successfully read sample_tumourtype_map_400_599.csv
Successfully read sample_tumourtype_map_200_399.csv
Successfully read sample_tumourtype_map_1894_1999.csv
Successfully read sample_tumourtype_map_

In [None]:
# (rows, columns) of the combined label table.
# The previous `combined_data.to_pk` was a truncated attribute name that raises AttributeError.
combined_data.shape

(5268, 3)

In [250]:

# Save DataFrame to pickle file
def save_dataframe_to_pickle(df, file_path):
    """
    Save a pandas DataFrame to a pickle file

    Missing parent directories of ``file_path`` are created first. Errors are
    reported with a printed message instead of being raised, and the return value
    tells the caller whether the file was written.

    Args:
        df (pandas.DataFrame): DataFrame to save
        file_path (str): Path where the pickle file will be saved

    Returns:
        bool: True if the DataFrame was saved, False if saving failed
    """
    import os  # local import so this cell does not depend on an import made in another cell

    try:
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            # to_pickle does not create directories; do it here so a fresh checkout works
            os.makedirs(parent_dir, exist_ok=True)
        df.to_pickle(file_path)
        print(f"DataFrame successfully saved to {file_path}")
        return True
    except Exception as e:
        print(f"Error saving DataFrame: {e}")
        return False

# Read DataFrame from pickle file
def read_dataframe_from_pickle(file_path):
    """
    Load a DataFrame previously stored as a pickle file.

    Failures are reported with a printed message rather than raised.

    Args:
        file_path (str): Location of the pickle file

    Returns:
        pandas.DataFrame: The unpickled DataFrame, or an empty DataFrame if loading failed
    """
    # NOTE: unpickling can execute arbitrary code - only load files this project produced.
    try:
        loaded = pd.read_pickle(file_path)
    except Exception as e:
        print(f"Error loading DataFrame: {e}")
        return pd.DataFrame()  # Empty frame signals the failure to the caller

    print(f"DataFrame successfully loaded from {file_path}")
    return loaded

In [251]:
# Pair every expression row with its own label by case ID before saving.
# combined_data is a concatenation of the label CSVs, so its row order follows the file
# order and overlapping range files repeat case IDs. Taking the first 5268 rows of df by
# position (df.iloc[:5268]) therefore does not line up with the label rows.
labels_by_case = combined_data.drop_duplicates(subset="case_id").set_index("case_id")
# Keep only the expression rows that have a label, in df's original order
gene_exp_data = df.loc[df.index.isin(labels_by_case.index)]
# Re-order the labels to match, keeping case_id as a regular column as in the combined table
label_data = labels_by_case.reindex(gene_exp_data.index).rename_axis("case_id").reset_index()
assert label_data["case_id"].tolist() == gene_exp_data.index.tolist(), "labels and expression rows must pair one-to-one"
save_dataframe_to_pickle(df=label_data,file_path="data/processed_data/label_data.pkl")
save_dataframe_to_pickle(df=gene_exp_data,file_path="data/processed_data/gene_exp_data.pkl")

DataFrame successfully saved to data/processed_data/label_data.pkl
DataFrame successfully saved to data/processed_data/gene_exp_data.pkl
