In [None]:
import numpy as np
import pandas as pd
import os
from google.colab import drive

In [None]:
# Step 1: Make Google Drive visible to this Colab runtime
drive.mount('/content/drive')

# Step 2: Root Drive folder that is searched (recursively) for quantification files
base_folder = '/content/drive/MyDrive/Prostate_Cancer_Sub_typing/Prostate_machine_learning_python/'

# Step 3: miRNA identifiers to pull out of every file (one output column each)
miRNA_list = [
    'hsa-mir-200c',
    'hsa-mir-145',
    'hsa-mir-221',
    'hsa-mir-222',
]

# Step 4: Function to extract CPMs from one .txt file
def extract_miRNA_cpm(file_path, miRNAs):
    """
    Sum the 'reads_per_million_miRNA_mapped' values of selected miRNAs in one file.

    Parameters
    ----------
    file_path : path of a tab-separated text file containing at least the
        columns 'miRNA_ID' and 'reads_per_million_miRNA_mapped'.
    miRNAs : collection of 'miRNA_ID' values to report.

    Returns
    -------
    dict or None
        Maps every requested miRNA to the sum (as a float) of its rows in the
        file; a miRNA with no rows maps to 0.0. If the file cannot be read,
        lacks a required column, or any other error occurs, a
        "Skipping file ..." message is printed and None is returned.
    """
    id_col = 'miRNA_ID'
    value_col = 'reads_per_million_miRNA_mapped'

    try:
        table = pd.read_csv(file_path, sep='\t')

        # Both columns must exist before any filtering or aggregation is attempted
        for required in (id_col, value_col):
            if required not in table.columns:
                raise ValueError("Missing required columns")

        # Keep only rows for the requested miRNAs, then total the rows sharing an ID
        selected = table[table[id_col].isin(miRNAs)]
        totals = selected.groupby(id_col)[value_col].sum()

        cpm_by_miRNA = {}
        for name in miRNAs:
            # Requested miRNAs that never appear in the file default to 0.0
            cpm_by_miRNA[name] = float(totals.get(name, 0.0))
        return cpm_by_miRNA

    except Exception as err:
        # Best-effort policy: report the problem and let the caller skip this file
        print(f"Skipping file {file_path}: {err}")
        return None

# Step 5: Recursively find valid .txt files
# A file qualifies when its name ends with '.txt' and does not contain
# 'annotation' (case-insensitive). The loop variables are deliberately not
# called `files` or `_`: at notebook top level they would overwrite the
# `google.colab.files` helper and IPython's last-output variable.
txt_file_paths = []
for dirpath, _dirnames, filenames in os.walk(base_folder):
    for filename in filenames:
        if filename.endswith('.txt') and 'annotation' not in filename.lower():
            txt_file_paths.append(os.path.join(dirpath, filename))

# os.walk yields directory entries in arbitrary, filesystem-dependent order.
# Sorting makes the row order of the final table reproducible across runs.
txt_file_paths.sort()

# Step 6: Process each file
# extract_miRNA_cpm returns None for files it could not parse; those files
# are left out of the table entirely.
data_rows = []
file_ids = []

for file_path in txt_file_paths:
    result = extract_miRNA_cpm(file_path, miRNA_list)
    if result is not None:
        data_rows.append(result)
        # NOTE: base names are assumed to be unique across sub-folders; if they
        # are not, the index will contain duplicate labels.
        file_ids.append(os.path.basename(file_path))  # Change to file_path for full path as ID

# Step 7: Create final DataFrame (one row per parsed file, one column per miRNA)
df_final = pd.DataFrame(data_rows, index=file_ids)
df_final.index.name = 'File_ID'

# Preview
df_final.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0_level_0,hsa-mir-200c,hsa-mir-145,hsa-mir-221,hsa-mir-222
File_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1524814b-437a-425e-9da5-f904f2208f0b.mirbase21.isoforms.quantification.txt,6121.381354,11775.53849,340.110265,86.284628
6ae0971f-cf75-4a6b-80da-c406262bce5f.mirbase21.isoforms.quantification.txt,24027.638455,20693.631833,442.121456,171.17534
25162bd0-6b90-44a6-aedb-93dbaab69b20.mirbase21.isoforms.quantification.txt,25333.854374,3934.611438,204.139149,55.34715
45d3eddf-1715-4494-b18b-201ac5b20f08.mirbase21.isoforms.quantification.txt,21537.024492,3195.138241,142.591662,35.131278
b6a5f3fa-ce69-4fff-bf3c-1e6f2a6f610f.mirbase21.isoforms.quantification.txt,12286.823178,2914.529931,289.975351,66.352686


In [None]:
# Let's check whether the data frame has all the features and the correct number of rows.
# .shape is (n_rows, n_columns): one row per file that was parsed successfully
# and one column per entry of miRNA_list.
df_final.shape

(498, 4)

In [None]:
# Let's check for null values.
# .info() prints the non-null count and dtype of every column.
# Note: extract_miRNA_cpm fills miRNAs that are absent from a file with 0.0,
# so an absent miRNA shows up as a zero here rather than as a null.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 498 entries, 1524814b-437a-425e-9da5-f904f2208f0b.mirbase21.isoforms.quantification.txt to 5c620b82-59e7-4821-ae20-624423486200.mirbase21.isoforms.quantification.txt
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   hsa-mir-200c  498 non-null    float64
 1   hsa-mir-145   498 non-null    float64
 2   hsa-mir-221   498 non-null    float64
 3   hsa-mir-222   498 non-null    float64
dtypes: float64(4)
memory usage: 19.5+ KB


In [None]:
# Write the unrounded table to a CSV file in the runtime's working directory
raw_csv_name = 'df_raw.csv'
df_final.to_csv(raw_csv_name)

# Imported in this cell so that `files` is the Colab helper module even if the
# name was rebound earlier in the session (e.g. as an os.walk loop variable)
from google.colab import files
files.download(raw_csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Round every miRNA expression column to 3 decimals in a single vectorized call
df_final[miRNA_list] = df_final[miRNA_list].round(decimals=3)
# Show the first rows to confirm the rounding
df_final.head()

Unnamed: 0_level_0,hsa-mir-200c,hsa-mir-145,hsa-mir-221,hsa-mir-222
File_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1524814b-437a-425e-9da5-f904f2208f0b.mirbase21.isoforms.quantification.txt,6121.381,11775.538,340.11,86.285
6ae0971f-cf75-4a6b-80da-c406262bce5f.mirbase21.isoforms.quantification.txt,24027.638,20693.632,442.121,171.175
25162bd0-6b90-44a6-aedb-93dbaab69b20.mirbase21.isoforms.quantification.txt,25333.854,3934.611,204.139,55.347
45d3eddf-1715-4494-b18b-201ac5b20f08.mirbase21.isoforms.quantification.txt,21537.024,3195.138,142.592,35.131
b6a5f3fa-ce69-4fff-bf3c-1e6f2a6f610f.mirbase21.isoforms.quantification.txt,12286.823,2914.53,289.975,66.353


In [None]:
# Write the rounded table to a CSV file in the runtime's working directory
rounded_csv_name = 'df_rounded.csv'
df_final.to_csv(rounded_csv_name)

# Imported in this cell so that `files` is the Colab helper module even if the
# name was rebound earlier in the session (e.g. as an os.walk loop variable)
from google.colab import files

files.download(rounded_csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>