In [61]:
import os
import shutil
import glob
import pandas as pd
import numpy as np

In [None]:
# Base directory containing one sub-directory per batch of files
base_directory = '../Dataset/Methylomics/FIN13296/iDAT'

# Sort every batch's files into '<part1>_<part2>' subfolders, where the parts
# are the first two underscore-separated fields of the filename.
for batch in os.listdir(base_directory):
    tmp_directory = os.path.join(base_directory, batch)

    # Only directories are batches. This skips '.DS_Store' as well as any
    # loose files (e.g. CSVs) that sit next to the batch folders.
    if not os.path.isdir(tmp_directory):
        continue

    # Every entry directly inside the batch directory
    file_paths = glob.glob(f'{tmp_directory}/*')

    for file_path in file_paths:
        # Only regular files are regrouped; existing subfolders are left
        # alone so that re-running the cell never moves a directory.
        if not os.path.isfile(file_path):
            continue

        filename = os.path.basename(file_path)

        # At least three underscore-separated parts are needed to derive
        # the two-part subfolder name.
        if filename.count('_') > 1:
            # Subfolder name = first two parts of the filename
            subfolder_name = '_'.join(filename.split('_', 2)[:2])
            subfolder_path = os.path.join(tmp_directory, subfolder_name)

            # Guard against the batch directory itself already being the
            # target subfolder.
            if not os.path.dirname(file_path).endswith(subfolder_name):
                # Create the subfolder if it doesn't exist
                os.makedirs(subfolder_path, exist_ok=True)

                # Move the file into the subfolder
                shutil.move(file_path, os.path.join(subfolder_path, filename))
            else:
                print(f"File '{filename}' is already in the subgrouped folder '{subfolder_name}' and will be skipped.")
        else:
            print(f"Filename '{filename}' does not contain more than one underscore and will be skipped.")

# Report once, after all batches have been processed
print("Files have been organized into subfolders.")

In [65]:
# Location of the sample sheet CSV
sample_sheet_path = '../Dataset/Methylomics/FIN13296/FIN13296_Infinium_EPIC_SampleSheet.csv'

# The file is parsed as-is and the first 6 parsed rows are discarded; the
# next row (index label 6) carries the real column names (Sentrix_ID, ...).
raw_sheet = pd.read_csv(sample_sheet_path).iloc[6:, :]

# Data rows are everything after the header row. .copy() gives an
# independent frame, so adding a column below is not a write onto a slice.
df = raw_sheet.iloc[1:].copy()

# Use the header row's values as column names. Passing a plain list keeps the
# row's index label (6) from becoming the name of the column axis.
df.columns = raw_sheet.iloc[0].tolist()

# Both ID columns are strings here, so '+' concatenates them
df['Sample_Group'] = df['Sentrix_ID'] + '_' + df['Sentrix_Position']

# Reset the index so rows are numbered from 0
df = df.reset_index(drop=True)

df.head(10)

6,Sentrix_ID,Sentrix_Position,Sample_ID,Sample_Well,Sample_Plate,Sample_Name,Pool_ID,Sample_Group
0,207881760108,R01C01,FIN13295A1,A1,207881760108_R01C01_FIN13295A1_A1,I_D_001_DNA,,207881760108_R01C01
1,207881760108,R02C01,FIN13295A2,B1,207881760108_R02C01_FIN13295A2_B1,I_D_002_DNA,,207881760108_R02C01
2,207881760108,R03C01,FIN13295A3,C1,207881760108_R03C01_FIN13295A3_C1,I_D_003_DNA,,207881760108_R03C01
3,207881760108,R04C01,FIN13295A4,D1,207881760108_R04C01_FIN13295A4_D1,I_D_004_DNA,,207881760108_R04C01
4,207881760108,R05C01,FIN13295A5,E1,207881760108_R05C01_FIN13295A5_E1,I_D_005_DNA,,207881760108_R05C01
5,207881760108,R06C01,FIN13295A6,F1,207881760108_R06C01_FIN13295A6_F1,I_D_006_DNA,,207881760108_R06C01
6,207881760108,R07C01,FIN13295A7,G1,207881760108_R07C01_FIN13295A7_G1,I_D_007_DNA,,207881760108_R07C01
7,207881760108,R08C01,FIN13295A8,H1,207881760108_R08C01_FIN13295A8_H1,I_D_008_DNA,,207881760108_R08C01
8,207881760123,R01C01,FIN13295A9,A2,207881760123_R01C01_FIN13295A9_A2,I_D_009_DNA,,207881760123_R01C01
9,207881760123,R02C01,FIN13295A10,B2,207881760123_R02C01_FIN13295A10_B2,I_D_010_DNA,,207881760123_R02C01


In [70]:
# Distinct Sentrix IDs, in order of first appearance in the sample sheet
df['Sentrix_ID'].drop_duplicates().tolist()

['207881760108',
 '207881760123',
 '207881760119',
 '207881760106',
 '207881760128',
 '207881760121',
 '207881760037',
 '207881760097',
 '207881760129',
 '207881760120',
 '207881760117',
 '207881760004']

In [71]:
# Directory that receives the per-Sentrix_ID sample sheets. It is defined in
# this cell so the export does not depend on another cell having been run.
base_directory = '../Dataset/Methylomics/FIN13296/iDAT'

# One dataframe per Sentrix_ID, keyed by that ID
dfs = {sentrix_id: group for sentrix_id, group in df.groupby('Sentrix_ID')}

# Write each group to '<Sentrix_ID>_Sample_Sheet.csv' in the base directory,
# without the row index
for sentrix_id, df_group in dfs.items():
    filename = f"{base_directory}/{sentrix_id}_Sample_Sheet.csv"
    df_group.to_csv(filename, index=False)

In [75]:
# Base directory containing the batch folders and the per-batch sample sheets
base_directory = '../Dataset/Methylomics/FIN13296/iDAT'

# Move each '<batch>_Sample_Sheet.csv' into its batch folder as 'Sample_Sheet.csv'
for batch in os.listdir(base_directory):
    # Only directories are batches; this skips '.DS_Store', the sample-sheet
    # CSVs themselves and any other loose file.
    if not os.path.isdir(os.path.join(base_directory, batch)):
        continue

    src_path = f'{base_directory}/{batch}_Sample_Sheet.csv'
    target_path = f'{base_directory}/{batch}/Sample_Sheet.csv'

    # A missing source means the batch has no sheet or it was already moved;
    # skipping keeps the cell safe to re-run instead of raising.
    if not os.path.isfile(src_path):
        print(f"No sample sheet found at '{src_path}'; skipping batch '{batch}'.")
        continue

    # Move the file to the subfolder
    shutil.move(src_path, target_path)
