In [None]:
/home/halechr/repos/PhoGlobusHelpers/filelists/GreatGDriveMigration2023/dirsize

# 2023-09-12 - New Filelist Transfer

In [19]:
from typing import List
from pathlib import Path
import numpy as np
import pandas as pd

def chunk_dataframe(df, size_limit_bytes):
    """
    Splits the dataframe into chunks based on a cumulative size in 'size_bytes' column.

    :param df: DataFrame to split
    :param size_limit_bytes: Max size for each chunk in bytes
    :return: List of dataframes
    """
    chunks = []
    current_chunk = []
    current_chunk_size = 0

    for _, row in df.iterrows():
        if current_chunk_size + row['size_bytes'] > size_limit_bytes:
            chunks.append(pd.DataFrame(current_chunk))
            current_chunk = []
            current_chunk_size = 0

        current_chunk.append(row)
        current_chunk_size += row['size_bytes']

    # Append any remaining data
    if current_chunk:
        chunks.append(pd.DataFrame(current_chunk))

    return chunks

def convert_filelist_to_new_parent(filelist_source: List[Path], original_parent_path: Path = Path(r'/media/MAX/cloud/turbo/Data'), dest_parent_path: Path = Path(r'/media/MAX/Data')):
    """ Converts a list of file paths from their current parent, specified by `original_parent_path`, to their new parent `dest_parent_path` """
    filelist_dest = []
    for path in filelist_source:
        relative_path = str(path.relative_to(original_parent_path))
        new_path = Path(dest_parent_path) / relative_path
        filelist_dest.append(new_path)
    return filelist_dest

# Load filelist from disk
# active_filelist_path = Path('/home/halechr/repo/PhoGlobusHelpers/filelists/session_results_filelist_2023-07-12.csv').resolve()
# active_filelist_path = Path('/home/halechr/repos/PhoGlobusHelpers/filelists/GreatGDriveMigration2023/dirsize/Bapun_dirsize.csv').resolve()
active_root_filelist_parent_path = Path('/home/halechr/Desktop/GreatGDriveMigration2023/dirsize').resolve()
active_filelist_files = active_root_filelist_parent_path.glob('*.csv')
filelist_dfs_list = []
filelists_list = []
for a_file in active_filelist_files:
    print(f'a_file: {a_file}')
    active_filelist_path = Path(a_file).resolve()
    try:
        user, filename_suffix = active_filelist_path.name.split('_')
        print(f'\t{user}')
        parent_user_folder = Path(f"Data/{user}")
        print(f'\t{parent_user_folder}')
        filelist_df = pd.read_csv(active_filelist_path, header=0, names=["name", "modified_dt", "size_bytes"])
        all_files_list = [parent_user_folder.joinpath(_a_file) for _a_file in filelist_df["name"]]
        filelist_df['name'] = all_files_list
        filelist_dfs_list.append(filelist_df)
        filelists_list.append(all_files_list)
    except pd.errors.ParserError:
        print(f'encountered parser error for {active_filelist_path}')
    except BaseException:
        raise

combined_filelist_df = pd.concat(filelist_dfs_list)
combined_filelist_df

a_file: /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/Rachel_dirsize.csv
	Rachel
	Data/Rachel
a_file: /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/Nat_dirsize.csv
	Nat
	Data/Nat
a_file: /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/Utku_dirsize.csv
	Utku
	Data/Utku
a_file: /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/Laurel_dirsize.csv
	Laurel
	Data/Laurel
encountered parser error for /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/Laurel_dirsize.csv
a_file: /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/Kourosh_dirsize.csv
	Kourosh
	Data/Kourosh
a_file: /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/Bapun_dirsize.csv
	Bapun
	Data/Bapun
a_file: /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/Jahngir_dirsize.csv
	Jahngir
	Data/Jahngir
a_file: /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/Hiro_dirsize.csv
	Hiro
	Data/Hiro
a_file: /home/halechr/Desktop/GreatGDriveMigration2023/dirsize/KDIBA_dirsize.csv
	KDIBA
	Da

Unnamed: 0,name,modified_dt,size_bytes
0,Data/Rachel/Take 2021-11-24 11.23.05 AM.csv,2022-08-11 12:41:13,1888355
1,Data/Rachel/Take 2021-11-24 11.13.41 AM.csv,2022-08-11 12:40:48,731099
2,Data/Rachel/Take 2021-11-24 11.12.49 AM.csv,2022-08-11 12:39:44,13590
3,Data/Rachel/Take 2021-08-27 12.59.15 PM.tak,2022-02-25 13:09:10,10307644
4,Data/Rachel/Take 2021-11-24 11.34.03 AM.tak,2022-02-25 13:00:58,262879235
...,...,...,...
690,Data/Output/2023-06-01/kdiba/vvp01/two/2006-4-...,2023-06-01 02:09:10,537957
691,Data/Output/2023-06-01/kdiba/vvp01/two/2006-4-...,2023-06-01 02:09:08,748944
692,Data/Output/2023-06-01/kdiba/vvp01/two/2006-4-...,2023-06-01 02:09:03,735368
693,Data/Output/2023-06-01/kdiba/vvp01/two/2006-4-...,2023-06-01 02:08:58,500886


In [26]:
total_all_files_num_GB = combined_filelist_df.size_bytes.sum()/1e9
total_all_files_num_GB # 65467.555576481

65467.555576481

In [None]:
size_limit = 750e9  # 750GB in bytes
size_limit = 3999e9  # 4TB in bytes
chunks = chunk_dataframe(combined_filelist_df, size_limit)
num_chunks = len(chunks)
print(f'num_chunks: {num_chunks}')
# for chunk in chunks:
#     print(chunk)

Unnamed: 0,name,modified_dt,size_bytes
0,Data/Rachel/Take 2021-11-24 11.23.05 AM.csv,2022-08-11 12:41:13,1888355
1,Data/Rachel/Take 2021-11-24 11.13.41 AM.csv,2022-08-11 12:40:48,731099
2,Data/Rachel/Take 2021-11-24 11.12.49 AM.csv,2022-08-11 12:39:44,13590
3,Data/Rachel/Take 2021-08-27 12.59.15 PM.tak,2022-02-25 13:09:10,10307644
4,Data/Rachel/Take 2021-11-24 11.34.03 AM.tak,2022-02-25 13:00:58,262879235
...,...,...,...
690,Data/Output/2023-06-01/kdiba/vvp01/two/2006-4-...,2023-06-01 02:09:10,537957
691,Data/Output/2023-06-01/kdiba/vvp01/two/2006-4-...,2023-06-01 02:09:08,748944
692,Data/Output/2023-06-01/kdiba/vvp01/two/2006-4-...,2023-06-01 02:09:03,735368
693,Data/Output/2023-06-01/kdiba/vvp01/two/2006-4-...,2023-06-01 02:08:58,500886


In [None]:

filelist_df = pd.read_csv(active_filelist_path, header=0, names=["name", "modified_dt", "size_bytes"])
filelist_df


In [9]:
np.sum(filelist_df['size_bytes']/(1024.0 * 1024.0)) # Convert to GigaBytes (GB)



21955757.33845806

In [6]:
filelist_df.dtypes

name           object
modified_dt    object
size_bytes      int64
dtype: object

In [None]:
filelist_source = [Path(a_path_str).resolve() for a_path_str in filelist_df.Path]



source_parent_path = Path(r'/media/MAX/cloud/turbo/Data')
dest_parent_path = Path(r'/media/MAX/Data')
# # Build the destination filelist from the source_filelist and the two paths:
filelist_dest = convert_filelist_to_new_parent(filelist_source, original_parent_path=source_parent_path, dest_parent_path=dest_parent_path)
filelist_dest

# filelist_source
# filelist_dest