In [11]:
import numpy as np
import pandas as pd
from pathlib import Path

def sort_dataframe_by_explicit_list(df, column_name, explicit_order):
    sorter_index = dict(zip(explicit_order, range(len(explicit_order))))
    df['Rank'] = df[column_name].map(sorter_index)
    return df.sort_values('Rank').drop('Rank', axis=1)


# Raw data as string
data_str = """
# Data/Bapun
<5>NOTICE: Google drive root 'Data/Bapun': Size may be underestimated due to 2 objects with unknown size
Total objects: 5.615k (5615)
Total size: 20.939 TiB (23022337704558 Byte)
Total objects with unknown size: 2 (2)

# Data/Hiro:
Total objects: 8.036k (8036)
Total size: 4.991 TiB (5487246060762 Byte)

# Data/Jahangir
<5>NOTICE: Google drive root 'Data/Jahangir': Size may be underestimated due to 37 objects with unknown size
Total objects: 2.720k (2720)
Total size: 15.024 TiB (16519203892508 Byte)
Total objects with unknown size: 37 (37)

# Data/KDIBA
Total objects: 10.002k (10002)
Total size: 351.423 GiB (377337175347 Byte)

# Data/Kourosh
Total objects: 84 (84)
Total size: 312.210 GiB (335233248677 Byte)

# Data/Laurel
Total objects: 32.095k (32095)
Total size: 6.628 TiB (7287121141962 Byte)

# Data/Nat
<5>NOTICE: Google drive root 'Data/Nat': Size may be underestimated due to 49 objects with unknown size
Total objects: 88.774k (88774)
Total size: 7.444 TiB (8185217690135 Byte)
Total objects with unknown size: 49 (49)

# Data/Output
Total objects: 696 (696)
Total size: 212.664 MiB (222994516 Byte)

# Data/Rachel
Total objects: 2.061k (2061)
Total size: 1.102 TiB (1211611387062 Byte)

# Data/Utku
Total objects: 2.698k (2698)
Total size: 9.468 TiB (10409887413858 Byte)
"""

# Split the data into sections for each name
sections = [section.strip() for section in data_str.split("#") if section.strip()]

# Extract the data for each section
data = []
for section in sections:
    lines = section.split('\n')
    name = lines[0].replace(":", "").strip()
    
    # Extract total objects
    total_objects_line = [line for line in lines if "Total objects:" in line][0]
    total_objects = int(total_objects_line.split('(')[1].split(')')[0])
    
    # Extract total size and convert to GiB
    total_size_line = [line for line in lines if "Total size:" in line][0]
    size_value = float(total_size_line.split(' ')[2])
    size_unit = total_size_line.split(' ')[3]
    if size_unit == "TiB":
        size_value *= 1024  # Convert TiB to GiB
    elif size_unit == "MiB":
        size_value /= 1024  # Convert MiB to GiB
    
    # Extract notice if exists
    notice_line = [line for line in lines if "NOTICE:" in line]
    notice = notice_line[0] if notice_line else ""
    
    data.append([name, total_objects, size_value, notice])

# Convert data to DataFrame
df = pd.DataFrame(data, columns=['Name', 'Total objects', 'Total Size (GiB)', 'Notes'])
# Extracting 'Total objects with unknown size' information
for section in sections:
    lines = section.split('\n')
    name = lines[0].replace(":", "").strip()
    
    # Find the row with the given name
    row_idx = df[df['Name'] == name].index[0]
    
    # Extract total objects with unknown size
    unsizable_objects_line = [line for line in lines if "Total objects with unknown size:" in line]
    if unsizable_objects_line:
        unsizable_objects = int(unsizable_objects_line[0].split('(')[1].split(')')[0])
    else:
        unsizable_objects = 0
    
    df.at[row_idx, 'unsizable_objects'] = unsizable_objects


df = df[['Name', 'Total objects', 'Total Size (GiB)', 'unsizable_objects']]
df

sorted_df = sort_dataframe_by_explicit_list(df, 'Name', ['Data/KDIBA','Data/Utku','Data/Nat','Data/Bapun','Data/Kourosh','Data/Rachel','Data/Hiro','Data/Laurel','Data/Jahangir'])
print(sorted_df)

            Name  Total objects  Total Size (GiB)  unsizable_objects
3     Data/KDIBA          10002         351.42300                0.0
9      Data/Utku           2698        9695.23200                0.0
6       Data/Nat          88774        7622.65600               49.0
0     Data/Bapun           5615       21441.53600                2.0
4   Data/Kourosh             84         312.21000                0.0
8    Data/Rachel           2061        1128.44800                0.0
1      Data/Hiro           8036        5110.78400                0.0
5    Data/Laurel          32095        6787.07200                0.0
2  Data/Jahangir           2720       15384.57600               37.0
7    Data/Output            696           0.20768                0.0


In [9]:
df.to_clipboard(excel=True, index=False, sep=',')




In [12]:
total_combined_size_GB = df['Total Size (GiB)'].sum()
total_combined_size_GB

67834.1446796875

In [14]:
from phoglobushelpers.data_planning_helpers import get_mounted_disks_info, DiskInfo

# Usage
df = get_mounted_disks_info()
included_mounts = ['/media/MAX', '/run/media/halechr/HUUUGE', '/media/HugePort']
df = df[np.isin(df['mount_point'], included_mounts)]
df

Unnamed: 0,device,mount_point,fstype,total,used,free,percent
30,/dev/sda2,/media/HugePort,exfat,20000.538624,19271.786693,728.751931,96.4
68,/dev/sdb1,/media/MAX,ext4,15873.631351,11699.510141,3374.059508,77.6
69,/dev/sdd1,/run/media/halechr/HUUUGE,btrfs,20000.580567,5764.704481,14234.491343,28.8


In [15]:
df.free.sum()

18337.302781952

In [None]:
'/media/MAX/cloud/turbo'
'/media/MAX/cloud/Dropbox_Diba_Shared/MED-DibaLabDropbox'


In [None]:
# All refer to the same location
['/media/MAX/cloud/Dropbox_Diba_Shared/MED-DibaLabDropbox/Data',
 '/media/HugePort/Data',
 '/media/MAX/Data',
 '/run/media/halechr/HUUUGE/Data',
 '/media/MAX/cloud/turbo/Data,'
]

