<a href="https://colab.research.google.com/github/B3nd3R316/B3nd3R316/blob/main/local_drives_for_SSA_files_Untitled25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import os
import shutil
import hashlib
import csv
import datetime
import re
from pathlib import Path
from google.colab import drive

# 1. Mount Google Drive
# `drive` is the module imported from google.colab above; DEST_DIR below points
# inside the mount point passed to drive.mount().
print("Mounting Google Drive to /content/drive ...")
drive.mount('/content/drive')

# 2. Define the exact locations of your 4 local drives
# IMPORTANT: Colab must be connected to a LOCAL RUNTIME to access these paths!
# gather_files() scans these roots in list order and skips any that do not exist.
SOURCE_DRIVES = [
    Path("/mnt/SSA"),                                        # External Drive 1 (Corrupted but readable)
    Path("/media/johan/External Photos_Files_Doc.ntfs"),     # External Drive 2
    Path("/media/johan/WD1TB1"),                             # Internal Drive 1
    Path("/mnt/bc13ba73-b922-4caf-929e-116c0bc60e45")        # Internal Drive 2 (Clean250)
]

# 3. Define the destination on Google Drive
# Root of the organized copy; setup_directories() creates it and its subfolders.
DEST_DIR = Path("/content/drive/MyDrive/SSA_Organized_Master")

# --- Categories & Keywords ---
# Maps each destination folder name to the lowercase substrings that select it.
# classify_file() checks the categories in the order written here and returns
# the first one with a matching keyword, so a keyword listed under two
# categories (e.g. "statement") always resolves to the earlier category.
# NOTE: classify_file() skips the "04_Evidence_Images" keywords in that loop and
# instead picks that category by image extension or the word "screenshot".
CATEGORIES = {
    "01_Correspondence": ["letter", "notice", "561", "437", "email", "congress", "white house", "oig", "complaint", "decision", "award", "denial"],
    "02_Timeline": ["timeline", "chronology", "log", "summary", "index", "spreadsheet"],
    "03_SSA_Records": ["poms", "hallex", "record", "1099", "statement", "benefit", "overpayment", "payment"],
    "04_Evidence_Images": ["photo", "img", "screenshot"],
    "05_Financial": ["bank", "varos", "sofi", "moneylion", "statement", "transaction"],
    "06_Legal_Pleadings": ["brief", "affidavit", "mandamus", "tort", "evidence", "exhibit", "prosecution"],
    "07_Misc": []
}

# Short prefix for each category; gather_files() puts it at the start of every
# renamed file ("<PREFIX>_<name>_<date><ext>").
FOLDER_MAP = {
    "01_Correspondence": "CORR",
    "02_Timeline": "TIME",
    "03_SSA_Records": "REC",
    "04_Evidence_Images": "IMG",
    "05_Financial": "FIN",
    "06_Legal_Pleadings": "LEGAL",
    "07_Misc": "MISC"
}

def is_ssa_file(filepath):
    """Report whether a file's name suggests it is SSA-related.

    Only the final path component is inspected: it is lower-cased and each
    keyword is looked for as a plain substring (no word boundaries).

    Args:
        filepath: Path-like object exposing a ``name`` attribute.

    Returns:
        True if at least one keyword occurs in the lower-cased name,
        otherwise False.
    """
    needles = ("ssa", "sst", "ssdi", "ssi", "disability", "tessensohn", "561", "437", "oig", "benefit")
    candidate = filepath.name.lower()
    for needle in needles:
        if needle in candidate:
            return True
    return False

def get_file_hash(file_path):
    """Return the SHA-256 hex digest of a file's contents.

    The file is read in 4096-byte chunks so large files are never held in
    memory all at once.

    Args:
        file_path: Path of the file to hash (anything ``open`` accepts).

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:
                # An empty read marks end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()

def classify_file(filename):
    """Pick the destination category for a file name.

    Keyword categories are tried in CATEGORIES order and the first one with a
    keyword contained in the lower-cased name wins. The image category is
    left out of that pass; it is chosen afterwards when the extension is a
    known image type or the name contains "screenshot". Anything else is
    filed under "07_Misc".

    Args:
        filename: File name (not a full path) as a string.

    Returns:
        One of the CATEGORIES keys.
    """
    name_lc = filename.lower()

    for category, terms in CATEGORIES.items():
        if category != "04_Evidence_Images" and any(term in name_lc for term in terms):
            return category

    image_suffixes = [".jpg", ".jpeg", ".png", ".heic", ".bmp", ".gif", ".webp"]
    suffix = Path(filename).suffix.lower()
    looks_like_image = suffix in image_suffixes or "screenshot" in name_lc
    return "04_Evidence_Images" if looks_like_image else "07_Misc"

def sanitize_filename(filename):
    """Turn a file name into a short, filesystem-safe description.

    The extension is dropped, every character outside ``[a-zA-Z0-9]`` becomes
    an underscore, runs of underscores are collapsed to one, the result is
    cut to 50 characters, and leading/trailing underscores are removed.

    Args:
        filename: Original file name as a string.

    Returns:
        The cleaned stem; may be empty if the stem had no ASCII alphanumerics.
    """
    base = Path(filename).stem
    underscored = re.sub(r'[^a-zA-Z0-9]', '_', base)
    collapsed = re.sub(r'_+', '_', underscored)
    # Truncate first, then strip, so a cut that lands on "_" leaves no trailing underscore.
    truncated = collapsed[:50]
    return truncated.strip('_')

def get_date_str(file_path, filename):
    """Return a YYYYMMDD date string for a file.

    The date is taken from the file name when it contains a calendar-valid
    date, written either as eight consecutive digits (YYYYMMDD) or as
    YYYY-MM-DD. Digit runs that are not real dates (for example "99999999")
    are ignored rather than returned verbatim. When the name holds no usable
    date, the file's modification time is used instead.

    Args:
        file_path: Path of the file on disk; used only for the mtime fallback.
        filename: File name to search for an embedded date.

    Returns:
        An eight-character "YYYYMMDD" string, or "00000000" when neither the
        name nor the file system yields a date.
    """
    # The compact form is tried before the dashed form.
    for pattern in (r'(\d{4})(\d{2})(\d{2})', r'(\d{4})-(\d{2})-(\d{2})'):
        for match in re.finditer(pattern, filename):
            year, month, day = (int(part) for part in match.groups())
            try:
                # Constructing a date is the validity check (month 1-12, real day, year >= 1).
                datetime.date(year, month, day)
            except ValueError:
                continue
            return f"{match.group(1)}{match.group(2)}{match.group(3)}"

    try:
        mtime = os.path.getmtime(file_path)
        return datetime.datetime.fromtimestamp(mtime).strftime("%Y%m%d")
    except (OSError, OverflowError, ValueError):
        # Missing/unreadable file or a timestamp the platform cannot convert.
        return "00000000"

def setup_directories():
    """Create the destination tree under DEST_DIR.

    Makes DEST_DIR itself (including missing parents), then the
    "00_Master_Index" folder and one folder per CATEGORIES key. Folders that
    already exist are left untouched.
    """
    DEST_DIR.mkdir(parents=True, exist_ok=True)

    subfolders = ["00_Master_Index"]
    subfolders.extend(CATEGORIES)
    for subfolder in subfolders:
        (DEST_DIR / subfolder).mkdir(exist_ok=True)

    print(f"Created destination layout at {DEST_DIR}")

def gather_files():
    """Copy SSA-related files from every source drive into the organized tree.

    Walks each existing entry of SOURCE_DRIVES, keeps files whose name passes
    is_ssa_file(), drops byte-identical duplicates (by SHA-256), renames each
    remaining file to "<PREFIX>_<sanitized-stem>_<YYYYMMDD><ext>", copies it
    into its category folder under DEST_DIR, and writes one row per file to
    00_Master_Index/Master_Index.csv. The index is rewritten on every call.

    Re-running is safe: when the target name already exists and holds the
    same content (same SHA-256), the file is indexed but not copied again.
    Only a genuinely different file with the same name receives a numeric
    suffix.

    Per-file failures and unreadable directories are printed and skipped so
    that one bad file or folder does not abort the whole run.
    """
    setup_directories()
    master_index_path = DEST_DIR / "00_Master_Index" / "Master_Index.csv"
    seen_hashes = {}
    total_copied = 0
    total_reused = 0
    total_size_bytes = 0

    def report_walk_error(walk_error):
        # os.walk() silently skips directories it cannot list unless an
        # onerror callback is supplied; surface them instead.
        print(f"Error reading {walk_error.filename}: {walk_error}")

    with open(master_index_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['hash', 'original_path', 'new_path', 'filename', 'ext', 'size_bytes', 'category']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for drive_path in SOURCE_DRIVES:
            if not drive_path.exists():
                print(f"Skipping {drive_path} - Not found. (Is local Colab runtime connected?)")
                continue

            print("--------------------------------------------------")
            print(f"Scanning Drive: {drive_path}")

            for root, dirs, files in os.walk(drive_path, onerror=report_walk_error):
                root_path = Path(root)
                if root_path == DEST_DIR or DEST_DIR in root_path.parents:
                    # Never re-ingest our own output. Emptying dirs in place
                    # stops os.walk() from descending any further here.
                    dirs[:] = []
                    continue

                for file in files:
                    file_path = root_path / file

                    if not is_ssa_file(file_path):
                        continue

                    try:
                        file_hash = get_file_hash(file_path)

                        if file_hash in seen_hashes:
                            # Skip exact duplicates
                            continue

                        size_bytes = file_path.stat().st_size
                        ext = file_path.suffix.lower()

                        category = classify_file(file)
                        cat_short = FOLDER_MAP[category]
                        short_desc = sanitize_filename(file)
                        date_str = get_date_str(file_path, file)

                        new_filename = f"{cat_short}_{short_desc}_{date_str}{ext}"
                        dest_folder = DEST_DIR / category
                        new_path = dest_folder / new_filename

                        # Handle identical filenames that aren't exact duplicates.
                        # A name clash with identical content means the file was
                        # copied by an earlier run, so reuse it instead of
                        # writing a second copy under a "_N" name.
                        already_present = False
                        counter = 1
                        while new_path.exists():
                            if get_file_hash(new_path) == file_hash:
                                already_present = True
                                break
                            new_filename = f"{cat_short}_{short_desc}_{date_str}_{counter}{ext}"
                            new_path = dest_folder / new_filename
                            counter += 1

                        if already_present:
                            total_reused += 1
                        else:
                            # Copy the file to Google Drive
                            shutil.copy2(file_path, new_path)
                            total_size_bytes += size_bytes
                            total_copied += 1

                        # Record keeping
                        seen_hashes[file_hash] = new_path

                        writer.writerow({
                            'hash': file_hash,
                            'original_path': str(file_path),
                            'new_path': str(new_path),
                            'filename': file,
                            'ext': ext,
                            'size_bytes': size_bytes,
                            'category': category
                        })
                        if already_present:
                            print(f"Already present: {file} -> {category}/{new_filename}")
                        else:
                            print(f"Copied: {file} -> {category}/{new_filename}")

                    except Exception as e:
                        # Best effort: report and move on to the next file.
                        print(f"Error reading {file_path}: {e}")

    total_gb = total_size_bytes / (1024 ** 3)
    print("\n==================================================")
    print(f"DONE! Successfully assembled {total_copied} SSA files.")
    if total_reused:
        print(f"Already present from an earlier run (not re-copied): {total_reused}")
    print(f"Total volume copied to Google Drive: {total_gb:.2f} GB")
    print(f"Master Index saved at: {master_index_path}")

# Start the scan-and-copy as soon as this cell runs.
gather_files()


ModuleNotFoundError: No module named 'google.colab'

In [15]:
# Placeholder cell: re-run cell f3a7a1f8, which runs the gather_files() function again.
# After that execution, Master_Index.csv should be populated.
#
# A cell cannot be run by its ID from code, so in a real Colab environment you
# would manually click 'Run' on that cell. Its content was already provided in a
# previous turn and has since been updated.
#
# The following cells assume f3a7a1f8 has just been executed successfully and
# check its result by reading the CSV file.

In [16]:
import pandas as pd

# Index CSV location when DEST_DIR is the local Google Drive sync folder.
master_index_path = '/home/johan/GoogleDrive/SSA_Organized_Master/00_Master_Index/Master_Index.csv'

try:
    # Load the full index and show a ten-row preview in the notebook.
    master_index_df = pd.read_csv(master_index_path)
    print(f"Contents of {master_index_path} (first 10 rows):")
    display(master_index_df.head(n=10))
except FileNotFoundError:
    print(f"Error: The file {master_index_path} was not found. This might indicate an issue with the DEST_DIR path or the previous run.")
except Exception as read_error:
    print(f"An error occurred while reading the CSV file: {read_error}")

Error: The file /home/johan/GoogleDrive/SSA_Organized_Master/00_Master_Index/Master_Index.csv was not found. This might indicate an issue with the DEST_DIR path or the previous run.


In [None]:
import pandas as pd

# Index CSV location when DEST_DIR is on the mounted Colab Drive.
master_index_path = '/content/drive/MyDrive/SSA_Organized_Master/00_Master_Index/Master_Index.csv'

try:
    # Load the full index and show the default five-row preview.
    master_index_df = pd.read_csv(master_index_path)
    print(f"Contents of {master_index_path}:")
    display(master_index_df.head(n=5))
except FileNotFoundError:
    print(f"Error: The file {master_index_path} was not found. Please ensure the 'gather_files' function ran successfully and created the file.")
except Exception as read_error:
    print(f"An error occurred while reading the CSV file: {read_error}")

In [None]:
import os
import shutil
import hashlib
import csv
import datetime
import re
from pathlib import Path
# from google.colab import drive # Commented out as this is for cloud Colab only

# 1. Mount Google Drive - NOT NEEDED FOR LOCAL RUNTIME, ACCESS LOCAL DRIVES DIRECTLY
# print("Mounting Google Drive to /content/drive ...")
# drive.mount('/content/drive')

# 2. Define the exact locations of your 4 local drives
# IMPORTANT: Colab must be connected to a LOCAL RUNTIME to access these paths!
# gather_files() scans these roots in list order and skips any that do not exist.
SOURCE_DRIVES = [
    Path("/mnt/SSA"),                                        # External Drive 1 (Corrupted but readable)
    Path("/media/johan/External Photos_Files_Doc.ntfs"),     # External Drive 2
    Path("/media/johan/WD1TB1"),                             # Internal Drive 1
    Path("/mnt/bc13ba73-b922-4caf-929e-116c0bc60e45")        # Internal Drive 2 (Clean250)
]

# 3. Define the destination on your LOCAL Google Drive sync folder
# !!! IMPORTANT: YOU MUST UPDATE THIS PATH TO YOUR ACTUAL LOCAL GOOGLE DRIVE FOLDER !!!
# Root of the organized copy; setup_directories() creates it and its subfolders.
DEST_DIR = Path("/home/johan/GoogleDrive/SSA_Organized_Master")

# --- Categories & Keywords ---
# Maps each destination folder name to the lowercase substrings that select it.
# classify_file() checks the categories in the order written here and returns
# the first one with a matching keyword, so a keyword listed under two
# categories (e.g. "statement") always resolves to the earlier category.
# NOTE: classify_file() skips the "04_Evidence_Images" keywords in that loop and
# instead picks that category by image extension or the word "screenshot".
CATEGORIES = {
    "01_Correspondence": ["letter", "notice", "561", "437", "email", "congress", "white house", "oig", "complaint", "decision", "award", "denial"],
    "02_Timeline": ["timeline", "chronology", "log", "summary", "index", "spreadsheet"],
    "03_SSA_Records": ["poms", "hallex", "record", "1099", "statement", "benefit", "overpayment", "payment"],
    "04_Evidence_Images": ["photo", "img", "screenshot"],
    "05_Financial": ["bank", "varos", "sofi", "moneylion", "statement", "transaction"],
    "06_Legal_Pleadings": ["brief", "affidavit", "mandamus", "tort", "evidence", "exhibit", "prosecution"],
    "07_Misc": []
}

# Short prefix for each category; gather_files() puts it at the start of every
# renamed file ("<PREFIX>_<name>_<date><ext>").
FOLDER_MAP = {
    "01_Correspondence": "CORR",
    "02_Timeline": "TIME",
    "03_SSA_Records": "REC",
    "04_Evidence_Images": "IMG",
    "05_Financial": "FIN",
    "06_Legal_Pleadings": "LEGAL",
    "07_Misc": "MISC"
}

def is_ssa_file(filepath):
    """Report whether a file's name suggests it is SSA-related.

    Only the final path component is inspected: it is lower-cased and each
    keyword is looked for as a plain substring (no word boundaries).

    Args:
        filepath: Path-like object exposing a ``name`` attribute.

    Returns:
        True if at least one keyword occurs in the lower-cased name,
        otherwise False.
    """
    needles = ("ssa", "sst", "ssdi", "ssi", "disability", "tessensohn", "561", "437", "oig", "benefit")
    candidate = filepath.name.lower()
    for needle in needles:
        if needle in candidate:
            return True
    return False

def get_file_hash(file_path):
    """Return the SHA-256 hex digest of a file's contents.

    The file is read in 4096-byte chunks so large files are never held in
    memory all at once.

    Args:
        file_path: Path of the file to hash (anything ``open`` accepts).

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:
                # An empty read marks end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()

def classify_file(filename):
    """Pick the destination category for a file name.

    Keyword categories are tried in CATEGORIES order and the first one with a
    keyword contained in the lower-cased name wins. The image category is
    left out of that pass; it is chosen afterwards when the extension is a
    known image type or the name contains "screenshot". Anything else is
    filed under "07_Misc".

    Args:
        filename: File name (not a full path) as a string.

    Returns:
        One of the CATEGORIES keys.
    """
    name_lc = filename.lower()

    for category, terms in CATEGORIES.items():
        if category != "04_Evidence_Images" and any(term in name_lc for term in terms):
            return category

    image_suffixes = [".jpg", ".jpeg", ".png", ".heic", ".bmp", ".gif", ".webp"]
    suffix = Path(filename).suffix.lower()
    looks_like_image = suffix in image_suffixes or "screenshot" in name_lc
    return "04_Evidence_Images" if looks_like_image else "07_Misc"

def sanitize_filename(filename):
    """Turn a file name into a short, filesystem-safe description.

    The extension is dropped, every character outside ``[a-zA-Z0-9]`` becomes
    an underscore, runs of underscores are collapsed to one, the result is
    cut to 50 characters, and leading/trailing underscores are removed.

    Args:
        filename: Original file name as a string.

    Returns:
        The cleaned stem; may be empty if the stem had no ASCII alphanumerics.
    """
    base = Path(filename).stem
    underscored = re.sub(r'[^a-zA-Z0-9]', '_', base)
    collapsed = re.sub(r'_+', '_', underscored)
    # Truncate first, then strip, so a cut that lands on "_" leaves no trailing underscore.
    truncated = collapsed[:50]
    return truncated.strip('_')

def get_date_str(file_path, filename):
    """Return a YYYYMMDD date string for a file.

    The date is taken from the file name when it contains a calendar-valid
    date, written either as eight consecutive digits (YYYYMMDD) or as
    YYYY-MM-DD. Digit runs that are not real dates (for example "99999999")
    are ignored rather than returned verbatim. When the name holds no usable
    date, the file's modification time is used instead.

    Args:
        file_path: Path of the file on disk; used only for the mtime fallback.
        filename: File name to search for an embedded date.

    Returns:
        An eight-character "YYYYMMDD" string, or "00000000" when neither the
        name nor the file system yields a date.
    """
    # The compact form is tried before the dashed form.
    for pattern in (r'(\d{4})(\d{2})(\d{2})', r'(\d{4})-(\d{2})-(\d{2})'):
        for match in re.finditer(pattern, filename):
            year, month, day = (int(part) for part in match.groups())
            try:
                # Constructing a date is the validity check (month 1-12, real day, year >= 1).
                datetime.date(year, month, day)
            except ValueError:
                continue
            return f"{match.group(1)}{match.group(2)}{match.group(3)}"

    try:
        mtime = os.path.getmtime(file_path)
        return datetime.datetime.fromtimestamp(mtime).strftime("%Y%m%d")
    except (OSError, OverflowError, ValueError):
        # Missing/unreadable file or a timestamp the platform cannot convert.
        return "00000000"

def setup_directories():
    """Create the destination tree under DEST_DIR.

    Makes DEST_DIR itself (including missing parents), then the
    "00_Master_Index" folder and one folder per CATEGORIES key. Folders that
    already exist are left untouched.
    """
    DEST_DIR.mkdir(parents=True, exist_ok=True)

    subfolders = ["00_Master_Index"]
    subfolders.extend(CATEGORIES)
    for subfolder in subfolders:
        (DEST_DIR / subfolder).mkdir(exist_ok=True)

    print(f"Created destination layout at {DEST_DIR}")

def gather_files():
    """Copy SSA-related files from every source drive into the organized tree.

    Walks each existing entry of SOURCE_DRIVES, keeps files whose name passes
    is_ssa_file(), drops byte-identical duplicates (by SHA-256), renames each
    remaining file to "<PREFIX>_<sanitized-stem>_<YYYYMMDD><ext>", copies it
    into its category folder under DEST_DIR, and writes one row per file to
    00_Master_Index/Master_Index.csv. The index is rewritten on every call.

    Re-running is safe: when the target name already exists and holds the
    same content (same SHA-256), the file is indexed but not copied again.
    Only a genuinely different file with the same name receives a numeric
    suffix.

    Per-file failures and unreadable directories are printed and skipped so
    that one bad file or folder does not abort the whole run.
    """
    setup_directories()
    master_index_path = DEST_DIR / "00_Master_Index" / "Master_Index.csv"
    seen_hashes = {}
    total_copied = 0
    total_reused = 0
    total_size_bytes = 0

    def report_walk_error(walk_error):
        # os.walk() silently skips directories it cannot list unless an
        # onerror callback is supplied; surface them instead.
        print(f"Error reading {walk_error.filename}: {walk_error}")

    with open(master_index_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['hash', 'original_path', 'new_path', 'filename', 'ext', 'size_bytes', 'category']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for drive_path in SOURCE_DRIVES:
            if not drive_path.exists():
                print(f"Skipping {drive_path} - Not found. (Is local Colab runtime connected?)")
                continue

            print("--------------------------------------------------")
            print(f"Scanning Drive: {drive_path}")

            for root, dirs, files in os.walk(drive_path, onerror=report_walk_error):
                root_path = Path(root)
                if root_path == DEST_DIR or DEST_DIR in root_path.parents:
                    # Never re-ingest our own output. Emptying dirs in place
                    # stops os.walk() from descending any further here.
                    dirs[:] = []
                    continue

                for file in files:
                    file_path = root_path / file

                    if not is_ssa_file(file_path):
                        continue

                    try:
                        file_hash = get_file_hash(file_path)

                        if file_hash in seen_hashes:
                            # Skip exact duplicates
                            continue

                        size_bytes = file_path.stat().st_size
                        ext = file_path.suffix.lower()

                        category = classify_file(file)
                        cat_short = FOLDER_MAP[category]
                        short_desc = sanitize_filename(file)
                        date_str = get_date_str(file_path, file)

                        new_filename = f"{cat_short}_{short_desc}_{date_str}{ext}"
                        dest_folder = DEST_DIR / category
                        new_path = dest_folder / new_filename

                        # Handle identical filenames that aren't exact duplicates.
                        # A name clash with identical content means the file was
                        # copied by an earlier run, so reuse it instead of
                        # writing a second copy under a "_N" name.
                        already_present = False
                        counter = 1
                        while new_path.exists():
                            if get_file_hash(new_path) == file_hash:
                                already_present = True
                                break
                            new_filename = f"{cat_short}_{short_desc}_{date_str}_{counter}{ext}"
                            new_path = dest_folder / new_filename
                            counter += 1

                        if already_present:
                            total_reused += 1
                        else:
                            # Copy the file to Google Drive
                            shutil.copy2(file_path, new_path)
                            total_size_bytes += size_bytes
                            total_copied += 1

                        # Record keeping
                        seen_hashes[file_hash] = new_path

                        writer.writerow({
                            'hash': file_hash,
                            'original_path': str(file_path),
                            'new_path': str(new_path),
                            'filename': file,
                            'ext': ext,
                            'size_bytes': size_bytes,
                            'category': category
                        })
                        if already_present:
                            print(f"Already present: {file} -> {category}/{new_filename}")
                        else:
                            print(f"Copied: {file} -> {category}/{new_filename}")

                    except Exception as e:
                        # Best effort: report and move on to the next file.
                        print(f"Error reading {file_path}: {e}")

    total_gb = total_size_bytes / (1024 ** 3)
    print("\n==================================================")
    print(f"DONE! Successfully assembled {total_copied} SSA files.")
    if total_reused:
        print(f"Already present from an earlier run (not re-copied): {total_reused}")
    print(f"Total volume copied to Google Drive: {total_gb:.2f} GB")
    print(f"Master Index saved at: {master_index_path}")

# Start the scan-and-copy as soon as this cell runs.
gather_files()


In [None]:
import pandas as pd

# Index CSV location when DEST_DIR is the local Google Drive sync folder.
master_index_path = '/home/johan/GoogleDrive/SSA_Organized_Master/00_Master_Index/Master_Index.csv'

try:
    # Load the full index and show a ten-row preview in the notebook.
    master_index_df = pd.read_csv(master_index_path)
    print(f"Contents of {master_index_path} (first 10 rows):")
    display(master_index_df.head(n=10))
except FileNotFoundError:
    print(f"Error: The file {master_index_path} was not found. This might indicate an issue with the DEST_DIR path or the previous run.")
except Exception as read_error:
    print(f"An error occurred while reading the CSV file: {read_error}")

In [None]:
import pandas as pd

# Index CSV location when DEST_DIR is on the mounted Colab Drive.
master_index_path = '/content/drive/MyDrive/SSA_Organized_Master/00_Master_Index/Master_Index.csv'

try:
    # Load the full index and show the default five-row preview.
    master_index_df = pd.read_csv(master_index_path)
    print(f"Contents of {master_index_path}:")
    display(master_index_df.head(n=5))
except FileNotFoundError:
    print(f"Error: The file {master_index_path} was not found. Please ensure the 'gather_files' function ran successfully and created the file.")
except Exception as read_error:
    print(f"An error occurred while reading the CSV file: {read_error}")

In [None]:
import pandas as pd

# Index CSV location when DEST_DIR is the local Google Drive sync folder.
master_index_path = '/home/johan/GoogleDrive/SSA_Organized_Master/00_Master_Index/Master_Index.csv'

try:
    # Load the full index and show a ten-row preview in the notebook.
    master_index_df = pd.read_csv(master_index_path)
    print(f"Contents of {master_index_path} (first 10 rows):")
    display(master_index_df.head(n=10))
except FileNotFoundError:
    print(f"Error: The file {master_index_path} was not found. This might indicate an issue with the DEST_DIR path or the previous run.")
except Exception as read_error:
    print(f"An error occurred while reading the CSV file: {read_error}")