In [1]:
import faiss
import os
import json
from sklearn.preprocessing import normalize
import hdbscan
import numpy as np

In [2]:

# --- Model directories ---
IMG_TXT_PATH = ".././models/img_to_text/"
SUMMARIZER_PATH = ".././models/summarizer/"
# Paths where you previously saved the BLIP models & processors:
LOCAL_CAPTION_DIR = ".././models/img_caption"
LOCAL_RETRIEVAL_DIR    = ".././models/embedding"

# --- File storage locations ---
FILE_STORING_PATH ='.././files_DB'
# The original constant was spelled `ILE_SYSTEM_PATH` (leading "F" dropped).
# The correctly spelled name is defined here; the old spelling is kept as an
# alias so any code that already refers to it keeps working.
FILE_SYSTEM_PATH = '.././File_System_Simulation'
ILE_SYSTEM_PATH  = FILE_SYSTEM_PATH

# --- Vector database files (FAISS index + JSON metadata side by side) ---
VECTOR_DB_PATH = '.././vectorDB'
INDEX_FILE = os.path.join(VECTOR_DB_PATH, "file_index.faiss")
METADATA_FILE = os.path.join(VECTOR_DB_PATH, "index_metadata.json")

# --- Keyword extraction model and output root for the clustered layout ---
KEYWORD_EXTRACTION_MODEL_PATH = "../models/kword_extraction"
NEW_ROOT = "../demo"

In [3]:
from  keybert import KeyBERT
from sentence_transformers import SentenceTransformer


# Module-level state shared by the vector-DB helpers in this cell; the helpers
# declare these names `global` and read/replace them.
index = None  # becomes the object returned by faiss.read_index() once loaded
metadata_list = []  # becomes the JSON-decoded contents of METADATA_FILE once loaded

def load_or_create_vector_db():
    """Load the persisted FAISS index and metadata into the module globals.

    Ensures ``VECTOR_DB_PATH`` exists. When both ``INDEX_FILE`` and
    ``METADATA_FILE`` are present they are read and, only if both reads
    succeed, assigned to the globals ``index`` and ``metadata_list``.

    Notes:
        * When either file is missing, only a message is printed; no index is
          built here and the globals keep their previous values
          (``None`` / ``[]`` on a fresh kernel).
        * When the index size and metadata length differ, a warning is
          printed but the mismatched pair is still kept and returned;
          nothing is actually recreated by this function.
        * Any exception raised while loading is reported with ``print`` and
          the globals are left untouched.

    Returns:
        tuple: ``(index, metadata_list)`` — the current values of the globals.
    """
    global index, metadata_list
    os.makedirs(VECTOR_DB_PATH, exist_ok=True)

    if os.path.exists(INDEX_FILE) and os.path.exists(METADATA_FILE):
        print("Loading existing vector database...")
        try:
            # Read into locals first so a failure half-way through cannot
            # leave a freshly loaded index paired with stale metadata.
            loaded_index = faiss.read_index(INDEX_FILE)
            # The metadata is written as UTF-8 (see save_vector_db), so it
            # must be read as UTF-8 regardless of the platform default.
            with open(METADATA_FILE, 'r', encoding="utf-8") as f:
                loaded_metadata = json.load(f)
            index, metadata_list = loaded_index, loaded_metadata
            # Add consistency check
            if index.ntotal != len(metadata_list):
                print(f"Warning: Index count ({index.ntotal}) doesn't match metadata count ({len(metadata_list)}). Recreating index.")
            else:
                print(f"Loaded vector DB with {index.ntotal} entries")
        except Exception as e:
            print(f"Error loading vector DB: {e}")
    else:
        print("Creating new vector database...")
    return index, metadata_list

def retrieve_all_embeddings():
    """Return every vector stored in the global FAISS ``index`` as one array.

    Returns:
        numpy.ndarray: the result of ``index.reconstruct_n(0, index.ntotal)``.
        When no index has been loaded, or the index holds no vectors, an
        empty float32 array with zero rows is returned instead, so callers
        always receive a single array (the previous empty-case return value
        was a ``(array, list)`` tuple, unlike the non-empty case).
    """
    global index
    if index is None:
        # Nothing loaded yet: the vector dimension is unknown.
        return np.empty((0, 0), dtype=np.float32)
    if index.ntotal == 0:
        # Keep the index's vector dimension so the result is still 2-D.
        return np.empty((0, index.d), dtype=np.float32)
    # Retrieve all embeddings
    return index.reconstruct_n(0, index.ntotal)
def save_vector_db(index, metadata_list):
    """Persist the vector database: FAISS index first, then its metadata.

    Args:
        index: object handed to ``faiss.write_index`` and stored at
            ``INDEX_FILE``.
        metadata_list: JSON-serialisable sequence written to
            ``METADATA_FILE`` as UTF-8, indented, with non-ASCII characters
            kept as-is.
    """
    # Index goes to disk before the metadata, as before.
    faiss.write_index(index, INDEX_FILE)

    with open(METADATA_FILE, mode="w", encoding="utf-8") as metadata_handle:
        json.dump(
            metadata_list,
            metadata_handle,
            ensure_ascii=False,
            indent=2,
        )

    print(f"✅ Saved index to {INDEX_FILE} and metadata ({len(metadata_list)} entries) to {METADATA_FILE}.")



In [None]:
import umap.umap_ as umap
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

def restructure_file_system(metadata_list, file_system_path):
    """Copy each stored file into its new location under ``file_system_path``.

    Args:
        metadata_list: iterable of dicts; each must provide
            ``"storage_path"`` (where the file currently lives) and
            ``"file_path"`` (destination path relative to the new root,
            e.g. ``"swimming/swimming.txt"``).
        file_system_path: root directory of the new layout.

    The destination directory is created when missing. Entries whose source
    file does not exist are reported with a warning and skipped. Source files
    are left in place (``shutil.copy2``).
    """
    # `shutil` is not imported anywhere in the notebook, so import it here to
    # keep this function usable on a fresh kernel.
    import shutil

    for meta in metadata_list:
        src_fullpath = meta["storage_path"]
        new_relpath = meta["file_path"]            # e.g. "swimming/swimming.txt"
        dest_fullpath = os.path.join(file_system_path, new_relpath)

        # 1) Ensure destination directory exists
        dest_folder = os.path.dirname(dest_fullpath)
        if dest_folder:
            # os.makedirs('') raises, which happens when the destination has
            # no directory component at all.
            os.makedirs(dest_folder, exist_ok=True)

        # 2) Copy the file (the original stays where it is)
        if os.path.exists(src_fullpath):
            shutil.copy2(src_fullpath, dest_fullpath)
        else:
            print(f"[WARNING] Source not found: {src_fullpath}")


def cluster_files():
    """Cluster all indexed files and lay them out in per-cluster folders.

    Steps:
        1. Load the vector DB into the module globals.
        2. Reduce the stored embeddings to 4 dimensions with UMAP and cluster
           them with HDBSCAN.
        3. Join the file names of each cluster into one comma-separated
           string and ask KeyBERT for a single keyword per cluster; that
           keyword becomes the cluster's folder name.
        4. Rewrite every ``file_path`` to ``<keyword>/<basename>``, copy or
           move the files via ``restructure_file_system`` into ``NEW_ROOT``
           and persist the updated metadata with ``save_vector_db``.

    Returns early (after printing a message) when there is nothing to
    cluster.

    Raises:
        ValueError: when the number of cluster labels differs from the
            number of metadata entries, because paths would otherwise be
            assigned to the wrong files.
    """
    load_or_create_vector_db()
    # Guard: without a loaded, non-empty index there is nothing to reduce.
    if index is None or index.ntotal == 0:
        print("No embeddings available; skipping clustering.")
        return

    all_embeddings = retrieve_all_embeddings()
    um = umap.UMAP(n_components=4,min_dist=0, metric='cosine',  random_state=42)
    reduced = um.fit_transform(all_embeddings)  # one 4-value row per stored vector
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,     # minimum cluster size
        min_samples=1,                  # aggressive clustering
        cluster_selection_method='leaf' # more fine-grained clusters
    )
    labels = clusterer.fit_predict(reduced)

    # Labels are matched to metadata purely by position, so the two
    # sequences must line up one-to-one.
    if len(labels) != len(metadata_list):
        raise ValueError(
            f"Got {len(labels)} cluster labels for {len(metadata_list)} metadata entries; "
            "index and metadata are out of sync."
        )

    # Collect file names per cluster: {cluster_label: [file_name, ...]}.
    # NOTE(review): HDBSCAN marks noise points with label -1; they are
    # treated here as one more group, as before — confirm that is intended.
    # NOTE(review): the exploratory cells further down build these documents
    # from 'content' instead of 'file_name' — confirm which field is wanted.
    members = {}
    for label, meta in zip(labels, metadata_list):
        members.setdefault(label, []).append(meta['file_name'])
    cluster_ids = list(members)
    docs = [','.join(names) for names in members.values()]

    local_embedding_model = SentenceTransformer(KEYWORD_EXTRACTION_MODEL_PATH)
    # Initialize KeyBERT with the locally loaded embedding model
    kw_model = KeyBERT(model=local_embedding_model)
    keyphrases = kw_model.extract_keywords(docs, top_n=1,)
    # For a single document KeyBERT returns that document's keyword list
    # directly instead of a list of lists; re-wrap it so the loop below sees
    # one keyword list per cluster (otherwise `kphrase[0][0]` would be the
    # first *character* of the keyword).
    if len(docs) == 1 and (not keyphrases or isinstance(keyphrases[0], tuple)):
        keyphrases = [keyphrases]

    title_dict={}
    for ky, kphrase in zip(cluster_ids, keyphrases):
        # Fall back to a generic folder name when no keyword was extracted.
        title_dict[ky] = kphrase[0][0] if kphrase else f"cluster_{ky}"

    for label,mt_data in zip(labels,metadata_list):
        mt_data['file_path']=f"{title_dict[label]}/{mt_data['file_path'].split('/')[-1]}"

    # Relocate the files according to the rewritten paths, then persist.
    restructure_file_system(metadata_list, NEW_ROOT)
    save_vector_db(index, metadata_list)



In [4]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import umap.umap_ as umap
# Load the persisted vector DB, pull out every stored vector and project the
# vectors down with UMAP (cosine metric, fixed random_state).
load_or_create_vector_db()
all_embeddings = retrieve_all_embeddings()
um = umap.UMAP(n_components=4,min_dist=0, metric='cosine',  random_state=42)
reduced = um.fit_transform(all_embeddings)  # n_components=4 -> one 4-value row per stored vector

Loading existing vector database...
Loaded vector DB with 6 entries


  warn(
  warn(


In [5]:
# Display the UMAP-reduced embeddings.
reduced

array([[ -4.0671787, -15.357155 ,  -8.457    ,   6.036557 ],
       [ -4.0289536, -15.324044 ,  -7.647319 ,   6.9144893],
       [ -3.8839467, -16.081715 ,  -7.925273 ,   5.8173084],
       [ -3.4488456, -15.701628 ,  -7.9389677,   6.612119 ],
       [ -4.760903 , -15.495141 ,  -8.110195 ,   6.3716593],
       [ -4.074336 , -15.563984 ,  -7.366825 ,   5.7913136]],
      dtype=float32)

In [6]:
# Cluster the reduced embeddings with HDBSCAN and keep one label per row.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=2,     # minimum cluster size
    min_samples=1,                  # aggressive clustering
    cluster_selection_method='leaf' # more fine-grained clusters
)
labels = clusterer.fit_predict(reduced)

In [7]:
# Display the cluster label assigned to each row of `reduced`.
labels

array([1, 0, 2, 0, 1, 2])

In [8]:
# Build {cluster label: comma-separated file names}, matching labels to
# metadata entries by position.
grouped = {}
for i, l in enumerate(labels):
    previous = grouped.get(l)
    current = metadata_list[i]['file_name']
    grouped[l] = current if previous is None else previous + ',' + current

In [9]:
# Display the file names gathered for each cluster label.
grouped

{1: 'swimming.txt,sea.png',
 0: 'football.png,football.txt',
 2: 'statistical_testing.txt,math.txt'}

In [11]:
# Rebuild the grouping, this time joining each entry's 'content' text per
# cluster label (replaces the file-name grouping held in `grouped`).
grouped = {}
for i, l in enumerate(labels):
    previous = grouped.get(l)
    current = metadata_list[i]['content']
    grouped[l] = current if previous is None else previous + ',' + current

In [12]:
# Display the concatenated content text for each cluster label.
grouped

{1: 'Swimming is both a recreational activity and a competitive sport that promotes fitness, endurance, and relaxation. It involves moving through water using various strokes, such as freestyle, breaststroke, backstroke, and butterfly. As a low-impact exercise, swimming is especially beneficial for rehabilitation and,a beach with waves and clouds',
 0: "two young boys playing soccer on a field,Football, known as soccer in some regions, is the world's most popular sport, captivating billions of fans across continents. It is a game of skill, strategy, and teamwork, where two teams compete to score goals by maneuvering the ball with their feet. From local neighborhood matches",
 2: 'Statistical testing is a crucial method used to analyze data and make informed conclusions about populations based on samples. It involves applying mathematical techniques to determine whether observed patterns or relationships are significant or merely due to chance. Common statistical tests, such as t-tests,

In [13]:
# One text document per cluster, in the same order as grouped.keys(); the
# later zip() over grouped.keys() relies on that ordering.
docs=np.array(list(grouped.values()))
docs


array(['Swimming is both a recreational activity and a competitive sport that promotes fitness, endurance, and relaxation. It involves moving through water using various strokes, such as freestyle, breaststroke, backstroke, and butterfly. As a low-impact exercise, swimming is especially beneficial for rehabilitation and,a beach with waves and clouds',
       "two young boys playing soccer on a field,Football, known as soccer in some regions, is the world's most popular sport, captivating billions of fans across continents. It is a game of skill, strategy, and teamwork, where two teams compete to score goals by maneuvering the ball with their feet. From local neighborhood matches",
       'Statistical testing is a crucial method used to analyze data and make informed conclusions about populations based on samples. It involves applying mathematical techniques to determine whether observed patterns or relationships are significant or merely due to chance. Common statistical tests, such as

In [14]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model from the local directory named by
# KEYWORD_EXTRACTION_MODEL_PATH.
local_embedding_model = SentenceTransformer(KEYWORD_EXTRACTION_MODEL_PATH)
# Initialize KeyBERT with the locally loaded embedding model
kw_model = KeyBERT(model=local_embedding_model)


In [15]:
# Ask KeyBERT for the single best keyword of each cluster document.
keyphrases = kw_model.extract_keywords(docs, top_n=1,)
# For a single document KeyBERT returns that document's keyword list directly
# rather than a list of lists; re-wrap it so each cluster gets its own list
# (otherwise `kphrase[0][0]` would be the first character of the keyword).
if len(docs) == 1 and (not keyphrases or isinstance(keyphrases[0], tuple)):
    keyphrases = [keyphrases]
# Map each cluster label to its keyword; the keyword is used as folder name.
title_dict={}
for ky, kphrase in zip(grouped.keys(),keyphrases):
    # Fall back to a generic name when no keyword could be extracted.
    title_dict[ky]=kphrase[0][0] if kphrase else f"cluster_{ky}"
title_dict

{1: 'swimming', 0: 'soccer', 2: 'statistical'}

In [18]:
# Point every entry at "<cluster keyword>/<original base name>"; the
# metadata dicts are updated in place.
for label, mt_data in zip(labels, metadata_list):
    folder = title_dict[label]
    base_name = mt_data['file_path'].split('/')[-1]
    mt_data['file_path'] = f"{folder}/{base_name}"
metadata_list

[{'file_id': 'swimming_de18782b',
  'file_name': 'swimming.txt',
  'file_path': 'swimming/swimming.txt',
  'file_type': 'text',
  'content': 'Swimming is both a recreational activity and a competitive sport that promotes fitness, endurance, and relaxation. It involves moving through water using various strokes, such as freestyle, breaststroke, backstroke, and butterfly. As a low-impact exercise, swimming is especially beneficial for rehabilitation and',
  'storage_path': '/teamspace/studios/this_studio/AI-Powered-File-System/src/../files_DB/swimming_de18782b.txt'},
 {'file_id': 'football_866089ce',
  'file_name': 'football.png',
  'file_path': 'soccer/football.png',
  'file_type': 'image',
  'content': 'two young boys playing soccer on a field',
  'storage_path': '/teamspace/studios/this_studio/AI-Powered-File-System/src/../files_DB/football_866089ce.png'},
 {'file_id': 'statistical_testing_5e5a9bd1',
  'file_name': 'statistical_testing.txt',
  'file_path': 'statistical/statistical_tes

In [19]:
# Display the metadata entries with their rewritten file_path values.
metadata_list

[{'file_id': 'swimming_de18782b',
  'file_name': 'swimming.txt',
  'file_path': 'swimming/swimming.txt',
  'file_type': 'text',
  'content': 'Swimming is both a recreational activity and a competitive sport that promotes fitness, endurance, and relaxation. It involves moving through water using various strokes, such as freestyle, breaststroke, backstroke, and butterfly. As a low-impact exercise, swimming is especially beneficial for rehabilitation and',
  'storage_path': '/teamspace/studios/this_studio/AI-Powered-File-System/src/../files_DB/swimming_de18782b.txt'},
 {'file_id': 'football_866089ce',
  'file_name': 'football.png',
  'file_path': 'soccer/football.png',
  'file_type': 'image',
  'content': 'two young boys playing soccer on a field',
  'storage_path': '/teamspace/studios/this_studio/AI-Powered-File-System/src/../files_DB/football_866089ce.png'},
 {'file_id': 'statistical_testing_5e5a9bd1',
  'file_name': 'statistical_testing.txt',
  'file_path': 'statistical/statistical_tes

In [20]:
# List the stored files (shell escape; the path is relative to the notebook's working directory).
!ls ../files_DB

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


football_866089ce.png  math_8a30cdef.txt  statistical_testing_5e5a9bd1.txt
football_c0f32165.txt  sea_c3e61248.png   swimming_de18782b.txt


In [23]:
def restructure_file_system(metadata_list, file_system_path):
    """Move each stored file into its new location under ``file_system_path``.

    Args:
        metadata_list: iterable of dicts; each must provide
            ``"storage_path"`` (where the file currently lives) and
            ``"file_path"`` (destination path relative to the new root,
            e.g. ``"swimming/swimming.txt"``).
        file_system_path: root directory of the new layout.

    The destination directory is created when missing. Entries whose source
    file does not exist are reported with a warning and skipped.

    NOTE(review): files are moved (``shutil.move``), so ``storage_path`` in
    the metadata no longer points at an existing file afterwards, and running
    this a second time only prints warnings. A same-named function defined
    earlier in this notebook copies instead; this definition shadows it.
    """
    # `shutil` is not imported anywhere in the notebook, so import it here to
    # keep this function usable on a fresh kernel.
    import shutil

    for meta in metadata_list:
        src_fullpath = meta["storage_path"]
        new_relpath = meta["file_path"]            # e.g. "swimming/swimming.txt"
        dest_fullpath = os.path.join(file_system_path, new_relpath)

        # 1) Ensure destination directory exists
        dest_folder = os.path.dirname(dest_fullpath)
        if dest_folder:
            # os.makedirs('') raises, which happens when the destination has
            # no directory component at all.
            os.makedirs(dest_folder, exist_ok=True)

        # 2) Move the file
        if os.path.exists(src_fullpath):
            shutil.move(src_fullpath, dest_fullpath)
        else:
            print(f"[WARNING] Source not found: {src_fullpath}")


In [24]:
# Root directory of the reorganised layout (relative path; NEW_ROOT is also
# set in the configuration cell).
NEW_ROOT = "../demo"

# Call the mover with metadata_list, whose file_path entries were rewritten above:
restructure_file_system(metadata_list, NEW_ROOT)
