In [10]:
# pathway_pipeline.ipynb

import sys

# Remove duplicates from sys.path
sys.path = list(set(sys.path))

# Add the absolute path to the src directory
sys.path.append('/Users/debdeepbanerjee/Desktop/Projects/kdsh/src')

# Verify that the path has been added only once
print("Updated sys.path:", sys.path)

# Print the Python executable being used
print("Python Executable:", sys.executable)

# Import necessary modules
import pathway as pw
from backend.pathway_integration import PathwayIntegration

# Ensure that 'pw.io' contains the 'gdrive' method
print("Available modules in pw.io:", dir(pw.io))

# Adjust the PathwayIntegration to correctly initialize gdrive
class PathwayIntegration:
    def __init__(self, embedder_model="allenai/scibert_scivocab_uncased"):
        # Initialize connectors properly (using the correct class or method for gdrive)
        self.google_drive_connector = pw.io.gdrive.GDriveConnector("https://drive.google.com/drive/folders/1Z8z4craj36ighb8hzUzeM76OOgpUdsKr")
        
        # Set up embedding pipeline (using SciBERT or any other model)
        self.embedder = pipeline("feature-extraction", model=embedder_model, tokenizer=embedder_model)

        # Initialize VectorStoreServer
        self.vector_store_server = VectorStoreServer(
            docs=self.stream_data(),
            embedder=self._embedding_udf,
            parser=self._basic_parser,
        )

    def stream_data(self):
        """Stream papers data using Pathway connectors."""
        return self.google_drive_connector.read()

    def _embedding_udf(self, text: str) -> List[float]:
        """Embed text using the embedding pipeline."""
        embeddings = self.embedder(text)
        return embeddings[0][0]  # Flatten the output

    def _basic_parser(self, content: bytes) -> List[Tuple[str, dict]]:
        """Basic parser to handle file content."""
        text = content.decode("utf-8")
        return [(text, {})]

    def query_statistics(self):
        """Query document statistics."""
        query_table = pw.Table.from_dicts([{"id": 1}])  # Example query schema
        result_table = self.vector_store_server.statistics_query(query_table)
        return result_table

    def query_similar_embeddings(self, query_embedding: List[float]):
        """Query nearest neighbors for an embedding."""
        return self.vector_store_server._graph["knn_index"].query(query_embedding, top_k=3)

    def query_inputs(self, metadata_filter=None, filepath_globpattern=None):
        """Query inputs using metadata filtering."""
        query_table = pw.Table.from_dicts([
            {"id": 1, "metadata_filter": metadata_filter, "filepath_globpattern": filepath_globpattern}
        ])
        result_table = self.vector_store_server.inputs_query(query_table)
        return result_table

# Initialize PathwayIntegration
integration = PathwayIntegration()

# Use the connection to link two papers (replace "Paper A" and "Paper B" with actual paper names)
result = integration.connect_papers("Paper A", "Paper B")

# Print result of the connection
print("Connection Result:", result)

# Example of querying Google Drive data (ensure the path is valid)
try:
    drive_data = integration.google_drive_connector.read()
    print("Drive Data:", drive_data)
except Exception as e:
    print(f"Error accessing Google Drive: {e}")

# Check available methods in Pathway Integration and other debugging information
print("Available methods in PathwayIntegration:", dir(integration))

# Check and display query statistics (just an example)
query_stats = integration.query_statistics()
print("Query Statistics Result:", query_stats)

# Display sample query for similar embeddings (you may replace the query with actual query data)
sample_query_embedding = [0.1, 0.2, 0.3, 0.4, 0.5]  # Replace with actual embedding data
similar_embeddings = integration.query_similar_embeddings(sample_query_embedding)
print("Similar Embeddings Query Result:", similar_embeddings)


Updated sys.path: ['', '/Users/debdeepbanerjee/.pyenv/versions/3.10.0/lib/python3.10/lib-dynload', '/Users/debdeepbanerjee/Desktop/Projects/kdsh/myenv/lib/python3.10/site-packages', '/Users/debdeepbanerjee/.pyenv/versions/3.10.0/lib/python310.zip', '/Users/debdeepbanerjee/.pyenv/versions/3.10.0/lib/python3.10', '/Users/debdeepbanerjee/Desktop/Projects/kdsh/src', '/var/folders/d0/dlqx3vfs4kq4dx9v77jybst40000gn/T/tmppe8_ywia', '/Users/debdeepbanerjee/Desktop/Projects/kdsh/src']
Python Executable: /Users/debdeepbanerjee/Desktop/Projects/kdsh/myenv/bin/python
Available modules in pw.io: ['CsvParserSettings', 'OnChangeCallback', 'OnFinishCallback', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_subscribe', '_utils', 'airbyte', 'bigquery', 'csv', 'debezium', 'deltalake', 'elasticsearch', 'fs', 'gdrive', 'http', 'iceberg', 'jsonlines', 'kafka', 'logstash', 'minio', 'mongodb', 'nats', 'null', 'plaintext', 'post

NameError: name 'List' is not defined

In [11]:
# Print available methods and attributes in pw.io.gdrive
print(dir(pw.io.gdrive))




In [None]:
import pw.io.gdrive

object_id = "1Z8z4craj36ighb8hzUzeM76OOgpUdsKr"

# Path to your Google Service Account credentials JSON file
credentials_path = "/path/to/your/service_account_credentials.json"

try:
    # Pass the credentials file to the read method
    drive_data = pw.io.gdrive.read(object_id, service_user_credentials_file=credentials_path)
    print("Drive Data:", drive_data)
except Exception as e:
    print(f"Error reading from Google Drive: {e}")


Error reading from Google Drive: read() missing 1 required keyword-only argument: 'service_user_credentials_file'


In [13]:
# Inspect the GDriveFile class
print(dir(pw.io.gdrive.GDriveFile))  # Check methods and attributes of GDriveFile class


['__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__name__', '__ne__', '__new__', '__or__', '__qualname__', '__reduce__', '__reduce_ex__', '__repr__', '__ror__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__supertype__', '__weakref__']
