In [None]:
%pip install raphtory

In [None]:
"""
Dead Code - use main.rs file to preprocess the data instead
"""

# Import necessary libraries
import pandas as pd

def preprocess_and_load_data(filepath):
    """
    Load and preprocess call graph data from a CSV file.

    Only the 'timestamp', 'um' and 'dm' columns are read. The steps are:
    1. Load the three columns. 'timestamp' is parsed as pandas' nullable
       'Int64' type so that an empty timestamp cell becomes a missing value
       instead of making `read_csv` raise (which is what a plain integer
       dtype does when it meets a missing value).
    2. Remove rows that have a missing value in any of the three columns.
    3. Remove rows whose 'um' or 'dm' value is "UNKNOWN" or "UNAVAILABLE".
    4. Cast 'timestamp' to plain int64. (Otherwise Raphtory would throw the
       "Source and Target columns must be either u64 or text, Time column
       must be i64." error)

    Parameters:
    filepath (str): The path to the CSV file (anything `pd.read_csv` accepts).

    Returns:
    pd.DataFrame: A pandas DataFrame with the columns 'timestamp' (int64),
    'um' (str) and 'dm' (str). The index is not reset, so it keeps the row
    positions of the surviving rows.

    Note:
    The CSV file must contain the 'timestamp', 'um' and 'dm' columns;
    `pd.read_csv` raises if any of them is absent.
    """

    # Columns to load and the dtype each one is parsed with.
    data_types = {
        "timestamp": "Int64",  # nullable: tolerates empty cells until dropna()
        "um": str,
        "dm": str,
    }
    node_columns = ["um", "dm"]
    unwanted_values = ["UNKNOWN", "UNAVAILABLE"]

    call_graph_df = pd.read_csv(
        filepath, usecols=list(data_types), dtype=data_types
    )

    # Drop rows with a missing value in any column.
    call_graph_df = call_graph_df.dropna()

    # Drop rows with a placeholder endpoint. Only the string columns can hold
    # these values, so the integer column is not compared against them.
    has_unwanted = call_graph_df[node_columns].isin(unwanted_values).any(axis=1)
    call_graph_df = call_graph_df[~has_unwanted].copy()

    # No missing values remain, so the nullable column converts cleanly.
    call_graph_df["timestamp"] = call_graph_df["timestamp"].astype("int64")

    return call_graph_df

In [None]:
"""
Dead Code - use the cell in code.ipynb file to load the data instead
"""

import os
import pandas as pd
from raphtory import Graph

# Input files are named CallGraph_<n>.csv; <n> runs over the inclusive range below.
base_filepath = 'c:/Users/mf479/Desktop/Alibaba-CallGraph/CallGraph_{}.csv'
start_file_number, end_file_number = 0, 5

# Every file is accumulated into this single Raphtory graph.
g = Graph()

for file_number in range(start_file_number, end_file_number + 1):
    filepath = base_filepath.format(file_number)

    # Report and skip any file that is not present on disk.
    if not os.path.exists(filepath):
        print(f"File does not exist: {filepath}")
        continue

    # Preprocess the CSV into a DataFrame, then load its rows into the graph
    # as edges from 'um' to 'dm' at time 'timestamp'.
    call_graph_df = preprocess_and_load_data(filepath)
    g.load_edges_from_pandas(
        df=call_graph_df,
        src="um",
        dst="dm",
        time="timestamp",
    )

print(g)