In [8]:
!pip install raphtory




In [19]:
# Import necessary libraries
import pandas as pd
from raphtory import Graph

def preprocess_and_load_data(filepath):
    """
    Load and preprocess call graph data from a CSV file.

    The function performs the following steps:
    1. Loads only the 'timestamp', 'um' and 'dm' columns from the CSV file.
    2. Removes rows whose 'um' or 'dm' is one of the placeholder values
       "UNKNOWN" or "UNAVAILABLE".
    3. Removes rows that have a missing value in any of the loaded columns
       (including a missing 'timestamp').
    4. Normalises 'timestamp' by subtracting the smallest remaining timestamp,
       so the earliest remaining row is at time 0.
    5. Adds a 'weight' column holding the number of rows that share the same
       ('um', 'dm', 'timestamp') triple. Duplicate rows are kept; each of them
       carries that count.
    6. Ensures the 'timestamp' column is of type int64. (Otherwise Raphtory
       would throw the "Source and Target columns must be either u64 or text,
       Time column must be i64." error.)

    The columns to load, their data types and the unwanted placeholder values
    are predefined within the function.

    Parameters:
    filepath (str): The path to the CSV file. Anything accepted by
        `pd.read_csv` as its first argument is passed through unchanged.

    Returns:
    pd.DataFrame: The preprocessed call graph data: the loaded columns (in the
        order they appear in the file) followed by 'weight', with a fresh
        0..n-1 index.

    Raises:
    Errors raised by `pd.read_csv` (for example a missing file, a required
    column absent from the file, or a non-integer timestamp) propagate
    unchanged.

    Note:
    The function assumes that the input CSV file contains the 'timestamp',
    'um' and 'dm' columns. If the file differs from this expected structure,
    the function may not perform correctly.
    """

    # Columns that need to be loaded; everything else in the file is ignored
    columns = ["timestamp", "um", "dm"]

    # 'timestamp' is read as pandas' nullable Int64 so that rows with a missing
    # timestamp can be dropped below instead of making read_csv raise. It is
    # also explicitly 64-bit: the builtin `int` maps to a platform-dependent
    # width.
    data_types = {
        "timestamp": "Int64",
        "um": str,
        "dm": str,
    }

    # Load the data
    raw_df = pd.read_csv(filepath, usecols=columns, dtype=data_types)

    # Remove rows with unwanted placeholder values. Only the string columns can
    # contain them, so the integer 'timestamp' column is not scanned.
    unwanted_values = ["UNKNOWN", "UNAVAILABLE"]
    has_unwanted = raw_df[["um", "dm"]].isin(unwanted_values).any(axis=1)

    # Filter out the flagged rows and any row with a missing value, then
    # renumber the rows 0..n-1
    call_graph_df = raw_df.loc[~has_unwanted].dropna().reset_index(drop=True)

    # No missing values remain, so the nullable column can safely become a
    # plain int64 column (the dtype Raphtory requires for the time column)
    timestamps = call_graph_df["timestamp"].astype("int64")

    # Shift timestamps so the earliest one is 0. Skipped for an empty frame,
    # where min() would be NaN and turn the column into floats.
    if not timestamps.empty:
        timestamps = timestamps - timestamps.min()
    call_graph_df = call_graph_df.assign(timestamp=timestamps)

    # Count how often each ('um', 'dm', 'timestamp') triple occurs and attach
    # that count to every row of the group. transform("size") broadcasts the
    # group size back onto the original rows, which is equivalent to grouping,
    # counting and left-merging the counts back in.
    weights = (
        call_graph_df
        .groupby(["um", "dm", "timestamp"])["timestamp"]
        .transform("size")
        .astype("int64")
    )
    call_graph_df = call_graph_df.assign(weight=weights)

    return call_graph_df

In [18]:
# Location of the call-graph CSV, relative to this notebook
filepath = '../Alibaba-CallGraph/CallGraph_1.csv'

# Clean the raw trace and attach per-edge weights
call_graph_df = preprocess_and_load_data(filepath)

# Tell Raphtory which DataFrame columns hold the edge source, the edge
# destination, the event time and the edge properties
edge_column_mapping = dict(
    edge_src="um",
    edge_dst="dm",
    edge_time="timestamp",
    edge_props=["weight"],
)

# Build the temporal graph from the preprocessed DataFrame
g = Graph.load_from_pandas(edge_df=call_graph_df, **edge_column_mapping)

# Print the graph summary
print(g)

HBox(children=(HTML(value=''), IntProgress(value=0, max=10395251), HTML(value='')))

Graph(number_of_edges=51364, number_of_vertices=16666, number_of_temporal_edges=10395251, earliest_time="0", latest_time="179999")


In [12]:
# Round-trip the graph through disk: save it, load it back, and print the
# reloaded graph so it can be compared with the summary of `g`
saved_graph_path = "/tmp/saved_graph"

g.save_to_file(saved_graph_path)
loaded_graph = Graph.load_from_file(saved_graph_path)

print(loaded_graph)


# Sanity checks on the preprocessed DataFrame before/after loading it into Raphtory

# Column dtypes (the time column must be a 64-bit integer for Raphtory)
print(call_graph_df.dtypes)

# Number of missing values per column
print(call_graph_df.isnull().sum())

# Number of +/- infinity values per column. Plain Python floats are used for
# the infinities because numpy is not imported anywhere in this notebook, so
# referring to `np.inf` fails with a NameError on a fresh kernel.
print(call_graph_df.isin([float("inf"), float("-inf")]).sum())

timestamp     int32
um           object
dm           object
weight        int64
dtype: object
timestamp    0
um           0
dm           0
weight       0
dtype: int64
timestamp    0
um           0
dm           0
weight       0
dtype: int64
