In [3]:
import pandas as pd
import networkx as nx
import os

# List of dataset file paths
file_paths = [
    "Datasets/aishihik_lake.csv",
    "Datasets/cold_lake.csv",
    "Datasets/lake_of_the_woods.csv",
    "Datasets/mcgregor_river.csv",
    "Datasets/parsnip_river.csv",
    "Datasets/sbay_lake_huron.csv",
    "Datasets/smallwood_reservoir.csv",
]

# Ensure the output directory exists
output_dir = "datacleaning"
os.makedirs(output_dir, exist_ok=True)

# Loop through each dataset and preprocess
for file_path in file_paths:
    try:
        # Load the dataset
        data = pd.read_csv(file_path)
        print(f"Dataset '{file_path}' loaded successfully.")
        
        # Check columns in the dataset
        print(f"Columns in '{file_path}': {data.columns.tolist()}")
        
        # Step 1: Handle missing or incorrect 'Parasite genus' column
        # If 'Unnamed: 0' is the column holding genus names, use it as the source
        if 'Unnamed: 0' not in data.columns:
            print(f"Error: 'Unnamed: 0' column not found in {file_path}. Skipping this dataset.")
            continue
        
        # Step 2: Restructure data into long format (using the first column for the 'source')
        data_long = data.melt(id_vars=["Unnamed: 0"], 
                              var_name="target", 
                              value_name="weight")
        
        # Rename columns for clarity
        data_long = data_long.rename(columns={"Unnamed: 0": "source"})
        
        # Step 3: Drop rows with missing values
        data_long = data_long.dropna()
        print(f"Rows after removing missing values: {len(data_long)}")
        
        # Step 4: Drop duplicate edges
        data_long = data_long.drop_duplicates(subset=["source", "target"])
        print(f"Rows after removing duplicate edges: {len(data_long)}")
        
        # Step 5: Create a directed graph
        G = nx.from_pandas_edgelist(data_long, source="source", target="target", create_using=nx.DiGraph())
        print(f"Graph created for '{file_path}' with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
        
        # Step 6: Calculate clustering coefficient
        # If the graph is directed, we calculate the clustering coefficient for the directed graph
        if isinstance(G, nx.DiGraph):
            clustering_coeffs = nx.clustering(G.to_undirected())  # Convert to undirected for clustering
        else:
            clustering_coeffs = nx.clustering(G)  # For undirected graphs
        
        # Calculate the average clustering coefficient
        avg_clustering_coeff = sum(clustering_coeffs.values()) / len(clustering_coeffs)
        print(f"Clustering coefficient for '{file_path}': {avg_clustering_coeff}")
        
        # Step 7: Save the preprocessed data
        output_path = os.path.join(output_dir, f"{file_path.split('/')[-1].replace('.csv', '_preprocessed.csv')}")
        data_long.to_csv(output_path, index=False)
        print(f"Preprocessed data saved to {output_path}")
    
    except Exception as e:
        print(f"An error occurred while processing '{file_path}': {e}")


Dataset 'Datasets/aishihik_lake.csv' loaded successfully.
Columns in 'Datasets/aishihik_lake.csv': ['Unnamed: 0', 'Unnamed: 1', 'Parasite genus', 'Anonchohaptor', 'Dactylogyrus', 'Discocotyle', 'Tetraonchus', 'Tetraonchus.1', 'Tetraonchus.2', 'Allocreadium', 'Crepidostomum', 'Diplostomum', 'Heterophyid', 'Neascus', 'Tetracotyle', 'Cyathocephalus', 'Diphyllobothrium', 'Eubothrium', 'Glaridacris', 'Proteocephalus', 'Schistocephalus', 'Triaenophorus', 'Triaenophorus.1', 'Capillaria', 'Cystidicola', 'Raphidascaris', 'Raphidascaris.1', 'Neoechinorhynchus', 'Piscicola', 'Salmincola', 'Salmincola.1', 'Salmincola.2']
Rows after removing missing values: 248
Rows after removing duplicate edges: 248
Graph created for 'Datasets/aishihik_lake.csv' with 39 nodes and 248 edges.
Clustering coefficient for 'Datasets/aishihik_lake.csv': 0.0
Preprocessed data saved to datacleaning/aishihik_lake_preprocessed.csv
Dataset 'Datasets/cold_lake.csv' loaded successfully.
Columns in 'Datasets/cold_lake.csv': ['U

In [4]:
import pandas as pd
import networkx as nx
import os

# List of dataset file paths
file_paths = [
    "Datasets/AkatoreA.csv",
    "Datasets/AkatoreB.csv",
    "Datasets/Venlaw.csv",
]

# Ensure the output directory exists
output_dir = "datacleaning"
os.makedirs(output_dir, exist_ok=True)

# Loop through each dataset and preprocess
for file_path in file_paths:
    try:
        # Load the dataset
        data = pd.read_csv(file_path)
        print(f"Dataset '{file_path}' loaded successfully.")
        
        # Check columns in the dataset
        print(f"Columns in '{file_path}': {data.columns.tolist()}")
        
        # Step 1: Handle missing or incorrect 'Parasite genus' column
        # If 'Unnamed: 0' is the column holding genus names, use it as the source
        if 'Unnamed: 0' not in data.columns:
            print(f"Error: 'Unnamed: 0' column not found in {file_path}. Skipping this dataset.")
            continue
        
        # Step 2: Restructure data into long format (using the first column for the 'source')
        data_long = data.melt(id_vars=["Unnamed: 0"], 
                              var_name="target", 
                              value_name="weight")
        
        # Rename columns for clarity
        data_long = data_long.rename(columns={"Unnamed: 0": "source"})
        
        # Step 3: Drop rows with missing values
        data_long = data_long.dropna()
        print(f"Rows after removing missing values: {len(data_long)}")
        
        # Step 4: Drop duplicate edges
        data_long = data_long.drop_duplicates(subset=["source", "target"])
        print(f"Rows after removing duplicate edges: {len(data_long)}")
        
        # Step 5: Create a directed graph
        G = nx.from_pandas_edgelist(data_long, source="source", target="target", create_using=nx.DiGraph())
        print(f"Graph created for '{file_path}' with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
        
        # Step 6: Calculate clustering coefficient
        # If the graph is directed, we calculate the clustering coefficient for the directed graph
        if isinstance(G, nx.DiGraph):
            clustering_coeffs = nx.clustering(G.to_undirected())  # Convert to undirected for clustering
        else:
            clustering_coeffs = nx.clustering(G)  # For undirected graphs
        
        # Calculate the average clustering coefficient
        avg_clustering_coeff = sum(clustering_coeffs.values()) / len(clustering_coeffs)
        print(f"Clustering coefficient for '{file_path}': {avg_clustering_coeff}")
        
        # Step 7: Save the preprocessed data
        output_path = os.path.join(output_dir, f"{file_path.split('/')[-1].replace('.csv', '_preprocessed.csv')}")
        data_long.to_csv(output_path, index=False)
        print(f"Preprocessed data saved to {output_path}")
    
    except Exception as e:
        print(f"An error occurred while processing '{file_path}': {e}")

Dataset 'Datasets/AkatoreA.csv' loaded successfully.
Columns in 'Datasets/AkatoreA.csv': ['Unnamed: 0', 'Unidentified detritus', 'Terrestrial invertebrates', 'Plant materials', 'Meiofauna', 'Achnanthes inflata', 'Achnanthes lanceolata', 'Achnanthes linearis', 'Achnanthes minutissima', 'Ankitodesmus sp.', 'Batrachospermum', 'Blue-green algae', 'Calothrix', 'Cocconeis placentula', 'Cymbella kappi', 'Cymbella kappii', 'Cymbella mulleri', 'Diatoma heimale', 'Epithemia sorex', 'Eunotia pectinalis', 'Fragilaria vaucheriae', 'Frustulia rhomboides', 'Gomphoneis herculeana', 'Gomphonema accuminatum', 'Gomphonema angustatum', 'Gomphonema intricatum', 'Gomphonema parvulum', 'Gomphonema sp. III', 'Gomphonema sp. unk', 'Gomphonema truncatum', 'Green algae', 'Gyrosigma', 'Melosira varians', 'Navicula avenacea', 'Navicula rhynocephala', 'Nitzschia dissipata', 'Nitzschia dubia', 'Nitzschia linearis', 'Pinnularia spp.', 'Rhoicosphenia curvata', 'Staurostratum', 'Surirella elegans', 'Synedra ulna', 'Tab