In [None]:
import numpy as np
from shapely import Point, LineString
class Statistics(Shapefile):
    def __init__(self, datasetNumber, inputFolder, outputFolder):
        """Initialise the parent class and an empty node-distance map.

        The three arguments are forwarded unchanged to the parent
        initialiser.
        """
        super().__init__(datasetNumber, inputFolder, outputFolder)
        # node id -> distance (or None), filled in by
        # calculate_perpendicular_distance().
        self.node_distances = {}
    

    def calculate_perpendicular_distance(self, zone_radius, zone_radius_increment, batch_size=100, num_workers=1):
        """Compute a node-to-pipe distance for every node in ``self.gdfNodes``.

        Nodes are handled in slices of ``batch_size`` rows. Each node of a
        slice is passed to ``process_batch`` and the ``(node_id, distance)``
        pair it returns is stored in ``self.node_distances``.

        Args:
            zone_radius: Initial search radius forwarded to ``process_batch``.
            zone_radius_increment: Radius growth step forwarded to
                ``process_batch``.
            batch_size: Number of node rows per batch; must be at least 1.
            num_workers: When greater than 1, the nodes of each batch are
                processed in a ``multiprocessing.Pool`` with that many
                workers; otherwise they are processed sequentially.

        Returns:
            ``True`` once every batch has been processed.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        import multiprocessing
        from contextlib import nullcontext

        # A negative step would make range() empty and silently skip all nodes.
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Create spatial index for pipes
        pipe_sindex = self.gdfPipes.sindex

        # The worker pool is created once and reused by every batch instead
        # of being started and torn down per batch. nullcontext() yields None,
        # which selects the sequential path below.
        pool_context = multiprocessing.Pool(num_workers) if num_workers > 1 else nullcontext()

        with pool_context as pool:
            for batch_start in range(0, len(self.gdfNodes), batch_size):
                # Explicit positional slicing of the node rows.
                batch_nodes = self.gdfNodes.iloc[batch_start:batch_start + batch_size]
                tasks = [
                    (batch_nodes, zone_radius, zone_radius_increment, pipe_sindex, idx)
                    for idx in range(len(batch_nodes))
                ]

                if pool is not None:
                    results = pool.starmap(self.process_batch, tasks)
                else:
                    results = [self.process_batch(*task) for task in tasks]

                # Each result is a (node_id, distance) pair.
                self.node_distances.update(results)

        return True

    def process_batch(self, batch_nodes, zone_radius, zone_radius_increment, pipe_sindex, idx):
        """Find the distance from one node of a batch to its nearest pipe.

        Despite the name, one call handles one node: the row at position
        ``idx`` of ``batch_nodes``. Pipes are looked up through
        ``pipe_sindex`` with a square search box around the node; the box
        grows by ``zone_radius_increment`` until at least one candidate pipe
        is found.

        Args:
            batch_nodes: Table of nodes with ``id`` and ``geometry`` columns.
            zone_radius: Initial margin added around the node's bounds to
                build the search box.
            zone_radius_increment: Amount added to the margin each time a
                search finds no candidate pipe.
            pipe_sindex: Spatial index over ``self.gdfPipes``.
            idx: Positional index of the node inside ``batch_nodes``.

        Returns:
            A ``(node_id, distance)`` tuple. ``distance`` is ``None`` when the
            node has no usable geometry, when the pipe index is empty, or when
            the smallest distance found is not strictly positive (this
            includes the 0 that ``perpendicular_distance`` returns for
            geometry types it does not support).

        Raises:
            ValueError: If the search box has to grow but
                ``zone_radius_increment`` is not positive.
        """
        node_row = batch_nodes.iloc[idx]
        node_id = node_row['id']
        node_geometry = node_row['geometry']

        # An empty geometry has no usable bounds, so no search box can be built.
        if node_geometry is None or node_geometry.is_empty:
            return node_id, None

        # With nothing indexed the expanding search below could never end.
        if pipe_sindex.is_empty:
            return node_id, None

        min_distance = self._min_distance_within(node_geometry, zone_radius, pipe_sindex)
        while min_distance is None:
            if zone_radius_increment <= 0:
                raise ValueError(
                    "zone_radius_increment must be positive to expand the search zone"
                )
            zone_radius += zone_radius_increment
            min_distance = self._min_distance_within(node_geometry, zone_radius, pipe_sindex)

        # The index matches on bounding boxes, so a pipe lying just outside
        # the search box can still be closer than every candidate found so
        # far. Any such pipe must intersect a box whose margin equals the
        # current best distance, so one confirming query is sufficient.
        if min_distance > zone_radius:
            confirmed = self._min_distance_within(node_geometry, min_distance, pipe_sindex)
            if confirmed is not None:
                min_distance = min(min_distance, confirmed)

        if min_distance > 0:
            return node_id, min_distance
        return node_id, None

    def _min_distance_within(self, node_geometry, radius, pipe_sindex):
        """Return the smallest distance to the pipes hit by the search box, or None if none is hit."""
        minx, miny, maxx, maxy = node_geometry.bounds
        bbox = (minx - radius, miny - radius, maxx + radius, maxy + radius)
        candidate_positions = list(pipe_sindex.intersection(bbox))
        if not candidate_positions:
            return None

        candidate_geometries = self.gdfPipes.iloc[candidate_positions]['geometry']
        return min(
            self.perpendicular_distance(node_geometry, pipe_geometry)
            for pipe_geometry in candidate_geometries
        )
    
    def perpendicular_distance(self, point_geom, line_geom):
        """Return the distance from a point to its projection onto a line.

        Returns 0 when ``point_geom`` is not a ``Point`` or ``line_geom`` is
        not a ``LineString``.
        """
        supported = isinstance(point_geom, Point) and isinstance(line_geom, LineString)
        if not supported:
            return 0

        # Locate the point along the line, then measure to that foot point.
        position_on_line = line_geom.project(point_geom)
        foot_point = line_geom.interpolate(position_on_line)
        return point_geom.distance(foot_point)

    def summarize_distances_by_category(self):
        """Print distance statistics per ``source_1`` category and overall.

        Only strictly positive distances stored in ``self.node_distances`` are
        used: ``None`` marks a node without a result, and the geometric mean
        takes the logarithm of every value. Each node is assigned the
        ``source_1`` value of its row in ``self.gdfNodes``. Two grid tables
        are printed with ``tabulate``: one row per category, then one row for
        all distances together. When no usable distance exists, both tables
        are printed without data rows.

        Raises:
            KeyError: If a node id in ``self.node_distances`` has no row in
                ``self.gdfNodes``.
        """
        # Build the id -> category lookup in one pass instead of filtering the
        # whole node table once per node. setdefault keeps the first row when
        # an id occurs more than once.
        source_by_node_id = {}
        for known_id, source in zip(self.gdfNodes['id'], self.gdfNodes['source_1']):
            source_by_node_id.setdefault(known_id, source)

        # The same filter feeds both tables so their figures stay consistent.
        distances_by_category = {}
        all_distances = []
        for node_id, distance in self.node_distances.items():
            if distance is None or not distance > 0:
                continue
            category = source_by_node_id[node_id]
            distances_by_category.setdefault(category, []).append(distance)
            all_distances.append(distance)

        # Prepare the table headers and rows for category statistics
        category_table_headers = ["Category", "Count", "Mean Distance", "Median Distance", "Min Distance", "Max Distance", "Std Deviation", "25th Percentile", "75th Percentile", "85th Percentile", "100th Percentile", "Geometric Mean"]
        category_table_rows = [
            [category, len(category_distances), *self._distance_statistics(category_distances)]
            for category, category_distances in distances_by_category.items()
        ]

        # Prepare the table headers and rows for overall statistics
        overall_table_headers = ["Overall", "Mean Distance", "Median Distance", "Min Distance", "Max Distance", "Std Deviation", "25th Percentile", "75th Percentile", "85th Percentile", "100th Percentile", "Geometric Mean"]
        overall_table_rows = []
        # np.min / np.max raise on an empty sequence, so the row is only
        # produced when there is data.
        if all_distances:
            overall_table_rows.append(["All", *self._distance_statistics(all_distances)])

        # Print the tables using tabulate
        print("Category Statistics:")
        print(tabulate(category_table_rows, headers=category_table_headers, tablefmt="grid"))
        print("\nOverall Statistics:")
        print(tabulate(overall_table_rows, headers=overall_table_headers, tablefmt="grid"))

    @staticmethod
    def _distance_statistics(distances):
        """Return mean, median, min, max, std, 25/75/85/100th percentiles and geometric mean, in table column order."""
        return [
            np.mean(distances),
            np.median(distances),
            np.min(distances),
            np.max(distances),
            np.std(distances),
            np.percentile(distances, 25),
            np.percentile(distances, 75),
            np.percentile(distances, 85),
            np.percentile(distances, 100),
            # Geometric mean computed through logarithms.
            np.exp(np.mean(np.log(distances))),
        ]

    def count_pipes_by_diameter(self):
        """Print how many pipes have each ``diametre`` value.

        Missing values are counted as their own entry (``dropna=False``). The
        counts are also kept on the instance as ``self.diameter_counts``.
        """
        self.diameter_counts = self.gdfPipes['diametre'].value_counts(dropna=False)

        # One [diameter, count] row per distinct value, in value_counts order.
        rows = [[diameter, count] for diameter, count in self.diameter_counts.items()]

        print(tabulate(rows, headers=["Diameter", "Count"], tablefmt="grid"))

'''    def get_min_non_null_non_zero_diameter(self):
        # Filter out null and zero values, and get the minimum diameter
        self.gdfPipes.head()
        non_null_non_zero_diameters = self.gdfPipes['diametre'][
            (self.gdfPipes['diametre'].notnull()) & (self.gdfPipes['diametre'] != 0)
        ]
        if not non_null_non_zero_diameters.empty:
            self.min_diameter = non_null_non_zero_diameters.min()
            return self.min_diameter
        else:
            return None'''