In [None]:
import pandas as pd
from scipy.spatial import KDTree

In [None]:
# Clustering method and result version; both are interpolated into every
# results path built in this notebook.
method = "leiden"
version = "v4"
# Input: the clustering output CSV for this method/version.
old_data_path = f"../data/clear results/{method}/{version}/{method}.csv"
# Output: destination of the table with centroid rows appended.
res_path = f"../data/clear results/{method}/{version}/{method}_abs_centroid.csv"

In [None]:
# Second dataset that is merged with the clustered points.
second_path = "../data/result/subway_passangers_traffic.csv"
# Column of the second dataset that is used as the weight.
second_weight_name = "Пассажирооборот"
# Value written to the "type_of_weight" column for rows built from the second dataset.
second_type_of_weight = 2

In [None]:
# Load the clustering output. An "fid" column is discarded when present, and
# an "id" column, when present, becomes the index.
old_data = pd.read_csv(old_data_path).drop(columns="fid", errors="ignore")
old_data = old_data.set_index("id") if "id" in old_data.columns else old_data
old_data

In [None]:
# old_data = old_data.drop(old_data.loc[old_data["type_of_weight"].isin([10, 11])].index)
# old_data

In [None]:
# Residents table; the first CSV column becomes the index, which is later used
# to look up the "Кол-во жильцов" value for each old_data index label.
residents = pd.read_csv("../data/result/Residents.csv", index_col=0)
residents

In [None]:
# Second dataset (see second_path); read with the default positional index.
plan_res = pd.read_csv(second_path)
plan_res

In [None]:
def geo_str_to_list(str_list):
    """Parse delimited coordinate strings into lists of floats.

    For every string in ``str_list`` the first and last characters are
    dropped (e.g. the brackets of ``"[55.75, 37.61]"``) and the remainder is
    split on commas, each piece being converted with ``float``.

    Returns a list with one list of floats per input string.
    """
    parsed = []
    for raw in str_list:
        inner = raw[1:-1]  # strip the enclosing delimiter characters
        parsed.append([float(part) for part in inner.split(",")])
    return parsed

def get_coordinates_values(data):
    """Return the coordinates of every row of ``data`` as a list of lists.

    When ``data`` has a "Геопозиция" column, its strings are parsed with
    ``geo_str_to_list``; otherwise the 'latitude' and 'longitude' columns are
    returned, in that order.
    """
    if "Геопозиция" not in data.columns:
        return data[['latitude', 'longitude']].values.tolist()
    return geo_str_to_list(data["Геопозиция"].values)
    
def apply_geo_class(graph, points, apply_name_column, index_apply_in_graph):
    """Label every point with a value taken from its nearest ``graph`` row.

    Parameters
    ----------
    graph : DataFrame of reference rows; coordinates are read with
        ``get_coordinates_values``.
    points : DataFrame of rows to label; coordinates are read the same way.
    apply_name_column : name of the column that receives the copied value.
    index_apply_in_graph : positional index of the ``graph`` column whose
        value is copied.

    Returns
    -------
    A copy of ``points`` with ``apply_name_column`` set; ``points`` itself is
    not modified. Nearness is plain Euclidean distance on the raw coordinate
    values, as computed by ``KDTree``.
    """
    graph_coordinates = get_coordinates_values(graph)
    points_coordinates = get_coordinates_values(points)

    points_with_class = points.copy()

    if len(points_coordinates) == 0:
        # Nothing to look up: still create the column so callers always find it.
        points_with_class[apply_name_column] = 0
        return points_with_class

    kdtree = KDTree(graph_coordinates)

    # One batched query returns the nearest graph row position for every point.
    _, nearest_positions = kdtree.query(points_coordinates)

    # to_numpy() drops graph's index so the values are assigned by position,
    # not aligned on index labels.
    points_with_class[apply_name_column] = (
        graph.iloc[nearest_positions, index_apply_in_graph].to_numpy()
    )

    return points_with_class

def format(
        data, 
        weight_name:str,
        class_name:str="class", 
        geoposition_name:str="Геопозиция", 
        normalize:bool=True
    ) -> pd.DataFrame:
    """Convert a labelled point table into the common centroid-input layout.

    Note: this function shadows the built-in ``format`` inside the notebook.

    Parameters
    ----------
    data : DataFrame holding the class, weight and geoposition columns.
    weight_name : column with the raw weight of every row.
    class_name : column with the class label of every row.
    geoposition_name : column with strings such as ``"[lat, lon]"``; the first
        and last characters are dropped and the rest is split on the comma
        into latitude followed by longitude.
    normalize : when True, every weight is divided by the largest weight of
        its class; when False, weights are kept as they are.

    Returns
    -------
    DataFrame indexed by "id" (the integer index labels of ``data``) with the
    columns "longitude", "latitude", "modularity_class" and "weight", in the
    row order of ``data``.
    """
    weights = data[weight_name].astype(float)
    if normalize and len(data):
        # Group on the raw label values so each row is scaled by its class maximum.
        class_max = weights.groupby(data[class_name].to_numpy()).transform("max")
        weights = weights / class_max

    latitudes, longitudes = [], []
    for position in data[geoposition_name]:
        # Exactly two comma-separated parts are expected; anything else raises.
        latitude, longitude = position[1:-1].split(",")
        latitudes.append(float(latitude))
        longitudes.append(float(longitude))

    # Build the frame once instead of concatenating one row at a time.
    new_data = pd.DataFrame(
        {
            "longitude": longitudes,
            "latitude": latitudes,
            "modularity_class": data[class_name].astype(int).to_numpy(),
            "weight": weights.to_numpy(),
        },
        index=pd.Index([int(label) for label in data.index], dtype="int64", name="id"),
    )
    # Explicit dtypes keep the result numeric even when ``data`` is empty.
    return new_data.astype(
        {"longitude": float, "latitude": float, "modularity_class": int, "weight": float}
    )

def normalize(data, class_name, weight_name):
    """Scale weights so that the largest weight within each class becomes 1.

    Parameters
    ----------
    data : DataFrame holding the class and weight columns.
    class_name : column with the class label of every row.
    weight_name : column with the weights to scale.

    Returns
    -------
    The same ``data`` object: its ``weight_name`` column is replaced in place
    by ``weight / max(weight of the row's class)``.
    """
    # Per-row class maximum, computed in one pass instead of a Python loop.
    class_max = data.groupby(class_name)[weight_name].transform("max")
    # Replacing the whole column avoids writing floats into an integer column
    # cell by cell.
    data[weight_name] = data[weight_name] / class_max

    return data

def centroid(
        data:pd.DataFrame,
        class_name:str = "modularity_class", 
        weight_name:str="weight"
    ):
    """Append two centroid rows per class to ``data``.

    For every class, two rows are produced:

    * type 10: the plain mean of "longitude" and "latitude";
    * type 11: the mean of "longitude" and "latitude" weighted by ``weight_name``.

    Both rows carry the class label and the class's total weight. Each row is
    the five values ``[longitude, latitude, class, weight, type]`` mapped
    positionally onto ``data.columns``, so ``data`` must have exactly five
    columns in that order.

    Returns a new DataFrame (``data`` is not modified) with a fresh 0..n-1
    index named "id": the original rows first, then the centroid rows. The
    centroid rows are ordered by class in reverse order of first appearance,
    with the weighted row (11) before the plain one (10).
    """
    centroid_rows = []

    # Reverse first-appearance order, weighted row first: this reproduces the
    # row order downstream cells rely on when they match rows by position.
    for cl in data[class_name].unique()[::-1]:
        cluster = data.loc[data[class_name] == cl]
        total_weight = cluster[weight_name].sum()

        lon = cluster["longitude"].sum()/cluster["longitude"].count()
        lat = cluster["latitude"].sum()/cluster["latitude"].count()

        lon_weighted = (cluster["longitude"]*cluster[weight_name]).sum()/total_weight
        lat_weighted = (cluster["latitude"]*cluster[weight_name]).sum()/total_weight

        centroid_rows.append([lon_weighted, lat_weighted, cl, total_weight, 11])
        centroid_rows.append([lon, lat, cl, total_weight, 10])

    if centroid_rows:
        # Build the centroid frame once; no concatenation with an empty frame,
        # so column dtypes are inferred from the values themselves.
        centroids = pd.DataFrame(centroid_rows, columns=data.columns)
        data_with_centroids = pd.concat([data, centroids], ignore_index=True)
    else:
        data_with_centroids = data.reset_index(drop=True)
    data_with_centroids.index.name = "id"

    return data_with_centroids

In [None]:
# Overwrite every row's weight with the "Кол-во жильцов" value that residents
# holds under the same index label. A single label-based lookup replaces the
# per-row loop; a label missing from residents still raises KeyError.
old_data["weight"] = residents.loc[old_data.index, "Кол-во жильцов"].to_numpy()

old_data

In [None]:
# Give every plan_res row a "class" value copied from its nearest old_data row.
# -3 is the positional index of the old_data column that is copied (third from
# the end) — presumably "modularity_class"; confirm against the CSV column order.
plan_res_class = apply_geo_class(old_data, plan_res, "class", -3)
plan_res_class

In [None]:
# Bring the second dataset into the longitude/latitude/modularity_class/weight
# layout; normalize=False keeps its weights unscaled at this point.
format_plan_res = format(plan_res_class, second_weight_name, normalize=False)
# Tag these rows so they can be told apart from the old_data rows.
format_plan_res["type_of_weight"] = second_type_of_weight

# ignore_index=True discards both frames' ids and renumbers the rows from 0.
concat_data = pd.concat([old_data, format_plan_res], ignore_index=True)

# normalize() rewrites the "weight" column of concat_data in place and returns
# that same object, so norm_concat_data and concat_data refer to one frame.
norm_concat_data = normalize(concat_data, "modularity_class", "weight")

# Append two centroid rows per class (type_of_weight 10 and 11).
centroid_residents_plan = centroid(norm_concat_data)

centroid_residents_plan

In [None]:
# Write the combined table (input rows plus centroid rows) to res_path.
centroid_residents_plan.to_csv(res_path)

For Subways

In [None]:
# format_plan_res = format(plan_res_class, second_weight_name, normalize=True)
# format_plan_res["type_of_weight"] = second_type_of_weight

# # norm_old_data = normalize(old_data, "modularity_class", "weight")

# concat_data = pd.concat([old_data, format_plan_res], ignore_index=True)

# centroid_residents_plan = centroid(concat_data)

In [None]:
# norm_old_data = normalize(old_data, "modularity_class", "weight")

# centroid_residents_plan = centroid(norm_old_data)

In [None]:
# format_plan_res = format(plan_res_class, second_weight_name, normalize=False)
# format_plan_res["type_of_weight"] = second_type_of_weight
# format_plan_res

In [None]:
# format_plan_res.to_csv(res_path)

In [None]:
# Load the *_subway_centroid.csv table for this method/version; the first CSV
# column is used as the index.
data1 = pd.read_csv(f"../data/clear results/{method}/{version}/{method}_subway_centroid.csv", index_col=0)
data1

In [None]:
# Load the *_abs_centroid.csv table (the same path as res_path); the first CSV
# column is used as the index.
data2 = pd.read_csv(f"../data/clear results/{method}/{version}/{method}_abs_centroid.csv", index_col=0)
data2

In [None]:
# Replace data1's second-to-last column (presumably "weight" — confirm against
# the CSV column order) with the values at the same row positions in data2.
# Rows are matched purely by position, so both tables must list their rows in
# the same order; a shorter data2 raises instead of being silently padded.
data1[data1.columns[-2]] = data2.iloc[: data1.shape[0], -2].to_numpy()

data1

In [None]:
# Write the patched data1 to res_path. This is the same *_abs_centroid.csv path
# that data2 was read from, so that file is overwritten.
data1.to_csv(res_path)

In [None]:
# Export only the rows with type_of_weight == 11, the code that centroid()
# assigns to the weight-averaged centroid rows.
data1.loc[data1["type_of_weight"] == 11].to_csv(f"../data/clear results/{method}/{version}/{method}_clear_centroid.csv")