In [1]:
import os
import pandas as pd
import numpy as np
import base64
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import os
from collections import defaultdict
import hashlib

import utils

In [2]:
# Working directory of the kernel; the input 'data' folder and the
# 'data_by_name' output folder are both resolved relative to it.
curr_dir = os.getcwd()

In [3]:
# Data folder path
data_path = os.path.join(curr_dir, 'data')
# os.listdir returns entries in arbitrary order; sorting makes the order in
# which cities are processed (and therefore the output row order) reproducible.
all_data_files = sorted(os.listdir(data_path))

# Maps a city key (the part of the file name before the first '_') to
# {'images': <path>, 'places': <path>}.
city_files = {}

for file_name in all_data_files:
    file_path = os.path.join(data_path, file_name)

    # Ignore sub-directories such as '.ipynb_checkpoints'.
    if not os.path.isfile(file_path):
        continue

    if 'images' in file_name:
        file_kind = 'images'
    elif 'places' in file_name:
        file_kind = 'places'
    else:
        # Unrelated file: do not register an empty city entry for it, because
        # a city without paths would later be passed to pd.read_csv as None.
        continue

    parts = file_name.split('_')
    city_key = parts[0]
    city_files.setdefault(city_key, {})[file_kind] = file_path

# Debug aid: uncomment to list the files discovered for each city.
# for city, files in city_files.items():
#     print(f"City: {city}")
#     print(f"Images file: {files.get('images', 'Not found')}")
#     print(f"Places file: {files.get('places', 'Not found')}")
#     print()

In [4]:
def _merge_places_with_images(places, images):
    """Join one city's places table to its images table.

    Places are first collapsed to one record per 'WikiData' id, then joined to
    the images on the place name, and finally every row of a place is given a
    single name picked by ``utils.choose_name``.

    Rows whose place has no image are printed and then dropped, because the
    caller has nothing to decode or save for them.

    Args:
        places: DataFrame with at least the columns 'WikiData', 'Name',
            'Kind', 'City', 'Rate', 'Lon' and 'Lat'.
        images: DataFrame with at least the columns 'name' and 'image'.

    Returns:
        DataFrame with a fresh 0..n-1 index and a non-null 'image' column.
    """
    aggregated_places = places.groupby('WikiData').agg({
        # presumably returns a list of name variants, which is why the column
        # is exploded below -- confirm against utils.aggregate_names
        'Name': utils.aggregate_names,
        'Kind': 'first',
        'City': 'first',
        'Rate': 'first',
        'Lon': 'mean',
        'Lat': 'mean',
    }).reset_index()

    merged = pd.merge(aggregated_places.explode('Name'),
                      images.explode('name'),
                      left_on='Name', right_on='name', how='left')

    # Show the records for which no matching image was found.
    unmatched_records = merged[merged['name'].isnull()]
    if not unmatched_records.empty:
        print(unmatched_records)

    # Replace the per-row name variants with one chosen name per WikiData id.
    chosen_names = merged.groupby('WikiData')['Name'].agg(utils.choose_name).reset_index()
    merged = pd.merge(chosen_names, merged, on='WikiData', how='left')
    merged = (merged
              .rename(columns={'Name_x': 'Name'})
              .drop(columns=['Name_y', 'name']))

    # The left join leaves NaN in 'image' for places without a picture;
    # base64.b64decode would raise TypeError on those, so drop them here.
    return merged[merged['image'].notna()].reset_index(drop=True)


def _save_images(merged_data, output_dir, first_row_number, log_path="success.txt"):
    """Decode every base64 image and write it to '<output_dir>/<place name>/'.

    Args:
        merged_data: DataFrame with 'Name' and 'image' columns.
        output_dir: Root folder that receives one sub-folder per place name.
        first_row_number: Number used in the first file name; later rows count
            up from it.
        log_path: Text file to which the name of each saved place is appended.

    Returns:
        List of written image paths, in row order.
    """
    image_paths = []
    # Open the log once instead of once per row. An explicit encoding keeps
    # non-ASCII place names from failing under a platform default codec.
    with open(log_path, "a", encoding="utf-8") as log:
        rows = zip(merged_data['Name'], merged_data['image'])
        for row_number, (name, encoded_image) in enumerate(rows, start=first_row_number):
            image_data = base64.b64decode(encoded_image)
            class_dir = os.path.join(output_dir, utils.replace_forbidden_chars(name))
            os.makedirs(class_dir, exist_ok=True)
            image_path = os.path.join(class_dir, f"image_{row_number}.jpg")
            with open(image_path, "wb") as file:
                file.write(image_data)
            # Log only after the image has actually been written.
            print(name, file=log)
            image_paths.append(image_path)
    return image_paths


def prosess_data(city_files):
    """Merge places with images for every city, save the images, build one table.

    For each city the places and images CSV files are read and joined, every
    image is decoded and written to '<curr_dir>/data_by_name/<place name>/',
    and the base64 'image' column is replaced by an 'image_path' column. The
    per-city tables are concatenated, written to 'processed_data.csv' and
    returned.

    Image files are numbered with the row's position in the returned table, so
    the same place name occurring in two cities no longer overwrites files.

    Args:
        city_files: Mapping of city key to a dict holding the CSV paths under
            the keys 'places' and 'images'. Cities missing either path are
            reported and skipped.

    Returns:
        DataFrame with one row per saved image.

    Raises:
        ValueError: If ``city_files`` is empty or no city has both files.
    """
    if not city_files:
        raise ValueError("city_files is empty: there is no city data to process")

    output_dir = os.path.join(curr_dir, 'data_by_name')
    os.makedirs(output_dir, exist_ok=True)

    per_city_tables = []
    rows_done = 0  # rows already emitted; equals the next row's final index

    for city, files in city_files.items():
        places_path = files.get('places')
        images_path = files.get('images')
        if places_path is None or images_path is None:
            print(f"Skipping {city!r}: places or images file is missing")
            continue

        places = pd.read_csv(places_path, sep=',')
        images = pd.read_csv(images_path, sep=',')

        merged_data = _merge_places_with_images(places, images)
        image_paths = _save_images(merged_data, output_dir, rows_done)

        merged_data = merged_data.drop(columns=['image']).assign(image_path=image_paths)
        per_city_tables.append(merged_data)
        rows_done += len(merged_data)

    if not per_city_tables:
        raise ValueError("no city has both a places file and an images file")

    # Concatenate once at the end instead of growing the frame inside the loop.
    merged_data_all = pd.concat(per_city_tables, ignore_index=True)

    csv_file_path = 'processed_data.csv'
    merged_data_all.to_csv(csv_file_path, index=False)
    return merged_data_all

In [5]:
# Reset first so that, if the call below fails, no table from an earlier
# execution of this cell is left behind in the kernel.
merged_data_all = None
# Runs the full pipeline: writes the image files and 'processed_data.csv'.
merged_data_all = prosess_data(city_files)

In [6]:
# Preview the first rows of the combined table.
merged_data_all.head()

Unnamed: 0,WikiData,Name,Kind,City,Rate,Lon,Lat,image_path
0,Q106150540,"Здание бывшей гостиницы ""Мадрид""","architecture,historic_architecture,accomodatio...",Екатеринбург,3h,60.588634,56.886154,c:\Users\alex\Desktop\NTO\data_by_name\Здание ...
1,Q106150540,"Здание бывшей гостиницы ""Мадрид""","architecture,historic_architecture,accomodatio...",Екатеринбург,3h,60.588634,56.886154,c:\Users\alex\Desktop\NTO\data_by_name\Здание ...
2,Q106150540,"Здание бывшей гостиницы ""Мадрид""","architecture,historic_architecture,accomodatio...",Екатеринбург,3h,60.588634,56.886154,c:\Users\alex\Desktop\NTO\data_by_name\Здание ...
3,Q106150540,"Здание бывшей гостиницы ""Мадрид""","architecture,historic_architecture,accomodatio...",Екатеринбург,3h,60.588634,56.886154,c:\Users\alex\Desktop\NTO\data_by_name\Здание ...
4,Q106150540,"Здание бывшей гостиницы ""Мадрид""","architecture,historic_architecture,accomodatio...",Екатеринбург,3h,60.588634,56.886154,c:\Users\alex\Desktop\NTO\data_by_name\Здание ...
