In [9]:
!pip install requests pandas
!pip install tqdm




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import requests
import pandas as pd
import time
from tqdm import tqdm

OVERPASS_URL = "http://overpass-api.de/api/interpreter"
WIKIPEDIA_API_URL = "https://ru.wikipedia.org/w/api.php"
WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"

all_processed_pois = []

def get_wikipedia_summary(title):
    """Получает краткое описание из Википедии по названию статьи."""
    try:
        params = {
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "exintro": True,
            "explaintext": True,
            "redirects": 1
        }
        response = requests.get(WIKIPEDIA_API_URL, params=params, timeout=10)
        data = response.json()
        page_id = next(iter(data['query']['pages']))
        if page_id != '-1':
            return data['query']['pages'][page_id].get('extract', '')
        return ""
    except requests.exceptions.Timeout:
        print(f"Таймаут при запросе к Википедии для {title}")
        return ""
    except requests.exceptions.RequestException as e:
        return ""
    except Exception as e:
        return ""

def get_wikidata_description(qid):
    try:
        params = {
            "action": "wbgetentities",
            "format": "json",
            "ids": qid,
            "props": "descriptions",
            "languages": "ru|en"
        }
        response = requests.get(WIKIDATA_API_URL, params=params, timeout=10)
        data = response.json()
        entity = data['entities'].get(qid)
        if entity and 'descriptions' in entity:
            if 'ru' in entity['descriptions']:
                return entity['descriptions']['ru']['value']
            elif 'en' in entity['descriptions']:
                return entity['descriptions']['en']['value']
        return ""
    except requests.exceptions.Timeout:
        print(f"Таймаут при запросе к Wikidata для {qid}")
        return ""
    except requests.exceptions.RequestException as e:
        return ""
    except Exception as e:
        return ""

def create_text_description(tags):
    description_parts = []

    if 'name' in tags:
        description_parts.append(tags['name'])

    tag_translations = {
        'amenity': 'Тип',
        'shop': 'Магазин',
        'tourism': 'Туризм',
        'leisure': 'Досуг',
        'historic': 'Исторический объект',
        'cuisine': 'Кухня'
    }

    for key, name in tag_translations.items():
        if key in tags:
            description_parts.append(f"{name}: {tags[key].replace('_', ' ')}")

    if 'opening_hours' in tags:
        description_parts.append(f"Часы работы: {tags['opening_hours']}")
    if 'wheelchair' in tags and tags['wheelchair'] == 'yes':
        description_parts.append("Доступно для инвалидных колясок")
    if 'internet_access' in tags and tags['internet_access'] == 'yes':
        description_parts.append("Есть доступ в интернет")
    if 'phone' in tags:
        description_parts.append(f"Телефон: {tags['phone']}")
    if 'website' in tags:
        description_parts.append(f"Вебсайт: {tags['website']}")

    if 'description' in tags:
        description_parts.append(f"Описание: {tags['description']}")

    extra_description = ""
    if 'wikipedia' in tags:
        parts = tags['wikipedia'].split(':')
        if len(parts) >= 2:
            lang = parts[0]
            title = parts[-1]
            if lang == 'ru':
                wiki_summary = get_wikipedia_summary(title)
                if wiki_summary:
                    extra_description = wiki_summary
    elif 'wikidata' in tags:
        wikidata_id = tags['wikidata']
        wiki_description = get_wikidata_description(wikidata_id)
        if wiki_description:
            extra_description = wiki_description

    if extra_description:
        description_parts.append(extra_description)

    return ". ".join(description_parts)

def process_city_pois(city_name):
    print(f"\n--- Начинаем обработку города: {city_name} ---")
    overpass_query = f"""
    [out:json][timeout:180];
    area[name="{city_name}"]->.searchArea;
    (
      node["tourism"](area.searchArea);
      way["tourism"](area.searchArea);

      node["leisure"](area.searchArea);
      way["leisure"](area.searchArea);

      node["historic"](area.searchArea);
      way["historic"](area.searchArea);

      node["place"~"square|fountain"](area.searchArea);
      way["place"~"square|fountain"](area.searchArea);

      node["natural"~"park|wood|garden|beach|peak|water"](area.searchArea);
      way["natural"~"park|wood|garden|beach|peak|water"](area.searchArea);

      node["amenity"~"arts_centre|theatre|cinema|museum|library|nightclub|bar|restaurant|cafe|pub|food_court|community_centre|marketplace|atm|bank|clinic|hospital|pharmacy|post_office|police|fire_station|school|university|kindergarten|dentist|veterinary|parking|toilets|fountain|place_of_worship|courthouse|embassy|townhall|public_bath|sauna|stripclub|brothel"](area.searchArea);
      way["amenity"~"arts_centre|theatre|cinema|museum|library|nightclub|bar|restaurant|cafe|pub|food_court|community_centre|marketplace|place_of_worship"](area.searchArea);
      node["amenity"~"^(arts_centre|theatre|cinema|museum|library|nightclub|bar|restaurant|cafe|pub|food_court|marketplace|fountain|public_bath|sauna|stripclub|brothel)$"](area.searchArea);
      way["amenity"~"^(arts_centre|theatre|cinema|museum|library|nightclub|bar|restaurant|cafe|pub|food_court|marketplace|fountain|public_bath|sauna|stripclub|brothel)$"](area.searchArea);

      node["shop"~"^(mall|department_store|books|gift|souvenir|art|antiques|craft|boutique|jewelry|leather|music|shoes|toys|video)$"](area.searchArea);
      way["shop"~"^(mall|department_store|books|gift|souvenir|art|antiques|craft|boutique|jewelry|leather|music|shoes|toys|video)$"](area.searchArea);
    );
    out body;
    >;
    out skel qt;
    """

    amenities_to_include = [
        "arts_centre", "theatre", "cinema", "museum", "library", "nightclub", "bar",
        "restaurant", "cafe", "pub", "food_court", "marketplace", "fountain",
        "public_bath", "sauna", "stripclub", "brothel", "casino", "ferry_terminal",
        "attraction", "theme_park", "water_park", "zoo", "aquarium", "planetarium",
        "gallery", "viewpoint", "observatory"
    ]
    shops_to_include = [
        "mall", "department_store", "books", "gift", "souvenir", "art", "antiques",
        "craft", "boutique", "jewelry", "leather", "music", "shoes", "toys", "video",
        "kiosk", "convenience"
    ]

    amenity_regex = "|".join(amenities_to_include)
    shop_regex = "|".join(shops_to_include)

    overpass_query_filtered = f"""
    [out:json][timeout:180];
    area[name="{city_name}"]->.searchArea;
    (
      node["tourism"](area.searchArea);
      way["tourism"](area.searchArea);
      node["leisure"](area.searchArea);
      way["leisure"](area.searchArea);
      node["historic"](area.searchArea);
      way["historic"](area.searchArea);

      node["place"~"square|fountain"](area.searchArea);
      way["place"~"square|fountain"](area.searchArea);
      node["natural"~"park|wood|garden|beach|peak|water"](area.searchArea);
      way["natural"~"park|wood|garden|beach|peak|water"](area.searchArea);

      node["amenity"~"^{amenity_regex}$"](area.searchArea);
      way["amenity"~"^{amenity_regex}$"](area.searchArea);

      node["shop"~"^{shop_regex}$"](area.searchArea);
      way["shop"~"^{shop_regex}$"](area.searchArea);
    );
    out body;
    >;
    out skel qt;
    """

    print(f"🚀 Отправляем запрос к Overpass API для {city_name}...")
    try:
        response = requests.get(OVERPASS_URL, params={'data': overpass_query_filtered}, timeout=180)
        print(f"✅ Запрос для {city_name} выполнен с кодом: {response.status_code}")
        if response.status_code == 200:
            data = response.json()
        else:
            print(f"Ошибка выполнения запроса для {city_name}: {response.text}")
            data = {'elements': []}
    except requests.exceptions.Timeout:
        print(f"Таймаут при запросе к Overpass API для {city_name}")
        data = {'elements': []}
    except requests.exceptions.RequestException as e:
        print(f"Ошибка запроса к Overpass API для {city_name}: {e}")
        data = {'elements': []}


    current_city_pois = []
    print(f"⚙️ Начинаем обработку {len(data['elements'])} объектов для {city_name}...")

    for element in tqdm(data['elements'], desc=f"Обработка POI в {city_name}"):
        if 'tags' in element:
            tags = element['tags']

            if 'name' not in tags:
                continue

            lat, lon = (0, 0)
            if element['type'] == 'node':
                lat = element.get('lat')
                lon = element.get('lon')
            elif 'center' in element:
                lat = element['center'].get('lat')
                lon = element['center'].get('lon')

            if lat == 0 and lon == 0:
                continue

            text_description = create_text_description(tags)

            current_city_pois.append({
                'id': element['id'],
                'type': element['type'],
                'lat': lat,
                'lon': lon,
                'name': tags.get('name'),
                'city': city_name,
                'text_description': text_description,
                'tags': tags
            })

        time.sleep(0.005) # 5 миллисекунд

    print(f"👍 Обработано {len(current_city_pois)} релевантных объектов с названиями для {city_name}.")

    all_processed_pois.extend(current_city_pois)


if __name__ == "__main__":
    cities_to_process = ["Казань", "Санкт-Петербург", "Москва"]

    for city in cities_to_process:
        process_city_pois(city)
        time.sleep(10)

    print("\n--- Обработка всех городов завершена! ---")
    print(f"Всего собрано {len(all_processed_pois)} POI.")

    if all_processed_pois:
        final_df = pd.DataFrame(all_processed_pois)
        final_df.drop_duplicates(subset=['id'], inplace=True)
        print(f"Всего уникальных POI после удаления дубликатов: {len(final_df)}")

        output_filename = 'Dataset/poi_dataset_russia_filtered_enriched.csv'
        final_df.to_csv(output_filename, index=False)
        print(f"💾 Окончательный датасет успешно сохранен в '{output_filename}'")
    else:
        print("⚠️ Общий датасет пуст. Возможно, возникли ошибки при сборе данных.")


--- Начинаем обработку города: Казань ---
🚀 Отправляем запрос к Overpass API для Казань...
✅ Запрос для Казань выполнен с кодом: 200
⚙️ Начинаем обработку 73622 объектов для Казань...


Обработка POI в Казань: 100%|██████████| 73622/73622 [06:29<00:00, 189.09it/s]


👍 Обработано 3060 релевантных объектов с названиями для Казань.

--- Начинаем обработку города: Санкт-Петербург ---
🚀 Отправляем запрос к Overpass API для Санкт-Петербург...
✅ Запрос для Санкт-Петербург выполнен с кодом: 200
⚙️ Начинаем обработку 413806 объектов для Санкт-Петербург...


Обработка POI в Санкт-Петербург: 100%|██████████| 413806/413806 [37:42<00:00, 182.87it/s] 


👍 Обработано 16858 релевантных объектов с названиями для Санкт-Петербург.

--- Начинаем обработку города: Москва ---
🚀 Отправляем запрос к Overpass API для Москва...
✅ Запрос для Москва выполнен с кодом: 200
⚙️ Начинаем обработку 608685 объектов для Москва...


Обработка POI в Москва: 100%|██████████| 608685/608685 [54:07<00:00, 187.45it/s] 


👍 Обработано 21178 релевантных объектов с названиями для Москва.

--- Обработка всех городов завершена! ---
Всего собрано 41096 POI.
Всего уникальных POI после удаления дубликатов: 41087
💾 Окончательный датасет успешно сохранен в 'poi_dataset_russia_filtered_enriched.csv'


In [13]:
import requests
import pandas as pd
import time
from tqdm import tqdm
import os

OVERPASS_URL = "http://overpass-api.de/api/interpreter"
WIKIPEDIA_API_URL = "https://ru.wikipedia.org/w/api.php"
WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"

def get_wikipedia_summary(title):
    try:
        params = {
            "action": "query", "format": "json", "titles": title,
            "prop": "extracts", "exintro": True, "explaintext": True, "redirects": 1
        }
        response = requests.get(WIKIPEDIA_API_URL, params=params, timeout=10)
        data = response.json()
        page_id = next(iter(data['query']['pages']))
        if page_id != '-1':
            return data['query']['pages'][page_id].get('extract', '')
        return ""
    except requests.exceptions.Timeout:
        return ""
    except requests.exceptions.RequestException:
        return ""
    except Exception:
        return ""

def get_wikidata_description(qid):
    try:
        params = {
            "action": "wbgetentities", "format": "json", "ids": qid,
            "props": "descriptions", "languages": "ru|en"
        }
        response = requests.get(WIKIDATA_API_URL, params=params, timeout=10)
        data = response.json()
        entity = data['entities'].get(qid)
        if entity and 'descriptions' in entity:
            if 'ru' in entity['descriptions']:
                return entity['descriptions']['ru']['value']
            elif 'en' in entity['descriptions']:
                return entity['descriptions']['en']['value']
        return ""
    except requests.exceptions.Timeout:
        return ""
    except requests.exceptions.RequestException:
        return ""
    except Exception:
        return ""

def create_text_description(tags):
    description_parts = []
    if 'name' in tags:
        description_parts.append(tags['name'])

    tag_translations = {
        'amenity': 'Тип', 'shop': 'Магазин', 'tourism': 'Туризм',
        'leisure': 'Досуг', 'historic': 'Исторический объект', 'cuisine': 'Кухня'
    }
    for key, name in tag_translations.items():
        if key in tags:
            description_parts.append(f"{name}: {tags[key].replace('_', ' ')}")

    if 'opening_hours' in tags: description_parts.append(f"Часы работы: {tags['opening_hours']}")
    if 'wheelchair' in tags and tags['wheelchair'] == 'yes': description_parts.append("Доступно для инвалидных колясок")
    if 'internet_access' in tags and tags['internet_access'] == 'yes': description_parts.append("Есть доступ в интернет")
    if 'phone' in tags: description_parts.append(f"Телефон: {tags['phone']}")
    if 'website' in tags: description_parts.append(f"Вебсайт: {tags['website']}")
    if 'description' in tags: description_parts.append(f"Описание: {tags['description']}")

    extra_description = ""
    if 'wikipedia' in tags:
        parts = tags['wikipedia'].split(':')
        if len(parts) >= 2:
            lang = parts[0]
            title = parts[-1]
            if lang == 'ru':
                wiki_summary = get_wikipedia_summary(title)
                if wiki_summary: extra_description = wiki_summary
    elif 'wikidata' in tags:
        wikidata_id = tags['wikidata']
        wiki_description = get_wikidata_description(wikidata_id)
        if wiki_description: extra_description = wiki_description

    if extra_description: description_parts.append(extra_description)
    return ". ".join(description_parts)

def process_city_pois(city_name):
    print(f"\n--- Начинаем обработку города: {city_name} ---")

    amenities_to_include = [
        "arts_centre", "theatre", "cinema", "museum", "library", "nightclub", "bar",
        "restaurant", "cafe", "pub", "food_court", "marketplace", "fountain",
        "public_bath", "sauna", "casino", "ferry_terminal", "attraction",
        "theme_park", "water_park", "zoo", "aquarium", "planetarium", "gallery",
        "viewpoint", "observatory", "place_of_worship", "community_centre", "social_facility"
    ]
    shops_to_include = [
        "mall", "department_store", "books", "gift", "souvenir", "art", "antiques",
        "craft", "boutique", "jewelry", "leather", "music", "shoes", "toys", "video",
        "convenience", "supermarket", "bakery", "beverages", "confectionery", "deli",
        "farm", "greengrocer", "ice_cream", "pastry", "wine", "stationery", "sports",
        "fashion", "perfumery"
    ]

    amenity_filter = "|".join(amenities_to_include)
    shop_filter = "|".join(shops_to_include)

    overpass_query_strict = f"""
    [out:json][timeout:180];
    area[name="{city_name}"]->.searchArea;
    (
      node["tourism"](area.searchArea);
      way["tourism"](area.searchArea);
      relation["tourism"](area.searchArea);

      node["leisure"](area.searchArea);
      way["leisure"](area.searchArea);
      relation["leisure"](area.searchArea);

      node["historic"](area.searchArea);
      way["historic"](area.searchArea);
      relation["historic"](area.searchArea);

      node["place"~"square|fountain"](area.searchArea);
      way["place"~"square|fountain"](area.searchArea);

      node["natural"~"park|wood|garden|beach|peak|water|forest|island|ridge|valley|volcano|wetland|glacier"](area.searchArea);
      way["natural"~"park|wood|garden|beach|peak|water|forest|island|ridge|valley|volcano|wetland|glacier"](area.searchArea);
      relation["natural"~"park|wood|garden|beach|peak|water|forest|island|ridge|valley|volcano|wetland|glacier"](area.searchArea);

      node["amenity"~"^{amenity_filter}$"](area.searchArea);
      way["amenity"~"^{amenity_filter}$"](area.searchArea);

      node["shop"~"^{shop_filter}$"](area.searchArea);
      way["shop"~"^{shop_filter}$"](area.searchArea);

      node["landuse"~"forest|park|recreation_ground|village_green"](area.searchArea);
      way["landuse"~"forest|park|recreation_ground|village_green"](area.searchArea);
    );
    out body;
    >;
    out skel qt;
    """

    print(f"🚀 Отправляем фильтрованный запрос к Overpass API для {city_name}...")
    try:
        response = requests.get(OVERPASS_URL, params={'data': overpass_query_strict}, timeout=180)
        print(f"✅ Запрос для {city_name} выполнен с кодом: {response.status_code}")
        if response.status_code == 200:
            data = response.json()
        else:
            print(f"Ошибка выполнения запроса для {city_name}: {response.text}")
            data = {'elements': []}
    except requests.exceptions.Timeout:
        print(f"Таймаут при запросе к Overpass API для {city_name}. Попробуйте увеличить timeout.")
        data = {'elements': []}
    except requests.exceptions.RequestException as e:
        print(f"Ошибка запроса к Overpass API для {city_name}: {e}")
        data = {'elements': []}

    current_city_pois = []
    print(f"⚙️ Начинаем обработку {len(data['elements'])} объектов, полученных из Overpass для {city_name}...")

    for element in tqdm(data['elements'], desc=f"Обработка и обогащение POI в {city_name}"):
        if 'tags' in element:
            tags = element['tags']

            if 'name' not in tags:
                continue

            lat, lon = (0, 0)
            if element['type'] == 'node':
                lat = element.get('lat')
                lon = element.get('lon')
            elif element['type'] == 'way' and 'center' in element:
                lat = element['center'].get('lat')
                lon = element['center'].get('lon')
            elif element['type'] == 'relation' and 'center' in element:
                lat = element['center'].get('lat')
                lon = element['center'].get('lon')
            else:
                continue

            if lat == 0 and lon == 0:
                continue

            text_description = create_text_description(tags)

            current_city_pois.append({
                'id': element['id'],
                'type': element['type'],
                'lat': lat,
                'lon': lon,
                'name': tags.get('name'),
                'city': city_name,
                'text_description': text_description,
                'tags': tags
            })

        time.sleep(0.005)

    print(f"👍 Всего обработано и добавлено в список: {len(current_city_pois)} POI для {city_name}.")

    return current_city_pois


if __name__ == "__main__":
    output_filename = 'Dataset/poi_dataset_russia_filtered_enriched.csv'

    if os.path.exists(output_filename):
        print(f"Загружаем существующий датасет из '{output_filename}'...")
        existing_df = pd.read_csv(output_filename)
        print(f"Загружено {len(existing_df)} POI из существующего файла.")
    else:
        print(f"Файл '{output_filename}' не найден. Создадим новый датасет.")
        existing_df = pd.DataFrame()

    cities_to_add = ["Екатеринбург"]

    newly_collected_pois = []
    for city in cities_to_add:
        city_data = process_city_pois(city)
        newly_collected_pois.extend(city_data)
        time.sleep(10)

    print("\n--- Сбор данных для новых городов завершен! ---")
    print(f"Всего собрано {len(newly_collected_pois)} новых POI.")

    combined_pois = pd.concat([existing_df, pd.DataFrame(newly_collected_pois)], ignore_index=True)

    print(f"Всего POI до удаления дубликатов: {len(combined_pois)}")
    combined_pois.dropna(subset=['name', 'lat', 'lon'], inplace=True)
    combined_pois.drop_duplicates(subset=['id'], inplace=True)

    print(f"Итоговое количество уникальных и полных POI: {len(combined_pois)}")

    combined_pois.to_csv(output_filename, index=False)
    print(f"💾 Обновленный датасет успешно сохранен в '{output_filename}'")

Загружаем существующий датасет из 'poi_dataset_russia_filtered_enriched.csv'...
Загружено 46354 POI из существующего файла.

--- Начинаем обработку города: Екатеринбург ---
🚀 Отправляем фильтрованный запрос к Overpass API для Екатеринбург...
✅ Запрос для Екатеринбург выполнен с кодом: 200
⚙️ Начинаем обработку 173809 объектов, полученных из Overpass для Екатеринбург...


Обработка и обогащение POI в Екатеринбург: 100%|██████████| 173809/173809 [15:47<00:00, 183.35it/s] 


👍 Всего обработано и добавлено в список: 4109 POI для Екатеринбург.

--- Сбор данных для новых городов завершен! ---
Всего собрано 4109 новых POI.
Всего POI до удаления дубликатов: 50463
Итоговое количество уникальных и полных POI: 50463
💾 Обновленный датасет успешно сохранен в 'poi_dataset_russia_filtered_enriched.csv'


In [8]:
import pandas as pd
import ast
import re
from bs4 import BeautifulSoup
from dateutil import parser as dateparser

df = pd.read_csv('Dataset/poi_dataset_russia_filtered_enriched.csv', dtype=str)
df = df.drop_duplicates('id')

tags_df = (
    df['tags']
    .apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else {})
    .apply(pd.Series)
)
tags_df = tags_df.rename(columns={c: c.replace(':','_').lower() for c in tags_df.columns})
tags_df = tags_df.drop(columns=['name'], errors=True)

tags_df['address'] = (
    tags_df[['addr_street','addr_housenumber','addr_floor']]
    .fillna('')
    .agg(' '.join, axis=1)
    .str.strip()
)

def normalize_phone(x):
    if not isinstance(x, str) or not x.strip():
        return None
    digits = re.sub(r'\D','',x)
    if len(digits) == 10:
        return '+7' + digits
    if len(digits) == 11 and digits.startswith('8'):
        return '+7' + digits[1:]
    if len(digits) == 11 and digits.startswith('7'):
        return '+' + digits
    return None

tags_df['phone_e164'] = (
    tags_df.get('contact_phone','')
    .combine_first(tags_df.get('phone',''))
    .apply(normalize_phone)
)

def normalize_url(x):
    if pd.isnull(x) or not x:
        return None
    return x if re.match(r'^https?://', x) else 'http://' + x

tags_df['website'] = tags_df.get('contact_website','').apply(normalize_url)

tags_df['check_date'] = tags_df.get('check_date','').apply(
    lambda x: dateparser.parse(x).date().isoformat() if pd.notnull(x) and x else None
)

tags_df['opening_hours'] = (
    tags_df.get('opening_hours','')
    .str.replace(r'\s*;\s*','; ', regex=True)
    .str.strip()
)

def normalize_wheelchair(x):
    if pd.isnull(x):
        return None
    val = x.lower()
    if val in ('yes','true'):
        return True
    if val in ('no','false'):
        return False
    return val

tags_df['wheelchair'] = tags_df.get('wheelchair','').apply(normalize_wheelchair)

category_cols = ['amenity','shop','tourism','historic','cuisine','memorial']
tags_df['categories'] = tags_df[category_cols].apply(
    lambda row: [v for v in row if pd.notnull(v) and v!=''], axis=1
)

df_clean = pd.concat([df.drop(columns=['tags']), tags_df], axis=1)

df_clean['name'] = df_clean['name'].str.strip().str.title()
if 'city' in df_clean.columns:
    df_clean['city'] = df_clean['city'].str.strip().str.title()

def clean_text(x):
    if pd.isnull(x):
        return ''
    raw = BeautifulSoup(x, 'html.parser').get_text()
    return re.sub(r'\s+', ' ', raw).strip()

df_clean['text_description'] = df_clean['text_description'].apply(clean_text)

output_path = 'poi_dataset_cleaned.csv'
df_clean.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to ./{output_path}")


Cleaned dataset saved to ./poi_dataset_cleaned.csv


In [9]:
import json
import pandas as pd
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)

MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype="auto",
    low_cpu_mem_usage=True
)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    max_new_tokens=200,
    do_sample=False
)

def enrich_poi_llama(description: str) -> dict:
    prompt = f"""
You are a travel assistant. Given the following description in Russian, output a JSON object with exactly two keys:
  "summary": one-sentence thematic summary like "романтическое место для вечерней прогулки",
  "themes": list of 2–4 keywords describing main place types or themes (e.g. ["музей","парк"]).

Description:
\"\"\"{description.strip()}\"\"\"

Respond ONLY with the JSON object.
"""
    out = generator(prompt)[0]["generated_text"]
    json_str = out[out.find("{"): out.rfind("}")+1]
    return json.loads(json_str)

df = pd.read_csv("poi_dataset_cleaned.csv", dtype=str)
summaries, themes = [], []

for desc in tqdm(df["text_description"], desc="Enriching with Llama2-7B 4bit"):
    if not isinstance(desc, str) or not desc.strip():
        summaries.append(None)
        themes.append([])
        continue
    try:
        enriched = enrich_poi_llama(desc)
        summaries.append(enriched.get("summary"))
        themes.append(enriched.get("themes", []))
    except Exception as e:
        print("Error:", e)
        summaries.append(None)
        themes.append([])

df["summary"] = summaries
df["themes"]  = themes
df.to_csv("poi_dataset_enriched_local.csv", index=False, encoding="utf-8-sig")
print("Done! Saved to poi_dataset_enriched_local.csv")


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


NameError: name 'torch' is not defined