# Обработка данных 

## Подключение 

In [1]:
import sys
print('Версия Python:', sys.version)

import numpy as np 
np.random.seed(42)


import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import re
import os

Версия Python: 3.11.8 (main, Feb 26 2024, 15:36:12) [Clang 14.0.6 ]


## Данные 

In [2]:
folders = ["data/" + i for i in os.listdir(f"data/") if "folder" in i]
print(folders)

['data/folder5', 'data/folder2', 'data/folder3', 'data/folder4', 'data/folder10', 'data/folder8', 'data/folder1', 'data/folder6', 'data/folder7', 'data/folder9']


In [3]:
dfs = []

for folder in folders:
    for filename in os.listdir(folder):
        if filename.endswith('.parquet'): 
            filepath = os.path.join(folder, filename)
            current_df = pd.read_parquet(filepath)
            dfs.append(current_df)

df = pd.concat(dfs, ignore_index=True)
print(df.shape)

df = df.drop_duplicates(subset=['link'])
df = df.dropna()
print(df.shape)

(7275, 7)
(4596, 7)


In [4]:
4596 / 7275

0.6317525773195877

## Обработка данных 

In [5]:
def get_count_rooms(lst):
    lst = list(filter(lambda x: 'Количество комнат' in x, lst))
    if len(lst) > 0: 
        return lst[0].replace('Количество комнат:', '')
    
def get_total_area(lst):
    lst = list(filter(lambda x: 'Общая площадь' in x, lst))
    if len(lst) > 0: 
        return float(re.sub(r'\xa0|м²', '', lst[0].replace('Общая площадь:', '')))

def get_kitchen_area(lst):
    lst = list(filter(lambda x: 'Площадь кухни' in x, lst))
    if len(lst) > 0: 
        return float(re.sub(r'\xa0|м²', '', lst[0].replace('Площадь кухни:', '')))

def get_living_area(lst):
    lst = list(filter(lambda x: 'Жилая площадь' in x, lst))
    if len(lst) > 0: 
        return float(re.sub(r'\xa0|м²', '', lst[0].replace('Жилая площадь:', '')))

def get_floor_info(lst):
    lst = list(filter(lambda x: 'Этаж' in x, lst))
    if len(lst) > 0:
        match = re.search(r'Этаж:(\d+)\s*из\s*(\d+)', lst[0])
        if match:
            current_floor = int(match.group(1))
            total_floors = int(match.group(2))
            return current_floor, total_floors
    return None, None

def get_balcony_info(lst):
    lst = list(filter(lambda x: 'Балкон или лоджия' in x, lst))
    if len(lst) > 0: 
        return lst[0].replace('Балкон или лоджия:', '')

def get_extra_info(lst):
    lst = list(filter(lambda x: 'Дополнительно' in x, lst))
    if len(lst) > 0: 
        return lst[0].replace('Дополнительно:', '')

def get_room_type(lst):
    lst = list(filter(lambda x: 'Тип комнат' in x, lst))
    if len(lst) > 0: 
        return lst[0].replace('Тип комнат:', '')

def get_bathroom_info(lst):
    lst = list(filter(lambda x: 'Санузел' in x, lst))
    if len(lst) > 0: 
        return lst[0].replace('Санузел:', '')

def get_repaire_info(lst):
    lst = list(filter(lambda x: 'Ремонт' in x, lst))
    if len(lst) > 0: 
        return lst[0].replace('Ремонт:', '')

def get_furniture_info(lst):
    lst = list(filter(lambda x: 'Мебель' in x, lst))
    if len(lst) > 0: 
        return lst[0].replace('Мебель:', '')

def get_technique_info(lst):
    lst = list(filter(lambda x: 'Техника' in x, lst))
    if len(lst) > 0: 
        return lst[0].replace('Техника:', '')

def get_tv_info(lst):
    lst = list(filter(lambda x: 'Интернет и ТВ' in x, lst))
    if len(lst) > 0: 
        return lst[0].replace('Интернет и ТВ:', '')

df['characteristics'] = df['characteristics'].apply(eval)
df['photo'] = df['photo'].apply(lambda x: list(set(eval(x))))
df['price'] = df['price'].astype(int)

df['count_rooms'] = df['characteristics'].apply(get_count_rooms)
df['count_total_area'] = df['characteristics'].apply(get_total_area)
df['kitchen_area'] = df['characteristics'].apply(get_kitchen_area)
df['living_area'] = df['characteristics'].apply(get_living_area)
df['current_floor'] = df['characteristics'].apply(lambda x: get_floor_info(x)[0])
df['total_floor'] = df['characteristics'].apply(lambda x: get_floor_info(x)[1])
df['balcony_info'] = df['characteristics'].apply(get_balcony_info)
df['extra_info'] = df['characteristics'].apply(get_extra_info)
df['room_type'] = df['characteristics'].apply(get_room_type)
df['bathroom_info'] = df['characteristics'].apply(get_bathroom_info)
df['repaire_info'] = df['characteristics'].apply(get_repaire_info)
df['furniture_info'] = df['characteristics'].apply(get_furniture_info)
df['technique_info'] = df['characteristics'].apply(get_technique_info)
df['tv_info'] = df['characteristics'].apply(get_tv_info)

# df = df.dropna()
print(df.shape[0])

1667


In [6]:
df.sample(1)

Unnamed: 0,title,price,characteristics,description,location,link,photo,count_rooms,count_total_area,kitchen_area,living_area,current_floor,total_floor,balcony_info,extra_info,room_type,bathroom_info,repaire_info,furniture_info,technique_info,tv_info
1435,"3-к. квартира, 100 м², 6/6 эт.",110000,"[Количество комнат:3, Общая площадь:100 м², Площадь кухни:32 м², Жилая площадь:50 м², Этаж:6 из 6, Балкон или лоджия:балкон, Тип комнат:изолированные, Санузел:раздельный, Ремонт:евро, Мебель:кухня, хранение одежды, спальные места, Техника:холодильник, плита, стиральная машина, посудомоечная машина, телевизор, Интернет и ТВ:Wi-Fi, телевидение]","Kваpтиpa 100 мeтpов квадратных нa Мoсковскoм Пpoспектe. B пeшeй дocтупности от метрo Пapк Пoбеды. Квapтиpа поcле капитaльногo pемoнтa. Полноcтью обоpудoвaнa мeбeлью и бытoвой тeхникой. В кухнe-гoстиной рaсполoжeн большoй угловoй дивaн, электрoкамин и телeвизоp, стол и кухонный гарнитур. Две спальни. Есть гардеробная комната. Также имеется вместительный шкаф в прихожей. Два санузла. Ванная комната 10 метров с окном.","Санкт-Петербург, Московский пр-т, 198",https://www.avito.ru/sankt-peterburg/kvartiry/3-k._kvartira_100_m_66_et._2648385544,"[https://60.img.avito.st/image/1/1.3RuSs7a6cfKkGrP3pJyqd1gQd_gmkHkwIxBz9DIWcw.4T5Aj7cTFiXbFpUHgry5PYAZfal_mjowBXZDdvXPx00, https://80.img.avito.st/image/1/1.rPJBCra6ABt3o8IeFXSksYOpBhH1KQjZ8KkCHeGvAg.n__ZJdfOwZgfONFJc-A9JpQ5fIJLzGgE4f_XIz_y8M4, https://50.img.avito.st/image/1/1.n2dNa7a6M457wvGLfy2XJI_INYT5SDtM_MgxiO3OMQ.83_IzJ20fud7tzT8vGNcO5124_wPJJtAR2lbLaG6nx0, https://90.img.avito.st/image/1/1.DyM6WLa6o8oM8WHPKBwHYPj7pcCOe6sIi_uhzJr9oQ.Aayl9TQrnzzP0YP51D9991ojMPX9pNtY8B-BbcWzq0Q, https://70.img.avito.st/image/1/1.PtdDQ7a6kj516lA7Mxd2vI7glDT3YJr88uCQOOPmkA.TOjNyj5iiCfxfCwHvd6EWwG-rYua1nM0mZiBq-v1quM, https://80.img.avito.st/image/1/1.6wsmDra6R-IQp4XnfGWnZ-ytQeiSLU8gl61F5IarRQ.Kb1DKHsDI7CqG0rReEmiOfNYLxHq_7h1OcSJaABxTDc, https://40.img.avito.st/image/1/1.uUiIULa6FaG--dek9hqxC0rzE6s8cx1jOfMXpyj1Fw.Rg0Qv4iNMr6sf2MDFlyPNRq60BVswQHyylhj9adjIJE, https://60.img.avito.st/image/1/1.LBhDura6gPF1E0L0F5osW4EZhvv3mYgz8hmC9-Mfgg.Cs1hGtFEzlIxKo37FqZhDF2BF5Qg3iTIIBuvlsSG-_4, https://40.img.avito.st/image/1/1.tPJBPra6GBt3l9oeJXS8sYOdHhH1HRDZ8J0aHeGbGg.yDMkwFVxo1QqCRWGPP4VyTlhp7Qz_oT8PDp9G6fNzIo, https://10.img.avito.st/image/1/1.aqPgU7a6xkrW-gRPuDomzyrwwEBUcM6IUfDETED2xA.d9yJ6dPpxiVS7xiz_U3EKNuaCi_ooNCX0PcihFoC3BU, https://00.img.avito.st/image/1/1.NpZc5ba6mn9qTFh6PJk-1Z5GnHXoxpK97UaYefxAmA.xOkkXCNV0TUctkgt97PvBhZLIRdlR9XUTYk004V-_VI, https://60.img.avito.st/image/1/1.2ysm4La6d8IQSbXHHM-sR-xDcciSw38Al0N1xIZFdQ.4vdF9-uVkZVAauHhjRpj97Xd7-LP48xGvYQSDCKYZ-4, https://00.img.avito.st/image/1/1.BdhLWba6qTF98Gs0TXEfm4n6rzv_eqHz-vqrN-v8qw.DrE4ybDIJZqK65iXplAs9v5y0Bj0-YfXjq9VHHwgztI, https://90.img.avito.st/image/1/1.DcmFTLa6oSCz5WMlkwgFikfvpyoxb6niNO-jJiXpow.RQbCIY4VQvy6WgGm8bNCXJ7KVHlTDxVDT7mHl6EBJsw, https://80.img.avito.st/image/1/1.r2dNd7a6A4573sGLHwmnJI_UBYT5VAtM_NQBiO3SAQ.bPNJfcJJyjpmJNx91Oj0Qe3QIBCzf-QWaFcFJHSL6w0, https://20.img.avito.st/image/1/1.E4N5Hra6v2pPt31vU2YbwLu9uWDNPbeoyL29bNm7vQ.0obygDPypKboUjB2yPrNnUj-IN5-_kypOIvycEbfVeI, https://80.img.avito.st/image/1/1.uKVcjra6FExqJ9ZJIIyau5EtEkborRyO7S0WSvwrFg.qTKc3pWR_PTsnc0zQD9AOkXc_1vt9dpr2WDMIRrx9lI, https://40.img.avito.st/image/1/1.zdhDaba6YTF1wKM0VVrfm4HKZzv3Smnz8spjN-PMYw.DbzBClY2PtWhjOSi-ZcIZB6ihdNdg5wnWJTpnz6IwNE, https://10.img.avito.st/image/1/1.I-N5Hra6jwpPt00PM2oroLu9iQDNPYfIyL2NDNm7jQ.TI-AK5PD0TI6VZlVa0fUBjJJqmey9vBupaWaLoClkVg, https://00.img.avito.st/image/1/1.8UiJgba6XaG_KJ-k58LkC0siW6s9olVjOCJfpykkXw.vNz9moeJqSWi-yv276Nsa5FQkYqZC2jS2mfjeLQ-TGo]",3,100.0,32.0,50.0,6,6,балкон,,изолированные,раздельный,евро,"кухня, хранение одежды, спальные места","холодильник, плита, стиральная машина, посудомоечная машина, телевизор","Wi-Fi, телевидение"


In [8]:
df.iloc[1435]['photo']

['https://50.img.avito.st/image/1/1.aKlpS7a5xEBf4gZFNWJUnXfqxkbX6kZIH-_GQtv-wEI.JRdgdOvqyTu9D3O4ymWIUJjZNS-RiUqM-ky_4G3sOOo',
 'https://90.img.avito.st/image/1/1.OvsIiba5lhI-IFQXcNUCzxYolBS2KBQafi2UELo8khA.epvk3QFmZwBo60ZHGlOYkkB3yoTu_eAbDEwRVBFCMcw',
 'https://90.img.avito.st/image/1/1.6K8ShLa5REYkLYZDTqeLojolRkCsJcZOZCBGRKAxQEQ.I2kJpORaLuwy-qHyRpyI-xYmOgARuYMUA0cPSD2SPSU',
 'https://50.img.avito.st/image/1/1.C6nknba5p0DSNGVF_oFjpMw8pUZaPCVIkjmlQlYoo0I.XtktmVU9q8Z22oMXzkDNvNoFHax2GbGuHuxgs13nyZs',
 'https://60.img.avito.st/image/1/1.maNIKra5NUp-g_dPdgjxrmCLN0z2i7dCPo43SPqfMUg.fm4-fPeJU7gF2xb3lpxwLsJeGhlTosP9O_qNveZF3rM',
 'https://00.img.avito.st/image/1/1.D9L0tLa5ozvCHWE-5pA05uoVoT1KFSEzghChOUYBpzk.zqNWfUBJF2hNeMM5BTmAdLUTvYrVJhS_Q8vWE1laMuE']