### Import bibliotek

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import time

import utils as u

### Wczytanie danych

In [2]:
df_train = pd.read_hdf("../input/train_data.h5")
df_test = pd.read_hdf("../input/test_data.h5")

print('train notna price: ', df_train['price'].notna().all())
print('test is price: ', df_train['price'].isna().all())

df = pd.concat([df_train, df_test])
# Indeksy są od 0 w obu ramkach, więc muszę zrobić reset
df.reset_index(drop=True, inplace=True)
print(df.shape)
df.dtypes

train notna price:  True
test is price:  False
(34180, 8)


geo_block      object
breadcrumbs    object
price          object
owner          object
params         object
date           object
user_block     object
id              int64
dtype: object

In [3]:
def parse_price(val):
    if isinstance(val, str): 
        if "₽" in val:
            val = val.split('₽')[0]
            
        val = val.replace(' ', '')
        return int(val) / 1000000
    
    return float(val)

df['price'] = df['price'].map(parse_price)

### geo_block

In [4]:
df['geo_block'].apply(lambda x: len(x)).value_counts()

4     16189
2      9453
6      6158
8      2370
10        8
12        2
Name: geo_block, dtype: int64

In [5]:
df['geo_block'].apply(lambda x: len(list(set(x)))).value_counts()

2    16189
1     9453
3     6169
4     2359
5        8
6        2
Name: geo_block, dtype: int64

**Usunięcie duplikatów i utworzenie 5 nowych kolumn z danymi geo**

In [6]:
df['geo_block_len'] = df['geo_block'].apply(lambda x: len(list(set(x))))

In [8]:
for idx in range(5):
    df[f'geo_block_{idx}'] = df['geo_block'].apply(lambda x: u.get_list_el(list(set(x)),idx))
    
df.drop(columns='geo_block', inplace=True)

### breadcrumbs    

In [9]:
df['breadcrumbs'].apply(lambda x: len(x)).value_counts()

4    13522
2     8481
5     5171
3     4415
7     2124
1      386
6       78
8        3
Name: breadcrumbs, dtype: int64

In [10]:
df['breadcrumbs'].apply(lambda x: len(list(set(x)))).value_counts()

4    13522
2     8481
5     5171
3     4415
7     2113
1      386
6       89
8        3
Name: breadcrumbs, dtype: int64

**utworzenie 8 nowych kolumn z danymi breadcrumbs**

In [11]:
df['breadcrumbs_len'] = df['breadcrumbs'].apply(lambda x: len(list(set(x))))

In [13]:
for idx in range(5):
    df[f'breadcrumbs_{idx}'] = df['breadcrumbs'].apply(lambda x: u.get_list_el(list(set(x)),idx))
    
df.drop(columns='breadcrumbs', inplace=True)

### owner          

In [14]:
df['owner'].apply(lambda x: len(x)).value_counts()

0    30647
1     3533
Name: owner, dtype: int64

In [15]:
df['owner'].apply(lambda x: len(list(set(x)))).value_counts()

0    30647
1     3533
Name: owner, dtype: int64

**utworzenie kolumny z nazwą właściciela**

In [16]:
df['owner_len'] = df['owner'].apply(lambda x: len(list(set(x))))

df['owner_name'] = df['owner'].apply(lambda x: u.get_list_el(list(set(x)),0))
df.drop(columns='owner', inplace=True)

### date    

In [17]:
df['date'].apply(lambda x: len(x)).value_counts()

4    27282
3     6853
2       45
Name: date, dtype: int64

**utworzenie 4 nowych kolumn z danymi datowymi**

In [18]:
df['date_len'] = df['date'].apply(lambda x: len(list(set(x))))

In [19]:
for idx in range(4):
    df[f'date_{idx}'] = df['date'].apply(lambda x: u.get_list_el(list(set(x)),idx))
    
df.drop(columns='date', inplace=True)

### user_block    

In [20]:
df['user_block'].apply(lambda x: len(x)).value_counts()

2    30637
0     3543
Name: user_block, dtype: int64

**utworzenie 2 nowych kolumn z danymi user_block**

In [21]:
df['user_block_len'] = df['user_block'].apply(lambda x: len(list(set(x))))

In [22]:
for idx in range(2):
    df[f'user_block_{idx}'] = df['user_block'].apply(lambda x: u.get_list_el(list(set(x)),idx))
    
df.drop(columns='user_block', inplace=True)

### params

In [23]:
params = df["params"].apply(pd.Series)
params = params.fillna(-1)

if "Охрана:" not in df:
    df = pd.concat([df, params], axis=1)

In [24]:
df.drop(columns='params', inplace=True)

### adres


In [25]:
df['Адрес:'].nunique()

def split_str_list(string, sep):
    
    try:
        return string.split(sep)
    except: 
        return None

df['Адрес:'] = df['Адрес:'].apply(lambda x: split_str_list(x,','))

In [26]:
df['Адрес:'].apply(lambda x: len(x)).value_counts()

1     26866
3      2192
4      1944
5      1717
6       730
7       259
2       245
12       97
8        83
15       39
9         4
10        2
21        1
11        1
Name: Адрес:, dtype: int64

**utworzenie 15 nowych kolumn z rozdzielonym adresem**

In [27]:
df['Адрес:_len'] = df['Адрес:'].apply(lambda x: len(list(set(x))))

In [29]:
for idx in range(15):
    df[f'Адрес:_{idx}'] = df['Адрес:'].apply(lambda x: u.get_list_el(list(set(x)),idx))
    
df.drop(columns='Адрес:', inplace=True)

In [30]:
df[df.price.notna()].to_csv("../interim/01_train_data.csv", sep='|', index=False)
df[df.price.isna()].to_csv("../interim/01_test_data.csv", sep='|', index=False)

## KONIEC