In [2]:
import datetime
import gc
import pickle
import random
import sys
from pathlib import Path

import pandas as pd
import pandas_profiling as pp
import unidecode

from sklearn.model_selection import train_test_split

# Widen the notebook display limits for DataFrames.
# The fully-qualified "display.*" keys are used on purpose: pandas resolves
# option names by pattern matching, and the bare "max_columns"/"max_rows"
# patterns become ambiguous (OptionError) on pandas versions that also define
# "styler.render.max_columns"/"styler.render.max_rows".
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

In [3]:
# Directory the input CSV files are read from. The path is relative, so it is
# resolved against the working directory of the running kernel.
data_folder = Path("data")

In [4]:
# Columns read from arusok.csv (passed as `usecols` when the file is loaded).
arus_cols_to_keep = [
    # Key and descriptive attributes.
    "ARUSKOD",
    "ARUSMEGYE",
    *(f"ARUSTI{level}" for level in range(1, 5)),  # ARUSTI1 .. ARUSTI4
    "Z_LAPFELE",
    "ZVEVOCSAT",
    "CCHANNEL",
    "CNETWORK",
    "CNETTYPE",
    # Remaining lower-case columns, kept in their original order.
    "bevasarlokozpont", "egyiksincsakozelb", "felsooktatasiintezm",
    "furdo", "hotel", "irodahazhivatal",
    "korhazrendelo", "kosarlabdapalya", "kozepiskola",
    "mozi", "muzeum", "muvelodesikozpont",
    "ovoda", "parkjatszoter", "piac",
    "posta", "sportcsarnoksportpa", "sportletesitmeny",
    "stadioncsarnok", "strand", "szinhaz",
    "temeto", "templom", "teniszpalya",
    "uszoda", "udulo", "wellnessfitness",
]

# Columns read from lapok.csv (passed as `usecols` when the file is loaded).
lapok_cols_to_keep = [
    "LAPISSUGL",
    "LAPMEDTER",
    "LAPTERMCS",
    "LAPMEGJSZ",
    "LAPKIADO",
    "LAPARBRUT",
    "LAPARNET",
    # Numbered column families are generated instead of spelled out.
    *(f"LAPCS{idx}" for idx in (1, 2)),
    *(f"LAPPER{idx}" for idx in (1, 2)),
    *(f"LAPMELL{idx}" for idx in range(1, 11)),  # LAPMELL1 .. LAPMELL10
    "LAPFORMAT",
]

# Columns read from mozgas.csv (passed as `usecols` when the file is loaded).
mozgas_cols_to_keep = [
    "LAPISSUGL",
    "KFDELDATE",
    "ARUSKOD",
    # Each measure is followed by its "ELOZO_"-prefixed counterpart.
    *(
        prefix + measure
        for measure in ("NAPOK_POLCON", "NAPI_ELADOTT_DB")
        for prefix in ("", "ELOZO_")
    ),
]

In [5]:
# Read the three source tables from `data_folder`, restricting each file to
# the columns named in its matching *_cols_to_keep list.
arus, lapok, mozgas = (
    pd.read_csv(data_folder / file_name, usecols=wanted_columns)
    for file_name, wanted_columns in (
        ("arusok.csv", arus_cols_to_keep),
        ("lapok.csv", lapok_cols_to_keep),
        ("mozgas.csv", mozgas_cols_to_keep),
    )
)

In [6]:
# Build the combined frame: every mozgas row is kept (left joins) and enriched
# first with the arus columns via ARUSKOD, then with the lapok columns via
# LAPISSUGL.
data = pd.merge(
    pd.merge(mozgas, arus, how="left", on="ARUSKOD"),
    lapok,
    how="left",
    on="LAPISSUGL",
)

In [7]:
# Split the merged frame into train/test features and the NAPI_ELADOTT_DB
# target. A fixed `random_state` makes the shuffle deterministic, so a
# Restart & Run All regenerates exactly the same homework files.
data_train, data_test, target_train, target_test = train_test_split(
    data.drop("NAPI_ELADOTT_DB", axis=1),
    data["NAPI_ELADOTT_DB"],
    random_state=42,
)

# Make sure the export directory exists; `to_csv` does not create it.
homework_dir = Path("homework")
homework_dir.mkdir(parents=True, exist_ok=True)

# Indexes are dropped on export, so rows of data_train.csv and
# target_train.csv correspond by position only.
data_train.to_csv(homework_dir / "data_train.csv", index=False)
data_test.to_csv(homework_dir / "data_test.csv", index=False)
target_train.to_csv(homework_dir / "target_train.csv", index=False)
# NOTE(review): the test target is not exported — presumably withheld on
# purpose; confirm before re-enabling the line below.
#target_test.to_csv('homework/target_test.csv', index=False)

In [12]:
# Columns treated as categorical (cast to "object" before profiling).
# Each column is listed exactly once; the earlier version repeated
# "LAPISSUGL", "ARUSMEGYE" and "CNETTYPE", which only caused redundant casts.
categorical_variables = [
    "ARUSKOD",
    "LAPISSUGL",
    "ARUSMEGYE",
    "ARUSTI1",
    "ARUSTI2",
    "ARUSTI3",
    "ARUSTI4",
    "ZVEVOCSAT",
    "CCHANNEL",
    "CNETWORK",
    "CNETTYPE",
    "bevasarlokozpont",
    "egyiksincsakozelb",
    "felsooktatasiintezm",
    "furdo",
    "hotel",
    "irodahazhivatal",
    "korhazrendelo",
    "kosarlabdapalya",
    "kozepiskola",
    "mozi",
    "muzeum",
    "muvelodesikozpont",
    "ovoda",
    "parkjatszoter",
    "piac",
    "posta",
    "sportcsarnoksportpa",
    "sportletesitmeny",
    "stadioncsarnok",
    "strand",
    "szinhaz",
    "temeto",
    "templom",
    "teniszpalya",
    "uszoda",
    "udulo",
    "wellnessfitness",
    "LAPMEDTER",
    "LAPTERMCS",
    "LAPKIADO",
    "LAPCS1",
    "LAPCS2",
    "LAPPER1",
    "LAPPER2",
    "LAPMELL1",
    "LAPMELL2",
    "LAPMELL3",
    "LAPMELL4",
    "LAPMELL5",
    "LAPMELL6",
    "LAPMELL7",
    "LAPMELL8",
    "LAPMELL9",
    "LAPMELL10",
    "LAPFORMAT",
]

# Columns treated as numeric (converted with pd.to_numeric before profiling).
numeric_variables = [
    # from the arus column list
    "Z_LAPFELE",
    # from the lapok column list
    "LAPARBRUT", "LAPARNET", "LAPMEGJSZ",
    # from the mozgas column list
    "NAPOK_POLCON", "ELOZO_NAPOK_POLCON",
    "NAPI_ELADOTT_DB", "ELOZO_NAPI_ELADOTT_DB",
]

# Sanity check: every loaded column that is classified as neither numeric nor
# categorical ends up in this set.
left_out = {
    *arus_cols_to_keep,
    *lapok_cols_to_keep,
    *mozgas_cols_to_keep,
} - {*numeric_variables, *categorical_variables}

In [13]:
# Show which loaded columns are in neither numeric_variables nor
# categorical_variables.
left_out

{'KFDELDATE'}

In [14]:
# Re-attach the target to the training features (index-aligned join), then set
# explicit dtypes: numeric columns go through pd.to_numeric, categorical
# columns are stored as "object".
data_pp = data_train.join(target_train)
data_pp = data_pp.assign(
    **{name: pd.to_numeric(data_pp[name]) for name in numeric_variables}
)
data_pp = data_pp.astype({name: "object" for name in categorical_variables})

In [15]:
# Profile the prepared training frame. With `infer_dtypes=False` the report
# relies on the dtypes already set on the frame, so it must receive `data_pp`
# (explicitly typed, training rows only) rather than the raw merged `data`,
# which also contains the held-out test rows.
report = pp.ProfileReport(data_pp, infer_dtypes=False)

In [16]:
# Write the profiling report as HTML to output/pp_report.html.
report.to_file(Path("output") / "pp_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]