# Carga de datos inicial

Construcción de la muestra que usaremos como conjunto de entrenamiento y de la que usaremos como validación. Se construyen seleccionando los casos en 'stream'.

También hemos exportado los datos a parquet.

In [4]:
import numpy as np
import pandas as pd
import os
import random
import re
import funciones_auxiliares as f
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Preview the first two rows of the raw 1995 file to inspect its columns.
# The path is assembled with os.path.join instead of hard-coding Windows
# backslashes, so the cell resolves the same file on any operating system.
pd.read_csv(
    os.path.join("..", "data", "Por fecha", "UK_Property_Price_Data_1995.csv"),
    header=0,
    sep=',',
    nrows=3,
).head(2)

Unnamed: 0,Transaction unique identifier,Price,Date of Transfer,Postcode,Property Type,Old/New,Duration,PAON,SAON,Street,Locality,Town/City,District,County,PPD Category Type,Record Status - monthly file only
0,{940736E4-9649-4080-B74D-1DA6A8A60FA2},35000,1995-01-01 00:00,PL2 1PH,T,N,F,62,,ALEXANDRA ROAD,FORD,PLYMOUTH,PLYMOUTH,DEVON,A,A
1,{E32B0A70-13E1-4F34-8E38-4D7950245171},50000,1995-01-01 00:00,BS1 6XF,F,N,L,WEARE COURT,7.0,CANADA WAY,BRISTOL,BRISTOL,BRISTOL,AVON,A,A


De aquí podemos deducir el tipo de cada columna; indicarlo explícitamente acelera mucho la carga bajo demanda de las columnas de estos ficheros:

In [10]:
# Explicit dtype for each column of the raw CSV files, keyed by column name:
# 'Price' is declared as an integer and every other column as a string.
columns_dtype = {
    column: np.dtype(int) if column == 'Price' else np.dtype(str)
    for column in (
        'Transaction unique identifier',
        'Price',
        'Date of Transfer',
        'Postcode',
        'Property Type',
        'Old/New',
        'Duration',
        'PAON',
        'SAON',
        'Street',
        'Locality',
        'Town/City',
        'District',
        'County',
        'PPD Category Type',
        'Record Status - monthly file only',
    )
}

In [4]:
# Load the 1999 data through the project helper as a quick sanity check.
# NOTE(review): nrows=-1 presumably means "no row limit" and percentage=0.01
# presumably keeps about 1% of the rows — confirm in funciones_auxiliares.
example = f.load_date(
    1999,
    nrows=-1,
    columns_dtype=columns_dtype,
    percentage=0.01,
    seed=33,
)

# Display the first row of the loaded frame.
example.head(1)

Unnamed: 0,Transaction.unique.identifier,Price,Date.of.Transfer,Postcode,Property.Type,Old.New,Duration,PAON,SAON,Street,Locality,Town.City,District,County,PPD.Category.Type,Record.Status...monthly.file.only
0,{B487E3FB-3907-4D42-9A45-C907175616A8},49000,1999-01-01 00:00,CT11 7BB,Adosado,Segunda_mano,Propiedad,56,,FAIRFIELD ROAD,,RAMSGATE,THANET,KENT,A,A


Generamos muestra de los datos:

In [7]:
# Write one sampled CSV per year (1995-2023) into ../data/sample.
# to_csv cannot write into a folder that does not exist, so create it first;
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs('../data/sample', exist_ok=True)

for year in np.arange(1995, 2024, 1):
    f.load_date(year, columns_dtype=columns_dtype, percentage=0.05, seed=33).to_csv(
        path_or_buf=f'../data/sample/{year}.csv', sep=','
    )

Separar en Train y Test:

+ Train: datos desde el 1995 al 2020

+ Test: datos desde el 2021 al 2023

In [14]:
# Train set: one sampled CSV per year from 1995 to 2020 into ../data/train.
# to_csv cannot write into a folder that does not exist, so create it first;
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs('../data/train', exist_ok=True)

for year in np.arange(1995, 2021, 1):
    f.load_date(year, columns_dtype=columns_dtype, percentage=0.1, seed=1).to_csv(
        path_or_buf=f'../data/train/{year}.csv', sep=','
    )

In [15]:
# Test set: one sampled CSV per year from 2021 to 2023 into ../data/test.
# to_csv cannot write into a folder that does not exist, so create it first;
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs('../data/test', exist_ok=True)

for year in np.arange(2021, 2024, 1):
    f.load_date(year, columns_dtype=columns_dtype, percentage=0.1, seed=1).to_csv(
        path_or_buf=f'../data/test/{year}.csv', sep=','
    )

## Formato parquet

https://stackoverflow.com/questions/26124417/how-to-convert-a-csv-file-to-parquet

In [12]:
# Export the test years (2021-2023) into a single Parquet file, writing one
# year per write_table call.
parquet_file = '../data/parquet/test/test.parquet'

# The writer cannot create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(parquet_file), exist_ok=True)

parquet_writer = None
try:
    for i, year in enumerate(np.arange(2021, 2024, 1)):
        print(f"\rChunk{i}", end="")
        chunk = f.load_date(year, columns_dtype=columns_dtype)
        if parquet_writer is None:
            # Take the schema from the first year's frame and reuse it for
            # every later year so all writes share one layout.
            parquet_schema = pa.Table.from_pandas(df=chunk).schema
            # Open the Parquet file for writing
            parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
        # Convert this year's frame with the fixed schema and append it
        table = pa.Table.from_pandas(chunk, schema=parquet_schema)
        parquet_writer.write_table(table)
finally:
    # Close the writer even when loading or writing a year fails, so the
    # file handle is not left open.
    if parquet_writer is not None:
        parquet_writer.close()

Chunk 0
Chunk 1
Chunk 2


In [13]:
# Export the train years (1995-2020) into a single Parquet file, writing one
# year per write_table call.
parquet_file = '../data/parquet/train/train.parquet'

# The writer cannot create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(parquet_file), exist_ok=True)

parquet_writer = None
try:
    for i, year in enumerate(np.arange(1995, 2021, 1)):
        print(f"\rChunk {i}", end="")
        chunk = f.load_date(year, columns_dtype=columns_dtype)
        if parquet_writer is None:
            # Take the schema from the first year's frame and reuse it for
            # every later year so all writes share one layout.
            parquet_schema = pa.Table.from_pandas(df=chunk).schema
            # Open the Parquet file for writing
            parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
        # Convert this year's frame with the fixed schema and append it
        table = pa.Table.from_pandas(chunk, schema=parquet_schema)
        parquet_writer.write_table(table)
finally:
    # Close the writer even when loading or writing a year fails, so the
    # file handle is not left open.
    if parquet_writer is not None:
        parquet_writer.close()

Chunk25