In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the training and test tables from the data directory one level up.
train, test = pd.read_csv("../data/train.csv"), pd.read_csv("../data/test.csv")

In [4]:
# Convert the `fecha` column to pandas datetimes so the `.dt` accessor can be used on it.
train['fecha'] = pd.to_datetime(train['fecha'])

In [5]:
# Work on an explicit copy of the two columns needed for date features:
# later cells add columns to `train_date`, and the copy guarantees those
# writes never touch (or warn about touching) the original `train` frame.
train_date = train.loc[:, ['id', 'fecha']].copy()

In [6]:
train_date.head()

Unnamed: 0,id,fecha
0,254099,2015-08-23
1,53461,2013-06-28
2,247984,2015-10-17
3,209067,2012-03-09
4,185997,2016-06-07


In [7]:
# Break `fecha` down into its calendar components, one new column each.
train_date = train_date.assign(
    year=train_date['fecha'].dt.year,
    mes=train_date['fecha'].dt.month,
    dia_del_mes=train_date['fecha'].dt.day,
    dia_del_anio=train_date['fecha'].dt.dayofyear,
    quarter=train_date['fecha'].dt.quarter,
)

In [12]:
# Day of the week as an integer (Monday=0 ... Sunday=6), taken from the
# vectorised `.dt` accessor instead of calling a Python lambda on every row.
train_date['dia_de_semana'] = train_date['fecha'].dt.dayofweek
# Weekend flag: 1 when the day is Saturday (5) or Sunday (6), otherwise 0.
train_date['es_finde'] = train_date['fecha'].dt.dayofweek.isin([5, 6]).astype(int)

In [13]:
train_date.head()

Unnamed: 0,id,fecha,year,mes,dia_del_mes,dia_del_anio,quarter,dow,dia_de_semana,es_finde
0,254099,2015-08-23,2015,8,23,235,3,6,6,1
1,53461,2013-06-28,2013,6,28,179,2,4,4,0
2,247984,2015-10-17,2015,10,17,290,4,5,5,1
3,209067,2012-03-09,2012,3,9,69,1,4,4,0
4,185997,2016-06-07,2016,6,7,159,2,1,1,0


In [14]:
def make_harmonic_features(value, period):
    """Project a cyclic quantity onto the unit circle.

    Parameters
    ----------
    value : number or numpy array
        Position within the cycle (for example a weekday number).
    period : number
        Length of one full cycle (for example 7 for weekdays). Must be
        non-zero, otherwise the division raises ``ZeroDivisionError``.

    Returns
    -------
    tuple
        ``(cos, sin)`` of the angle ``2 * pi * value / period``.
    """
    # Compute the angle into a new object instead of using ``*=``: the
    # in-place form overwrote a caller's float array and raised a casting
    # error when given an integer numpy array.
    angle = value * (2 * np.pi / period)
    return np.cos(angle), np.sin(angle)

In [16]:
# Store the (cos, sin) pair of each weekday, using a 7-day cycle, as a tuple per row.
train_date['harmonic_dia_de_semana'] = train_date['dia_de_semana'].apply(
    lambda weekday: make_harmonic_features(weekday, period=7)
)

In [17]:
train_date.harmonic_dia_de_semana

0          (0.6234898018587334, -0.7818314824680299)
1          (-0.9009688679024191, -0.433883739117558)
2         (-0.2225209339563146, -0.9749279121818236)
3          (-0.9009688679024191, -0.433883739117558)
4           (0.6234898018587336, 0.7818314824680298)
5           (0.6234898018587336, 0.7818314824680298)
6           (0.6234898018587336, 0.7818314824680298)
7          (-0.900968867902419, 0.43388373911755823)
8                                         (1.0, 0.0)
9          (-0.900968867902419, 0.43388373911755823)
10         (-0.900968867902419, 0.43388373911755823)
11                                        (1.0, 0.0)
12                                        (1.0, 0.0)
13          (0.6234898018587336, 0.7818314824680298)
14          (0.6234898018587336, 0.7818314824680298)
15          (0.6234898018587336, 0.7818314824680298)
16        (-0.22252093395631434, 0.9749279121818236)
17         (0.6234898018587334, -0.7818314824680299)
18        (-0.2225209339563146, -0.97492791218

In [183]:
from sklearn.model_selection import TimeSeriesSplit

In [184]:
# Draw 10,000 random rows to keep experiments fast. The fixed seed makes the
# same rows come back on every run, so downstream outputs are reproducible.
train_sample = train.sample(10000, random_state=42)

In [185]:
train_sample.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,...,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
220954,98893,terreno en venta en puebla,<p>terreno bien ubicado en calle galactica no....,Terreno,Galactica 10,Puebla,Puebla,0.0,,0.0,...,,,,2013-11-03,0.0,0.0,0.0,0.0,0.0,964800.0
91923,107209,preciosa casa en lomas de la herradura,<p>c&oacute;moda casa en excelente estado de c...,Casa,Seminario,Huixquilucan,Edo. de México,30.0,4.0,3.0,...,55576.0,19.411919,-99.265075,2013-11-19,0.0,1.0,0.0,1.0,1.0,5300000.0
111989,229880,venta casa rincón de las lomas,fraccionamiento cerrado con vigilancia \nrecié...,Casa,RINCON DE LAS LOMAS,Cuajimalpa de Morelos,Distrito Federal,20.0,3.0,2.0,...,23751.0,,,2014-10-29,0.0,0.0,0.0,1.0,1.0,5800000.0
179272,86670,somos los unicos en tizayuca con casas solas d...,somos la empresa con la casa con mas beneficio...,Casa,EL CARMEN TIZAYUCA,Tizayuca,Hidalgo,0.0,3.0,1.0,...,46800.0,,,2015-03-17,1.0,0.0,0.0,1.0,1.0,363000.0
72362,135983,casa en condominio en venta en centro sur,"-- rcv141010-fm-27 -- ,bonita casa ubicada ...",Casa en condominio,"AV. FRAY LUIS DE LEON, CONDOMINIO 1 1",Querétaro,Querétaro,10.0,3.0,2.0,...,83764.0,,,2015-11-01,0.0,0.0,0.0,0.0,0.0,1760000.0


In [186]:
# Put the sample in chronological order; the positional split below relies on it.
train_sample = train_sample.sort_values('fecha')

In [187]:
# Separate the target column (`precio`) from the remaining predictor columns.
y = train_sample['precio']
X = train_sample.drop(columns=['precio'])

In [188]:
X.shape

(10000, 22)

In [189]:
len(y)

10000

In [201]:
# Number of folds passed to TimeSeriesSplit.
value = 2
value

2

In [206]:
# Time-ordered splitter: every fold tests on rows that come after its training rows.
tss = TimeSeriesSplit(n_splits=value)

In [207]:
# `tss.split(X)` yields one (train, test) pair of positional index arrays per
# fold. Only the final fold is kept here, which is exactly what looping over
# every fold and overwriting the variables each time would leave behind.
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [208]:
X_train.shape

(6667, 22)

In [209]:
y_train.shape

(6667,)

In [213]:
y_test.shape

(3333,)

In [212]:
X_test.shape

(3333, 22)

In [214]:
X_train.fecha.dt.year.value_counts()

2015    2189
2014    1699
2013    1215
2012     964
2016     600
Name: fecha, dtype: int64

In [216]:
# Row counts per month for the 2016 part of the training fold, latest month first.
(
    X_train.loc[X_train['fecha'].dt.year == 2016, 'fecha']
    .dt.month
    .value_counts()
    .sort_index(ascending=False)
)

3    195
2    163
1    242
Name: fecha, dtype: int64

In [218]:
X_test.fecha.dt.year.value_counts()

2016    3333
Name: fecha, dtype: int64

In [217]:
X_test.fecha.dt.month.value_counts()

12    1204
4      285
6      268
10     265
11     261
7      256
9      255
8      250
5      240
3       49
Name: fecha, dtype: int64

In [138]:
X_train.shape

(6667, 22)

In [139]:
y_train.shape

(6667,)

In [140]:
# Share of the sampled rows that ended up in the test fold. The denominator is
# taken from `X` itself so it stays correct if the sample size changes.
len(X_test) / len(X)

0.3333