# Intento de modelado 1

## Elección de modelo

### KNNeighbors Regression



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
import swifter

%matplotlib inline
%config IPCompleter.greedy = True

In [2]:
df = pd.read_csv('NuevoCSV2.csv')

In [3]:
df.dtypes

fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
distance             float64
dtype: object

In [4]:
df.head(n=4)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1,94.358403
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,796.956699
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2,136.740367
3,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1,253.40473


In [None]:
df.info(memory_usage='deep')

In [None]:
for dtype in ['float','int','object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

In [6]:
# recuperamos las fechas en formato %m-%d 
# con esto reducimos cardinalidad aunque nos arriesgamos a generar error dado 
# que las fechas se distribuyen a lo largo de los años y el precio puede variar
# con el tiempo, no solo con la temporada
def fechero(fecha):
    x=fecha[5:10]
    return x

Vec = df.filter(['pickup_datetime'])
Vec

Unnamed: 0,pickup_datetime
0,2009-06-15 17:26:21+00:00
1,2010-01-05 16:52:16+00:00
2,2011-08-18 00:35:00+00:00
3,2012-04-21 04:30:42+00:00
4,2010-03-09 07:51:00+00:00
...,...
53717845,2014-03-15 03:28:00+00:00
53717846,2009-03-24 20:46:20+00:00
53717847,2011-04-02 22:04:24+00:00
53717848,2011-10-26 05:57:51+00:00


In [7]:
Vec['pickup_datetime'] = Vec['pickup_datetime'].swifter.apply(fechero)
Vec

Pandas Apply:   0%|          | 0/53717850 [00:00<?, ?it/s]

Unnamed: 0,pickup_datetime
0,06-15
1,01-05
2,08-18
3,04-21
4,03-09
...,...
53717845,03-15
53717846,03-24
53717847,04-02
53717848,10-26


In [None]:
Vec['pickup_datetime'].unique()

Con la celda anterior podemos percatarnos de que están los 366 días existentes del año 

A continuación exploraremos si los datos de las fechas no están desequilibrados

In [None]:
aux = Vec.groupby(['pickup_datetime'])
print('Cantidad promedio de fechas por dato: ',aux.size().mean())
print('Mínimos: ',aux.size().min())
print('Máximos: ',aux.size().max())
print('#################################')
print(aux.size())

In [8]:
import warnings
warnings.filterwarnings('ignore')

df.insert(8,"dates",Vec['pickup_datetime'],True)
print(df)

          fare_amount            pickup_datetime  pickup_longitude  \
0                 4.5  2009-06-15 17:26:21+00:00        -73.844311   
1                16.9  2010-01-05 16:52:16+00:00        -74.016048   
2                 5.7  2011-08-18 00:35:00+00:00        -73.982738   
3                 7.7  2012-04-21 04:30:42+00:00        -73.987130   
4                 5.3  2010-03-09 07:51:00+00:00        -73.968095   
...               ...                        ...               ...   
53717845         14.0  2014-03-15 03:28:00+00:00        -74.005272   
53717846          4.2  2009-03-24 20:46:20+00:00        -73.957784   
53717847         14.1  2011-04-02 22:04:24+00:00        -73.970505   
53717848         28.9  2011-10-26 05:57:51+00:00        -73.980901   
53717849          7.5  2014-12-12 11:33:00+00:00        -73.969722   

          pickup_latitude  dropoff_longitude  dropoff_latitude  \
0               40.721319         -73.841610         40.712278   
1               40.711303  

In [9]:
df.shape

(53717850, 9)

In [10]:
df.head(4)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,dates
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1,94.358403,06-15
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,796.956699,01-05
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2,136.740367,08-18
3,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1,253.40473,04-21


In [11]:
df= df.drop(['pickup_datetime'], axis=1)

In [12]:
df.shape

(53717850, 8)

In [13]:
df.head(4)

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,dates
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,94.358403,06-15
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,796.956699,01-05
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,136.740367,08-18
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,253.40473,04-21


In [14]:
ruta = "./dataframe.csv"
df.to_csv(ruta, index=False)