### Setup

In [139]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import geopy.distance
from sklearn.base import BaseEstimator, TransformerMixin

In [140]:
DATA_PATH = "uber.csv"

### Loading In Data

In [141]:
def load_uber_data(data_path=DATA_PATH):
    return pd.read_csv(data_path)

### Cleaning Data

In [142]:
data = load_uber_data()

In [143]:
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [144]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [145]:
imputer = SimpleImputer(strategy="median")
data_num = data.drop("key", axis=1)
data_num = data_num.drop("pickup_datetime", axis=1)
imputer.fit(data_num)

SimpleImputer(strategy='median')

In [146]:
X = imputer.transform(data_num)
data_clean = pd.DataFrame(X, columns=data_num.columns,
                          index=data_num.index)
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  float64
 1   fare_amount        200000 non-null  float64
 2   pickup_longitude   200000 non-null  float64
 3   pickup_latitude    200000 non-null  float64
 4   dropoff_longitude  200000 non-null  float64
 5   dropoff_latitude   200000 non-null  float64
 6   passenger_count    200000 non-null  float64
dtypes: float64(7)
memory usage: 10.7 MB


### Ordinal Encoder 

In [147]:
data_cat = data[["key","pickup_datetime"]]
data_cat.head(10)

Unnamed: 0,key,pickup_datetime
0,2015-05-07 19:52:06.0000003,2015-05-07 19:52:06 UTC
1,2009-07-17 20:04:56.0000002,2009-07-17 20:04:56 UTC
2,2009-08-24 21:45:00.00000061,2009-08-24 21:45:00 UTC
3,2009-06-26 08:22:21.0000001,2009-06-26 08:22:21 UTC
4,2014-08-28 17:47:00.000000188,2014-08-28 17:47:00 UTC
5,2011-02-12 02:27:09.0000006,2011-02-12 02:27:09 UTC
6,2014-10-12 07:04:00.0000002,2014-10-12 07:04:00 UTC
7,2012-12-11 13:52:00.00000029,2012-12-11 13:52:00 UTC
8,2012-02-17 09:32:00.00000043,2012-02-17 09:32:00 UTC
9,2012-03-29 19:06:00.000000273,2012-03-29 19:06:00 UTC


In [148]:
ordinal_encoder = OrdinalEncoder()
data_cat_encoded = ordinal_encoder.fit_transform(data_cat)
data_cat_encoded[:10]

array([[195992., 192625.],
       [ 16590.,  16295.],
       [ 19677.,  19335.],
       [ 14916.,  14649.],
       [176171., 172982.],
       [ 64132.,  62968.],
       [179789., 176538.],
       [123310., 121114.],
       [ 96971.,  95232.],
       [100726.,  98922.]])

In [149]:
ordinal_encoder.categories_

[array(['2009-01-01 01:15:22.0000006', '2009-01-01 01:59:17.0000001',
        '2009-01-01 02:05:03.0000003', ..., '2015-06-30 23:31:06.0000002',
        '2015-06-30 23:33:33.0000002', '2015-06-30 23:40:39.0000001'],
       dtype=object),
 array(['2009-01-01 01:15:22 UTC', '2009-01-01 01:59:17 UTC',
        '2009-01-01 02:05:03 UTC', ..., '2015-06-30 23:31:06 UTC',
        '2015-06-30 23:33:33 UTC', '2015-06-30 23:40:39 UTC'], dtype=object)]

### One Hot Encoder

In [150]:
cat_encoder = OneHotEncoder()
data_cat_1hot = cat_encoder.fit_transform(data_cat)
data_cat_1hot

<200000x396629 sparse matrix of type '<class 'numpy.float64'>'
	with 400000 stored elements in Compressed Sparse Row format>

In [151]:
data_cat_1hot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [152]:
cat_encoder.categories_

[array(['2009-01-01 01:15:22.0000006', '2009-01-01 01:59:17.0000001',
        '2009-01-01 02:05:03.0000003', ..., '2015-06-30 23:31:06.0000002',
        '2015-06-30 23:33:33.0000002', '2015-06-30 23:40:39.0000001'],
       dtype=object),
 array(['2009-01-01 01:15:22 UTC', '2009-01-01 01:59:17 UTC',
        '2009-01-01 02:05:03 UTC', ..., '2015-06-30 23:31:06 UTC',
        '2015-06-30 23:33:33 UTC', '2015-06-30 23:40:39 UTC'], dtype=object)]

### Transformer

In [155]:
class calculateCostPer(BaseEstimator, TransformerMixin):
    def __init__(self, find_distance=True):
        self.find_distance = find_distance
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X["cost_per_passenger"] = (X[:,"fare_amount"]/X[:,"passenger_count"]).round(2)
        return X
        
costPer_adder = calculateCostPer(find_distance=False)
data_new = distance_adder.transform(data)

data_new

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,cost_per_passenger
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,7.50
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1,7.70
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1,12.90
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,1.77
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,3.20
...,...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1,3.00
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1,7.50
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2,15.45
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1,14.50


In [156]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', calculateCostPer()),
        ('std_scaler', StandardScaler()),
    ])

data_num_tr = num_pipeline.fit_transform(data_num)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = list(data_num)
cat_attribs = ["key","pickup_datetime"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

### Calculating Distance

In [130]:
# Was meant for transformer, but couldn't get it to work
from math import radians, cos, sin, asin, sqrt
def haversine(data):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    lon1 = data['pickup_longitude']
    lat1 = data['pickup_latitude']
    lon2 = data['dropoff_longitude']
    lat2 = data['dropoff_latitude']
    
    
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

#test = haversine(data.at[2, 'pickup_latitude'], data.at[2, 'pickup_longitude'], data.at[2, 'dropoff_latitude'], data.at[2, 'dropoff_longitude'])
data['distance'] = data.apply(lambda data: haversine(data), axis=1)

data


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,1.683323
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1,2.457590
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1,5.036377
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,1.661683
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,4.475450
...,...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1,0.112210
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1,1.875050
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2,12.850319
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1,3.539715
