### Setup

In [110]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [111]:
DATA_PATH = "uber.csv"

### Loading In Data

In [112]:
def load_uber_data(data_path=DATA_PATH):
    return pd.read_csv(data_path)

### Cleaning Data

In [113]:
data = load_uber_data()

In [114]:
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [115]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [116]:
imputer = SimpleImputer(strategy="median")
data_num = data.drop("key", axis=1)
data_num = data_num.drop("pickup_datetime", axis=1)
imputer.fit(data_num)

SimpleImputer(strategy='median')

In [117]:
X = imputer.transform(data_num)
data_clean = pd.DataFrame(X, columns=data_num.columns,
                          index=data_num.index)
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  float64
 1   fare_amount        200000 non-null  float64
 2   pickup_longitude   200000 non-null  float64
 3   pickup_latitude    200000 non-null  float64
 4   dropoff_longitude  200000 non-null  float64
 5   dropoff_latitude   200000 non-null  float64
 6   passenger_count    200000 non-null  float64
dtypes: float64(7)
memory usage: 10.7 MB


### Ordinal Encoder 

In [118]:
data_cat = data[["key","pickup_datetime"]]
data_cat.head(10)

Unnamed: 0,key,pickup_datetime
0,2015-05-07 19:52:06.0000003,2015-05-07 19:52:06 UTC
1,2009-07-17 20:04:56.0000002,2009-07-17 20:04:56 UTC
2,2009-08-24 21:45:00.00000061,2009-08-24 21:45:00 UTC
3,2009-06-26 08:22:21.0000001,2009-06-26 08:22:21 UTC
4,2014-08-28 17:47:00.000000188,2014-08-28 17:47:00 UTC
5,2011-02-12 02:27:09.0000006,2011-02-12 02:27:09 UTC
6,2014-10-12 07:04:00.0000002,2014-10-12 07:04:00 UTC
7,2012-12-11 13:52:00.00000029,2012-12-11 13:52:00 UTC
8,2012-02-17 09:32:00.00000043,2012-02-17 09:32:00 UTC
9,2012-03-29 19:06:00.000000273,2012-03-29 19:06:00 UTC


In [119]:
ordinal_encoder = OrdinalEncoder()
data_cat_encoded = ordinal_encoder.fit_transform(data_cat)
data_cat_encoded[:10]

array([[195992., 192625.],
       [ 16590.,  16295.],
       [ 19677.,  19335.],
       [ 14916.,  14649.],
       [176171., 172982.],
       [ 64132.,  62968.],
       [179789., 176538.],
       [123310., 121114.],
       [ 96971.,  95232.],
       [100726.,  98922.]])

In [120]:
ordinal_encoder.categories_

[array(['2009-01-01 01:15:22.0000006', '2009-01-01 01:59:17.0000001',
        '2009-01-01 02:05:03.0000003', ..., '2015-06-30 23:31:06.0000002',
        '2015-06-30 23:33:33.0000002', '2015-06-30 23:40:39.0000001'],
       dtype=object),
 array(['2009-01-01 01:15:22 UTC', '2009-01-01 01:59:17 UTC',
        '2009-01-01 02:05:03 UTC', ..., '2015-06-30 23:31:06 UTC',
        '2015-06-30 23:33:33 UTC', '2015-06-30 23:40:39 UTC'], dtype=object)]

### One Hot Encoder

In [121]:
cat_encoder = OneHotEncoder()
data_cat_1hot = cat_encoder.fit_transform(data_cat)
data_cat_1hot

<200000x396629 sparse matrix of type '<class 'numpy.float64'>'
	with 400000 stored elements in Compressed Sparse Row format>

In [122]:
data_cat_1hot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [123]:
cat_encoder.categories_

[array(['2009-01-01 01:15:22.0000006', '2009-01-01 01:59:17.0000001',
        '2009-01-01 02:05:03.0000003', ..., '2015-06-30 23:31:06.0000002',
        '2015-06-30 23:33:33.0000002', '2015-06-30 23:40:39.0000001'],
       dtype=object),
 array(['2009-01-01 01:15:22 UTC', '2009-01-01 01:59:17 UTC',
        '2009-01-01 02:05:03 UTC', ..., '2015-06-30 23:31:06 UTC',
        '2015-06-30 23:33:33 UTC', '2015-06-30 23:40:39 UTC'], dtype=object)]