# Exploratory Data Analysis

### Import Libraries

In [33]:
import pandas as pd
import numpy as np
import folium

The dataset contains the following fields:

- key - a unique identifier for each trip
- fare_amount - the cost of each trip in USD
- pickup_datetime - date and time when the meter was engaged
- passenger_count - the number of passengers in the vehicle (driver-entered value)
- pickup_longitude - the longitude where the meter was engaged
- pickup_latitude - the latitude where the meter was engaged
- dropoff_longitude - the longitude where the meter was disengaged
- dropoff_latitude - the latitude where the meter was disengaged

In [13]:
def unique_values(dataframe):
    """Count the distinct values in every column of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataframe`` (indexed by the column label) and
        a single ``"count"`` column. A missing value counts as one distinct
        value because ``Series.unique`` reports it. The counts are stored as
        strings, exactly as before; cast with ``.astype(int)`` when they need
        to be sorted or used in arithmetic.
    """
    # Index with the real column label. Wrapping it in str() raised KeyError
    # for any non-string label (for example the default integer labels 0, 1).
    counts = {
        column: str(len(dataframe[column].unique()))
        for column in dataframe.columns.tolist()
    }
    return pd.DataFrame.from_dict(counts, orient="index", columns=["count"])

def null_values(dataframe):
    """Summarise the missing values in each column of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that contains at least one null, indexed by the
        column label, with two float columns: ``"absolute"`` (number of nulls)
        and ``"percentage"`` (nulls as a percentage of all rows). Columns
        without nulls are omitted, so the result is empty when nothing is
        missing.
    """
    total_rows = len(dataframe)
    summary = {}
    for column in dataframe.columns.tolist():
        # Index with the real label (str(column) raised KeyError for
        # non-string labels) and count the nulls only once per column.
        missing = dataframe[column].isnull().sum()
        if missing > 0:
            # missing > 0 implies total_rows > 0, so the division is safe.
            summary[column] = np.array(
                [missing, missing / total_rows * 100], dtype=float
            )
    return pd.DataFrame.from_dict(
        summary, orient="index", columns=["absolute", "percentage"]
    )

def data_types(df):
    """Return how many columns of ``df`` have each dtype.

    The result is a one-column DataFrame named ``"count"``, indexed by dtype.
    """
    columns_per_dtype = df.dtypes.value_counts()
    return columns_per_dtype.to_frame(name="count")

In [12]:
# Load the raw trips and discard the "Unnamed: 0" and "key" columns, which are
# not relevant to any prediction.
df = pd.read_csv("uber.csv").drop(columns=["Unnamed: 0", "key"])
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [29]:
# Parse the pick-up timestamps, then derive one column per date/time component.
df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])

# The vectorised `.dt` accessor replaces six row-by-row `.apply(lambda ...)`
# passes over the column.
# NOTE(review): on pandas >= 2.0 the `.dt` fields are believed to be int32,
# whereas the `.apply` version produced int64 - confirm if a later step
# depends on the exact dtype.
df["year"] = df["pickup_datetime"].dt.year
df["month"] = df["pickup_datetime"].dt.month
df["day"] = df["pickup_datetime"].dt.day
df["hour"] = df["pickup_datetime"].dt.hour
df["minute"] = df["pickup_datetime"].dt.minute
df["second"] = df["pickup_datetime"].dt.second

In [30]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,hour,minute,second
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,19,52,6
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2009,7,17,20,4,56
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,2009,8,24,21,45,0
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,8,22,21
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,17,47,0


In [31]:
# Show how many columns there are of each dtype.
data_types(df=df)

Unnamed: 0,count
int64,7
float64,5
"datetime64[ns, UTC]",1


In [10]:
# Show the number of distinct values per column.
# NOTE(review): the saved output of this cell lists only the seven original
# columns, not the year/month/day/hour/minute/second columns derived earlier in
# the notebook, so it appears to come from an earlier run - re-run to refresh.
unique_values(dataframe=df)

Unnamed: 0,count
fare_amount,1244
pickup_datetime,196629
pickup_longitude,71066
pickup_latitude,83835
dropoff_longitude,76895
dropoff_latitude,90586
passenger_count,8


In [51]:
# Pick-up coordinates (longitude, latitude) of the row labelled 0.
# A single .loc[row, columns] lookup avoids the chained df.loc[0][...] form,
# which first materialises the whole mixed-dtype row as an object Series.
test = df.loc[0, ["pickup_longitude", "pickup_latitude"]]

In [11]:
# List the columns that contain missing values, with absolute and percentage counts.
null_values(dataframe=df)

Unnamed: 0,absolute,percentage
dropoff_longitude,1.0,0.0005
dropoff_latitude,1.0,0.0005


In [59]:
# [latitude, longitude] of the pick-up point of the row labelled 0, in the
# order the map cell passes to folium.
# A single .loc[row, columns] lookup avoids the chained df.loc[0][...] form,
# which first materialises the whole mixed-dtype row as an object Series.
location = df.loc[0, ["pickup_latitude", "pickup_longitude"]].tolist()

In [66]:
# Draw a map centred on the selected pick-up point and mark that point.
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of
# the kernel session; consider renaming it (e.g. pickup_map) once it is
# confirmed that no other cell relies on this name.
map = folium.Map(zoom_start=12, location=location)

tooltip = "Click me!"

folium.Marker(
    location=location,
    tooltip=tooltip,
    popup="<i>Pick-up Location</i>",
).add_to(map)

# The last expression renders the map as the cell output.
map