# Exploratory Data Analysis

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster
from math import sin, cos, sqrt, atan2, radians
from folium.plugins import MousePosition

The dataset contains the following fields:

- key - a unique identifier for each trip
- fare_amount - the cost of each trip in USD
- pickup_datetime - date and time when the meter was engaged
- passenger_count - the number of passengers in the vehicle (driver-entered value)
- pickup_longitude - the longitude where the meter was engaged
- pickup_latitude - the latitude where the meter was engaged
- dropoff_longitude - the longitude where the meter was disengaged
- dropoff_latitude - the latitude where the meter was disengaged

In [None]:
def unique_values(dataframe):
    """Count the distinct values in every column of *dataframe*.

    Missing values are counted as a distinct value, because the count is the
    length of ``Series.unique()``.

    Args:
        dataframe: The pandas DataFrame to summarise.

    Returns:
        A DataFrame indexed by column label with a single ``count`` column.
        The counts are stored as strings so that existing callers keep
        receiving the same output as before.
    """
    # Look each column up by its own label: stringifying the label first
    # raises KeyError for non-string labels such as integer column names.
    counts = {
        column: str(len(dataframe[column].unique()))
        for column in dataframe.columns.tolist()
    }
    return pd.DataFrame.from_dict(counts, orient="index", columns=["count"])

def null_values(dataframe):
    """Summarise the columns of *dataframe* that contain missing values.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by column label and restricted to columns with at
        least one null. ``absolute`` holds the number of nulls and
        ``percentage`` holds the nulls as a percentage of the row count; both
        are stored as floats. The frame has no rows when nothing is missing.
    """
    total_rows = len(dataframe)
    summary = {}
    for column in dataframe.columns.tolist():
        # Look up by the label itself (str(label) fails for non-string
        # labels) and count the nulls only once per column.
        missing = dataframe[column].isnull().sum()
        if missing > 0:
            summary[column] = np.array(
                [missing, missing / total_rows * 100], dtype=float
            )
    return pd.DataFrame.from_dict(
        summary, orient="index", columns=["absolute", "percentage"]
    )

def data_types(df):
    """Tally how many columns of *df* share each dtype.

    Returns a DataFrame indexed by dtype with a single ``count`` column.
    """
    dtype_tally = df.dtypes.value_counts()
    return pd.DataFrame(dtype_tally, columns=["count"])

def calculate_distance(lat1, lon1, lat2, lon2, radius=6373.0):
    """Return the great-circle (haversine) distance between two points.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.
        radius: Radius of the sphere; the result is expressed in the same
            unit. Defaults to 6373.0, the approximate radius of the earth
            in km.

    Returns:
        The distance along the sphere's surface as a float.
    """
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    dlat = lat2 - lat1
    dlon = radians(lon2) - radians(lon1)

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise a math
    # domain error; clamp it to the valid range first.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius * c

In [None]:
# Load the raw trips and discard "Unnamed: 0" and "key", which are not
# relevant to any prediction.
df = pd.read_csv("uber.csv").drop(columns=["Unnamed: 0", "key"])
df.head()

In [None]:
# Parse the pickup timestamp once, then derive the calendar/time features
# with the vectorised .dt accessor instead of calling a Python lambda per row.
df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])
df["year"] = df["pickup_datetime"].dt.year
df["month"] = df["pickup_datetime"].dt.month
df["day"] = df["pickup_datetime"].dt.day
df["hour"] = df["pickup_datetime"].dt.hour
df["minute"] = df["pickup_datetime"].dt.minute
df["second"] = df["pickup_datetime"].dt.second

In [None]:
# Number of rows currently in df.
len(df)

In [None]:
# How many columns of df have each dtype.
data_types(df)

In [None]:
# Number of distinct values in every column of df.
unique_values(df)

In [None]:
# Columns of df that contain nulls: absolute count and percentage of rows.
null_values(df)

In [None]:
# Centre the map on the pickup point of the row labelled 0. A plain
# [lat, lon] list avoids handing a label-indexed Series to consumers that
# read the coordinates by integer position.
start_location = df.loc[0, ["pickup_latitude", "pickup_longitude"]].tolist()

latitudes = df.pickup_latitude.tolist()
longitudes = df.pickup_longitude.tolist()

# Materialise the pairs: a bare zip() is a one-shot iterator, so iterating it
# a second time (e.g. when a cell is re-run) would silently yield nothing.
locations = list(zip(latitudes, longitudes))

In [None]:
# Base map centred on start_location, plus an empty cluster layer that the
# pickup markers are added to afterwards.
map = folium.Map(zoom_start=20, location=start_location)
marker_cluster = MarkerCluster()

In [None]:
#for marker in locations:
    #Coordinates = [marker[0],marker[1]]
    #print(Coordinates)

In [None]:
# Pin every pickup point into the shared cluster layer.
for latitude, longitude in locations:
    folium.Marker([latitude, longitude]).add_to(marker_cluster)

# Cursor read-out shown in the top-right corner of the map; the same
# JavaScript formatter is applied to latitude and longitude.
formatter = "function(num) {return L.Util.formatNum(num, 5);};"
mouse_position = MousePosition(
    position="topright",
    prefix="Lat:",
    separator=" Long: ",
    lng_first=False,
    num_digits=20,
    empty_string="NaN",
    lat_formatter=formatter,
    lng_formatter=formatter,
)

# Attach both layers (mouse position first, then the markers) and render.
mouse_position.add_to(map)
marker_cluster.add_to(map)
map

### From the above map:
- It is likely that the above dataset primarily consists of Uber rides in NYC
- Several pick-up locations are implausible (e.g. in the ocean) and are therefore likely data errors
- Only Uber rides that start in NYC and have a plausible destination shall be considered for this project

In [None]:
# Bounding box, in degrees, that pickup coordinates are later compared against.
min_latitude, max_latitude = 40.50, 41.20
min_longitude, max_longitude = -74.55, -73.05

In [None]:
# Keep only the trips whose pickup point lies strictly inside the bounding box.
pickup_lat = df["pickup_latitude"]
pickup_lon = df["pickup_longitude"]
lat_in_box = (pickup_lat > min_latitude) & (pickup_lat < max_latitude)
lon_in_box = (pickup_lon > min_longitude) & (pickup_lon < max_longitude)
df1 = df[lat_in_box & lon_in_box]

In [None]:
# Display df1, the rows of df whose pickup point passed the bounding-box filter.
df1

In [None]:
# Centre the map on the first *filtered* pickup. Row label 0 of the
# unfiltered df is not guaranteed to lie inside the bounding box, and a plain
# [lat, lon] list avoids passing a label-indexed Series around.
start_location = df1[["pickup_latitude", "pickup_longitude"]].iloc[0].tolist()

latitudes = df1.pickup_latitude.tolist()
longitudes = df1.pickup_longitude.tolist()

# Materialise the pairs: a bare zip() is a one-shot iterator, so iterating it
# a second time (e.g. when a cell is re-run) would silently yield nothing.
locations = list(zip(latitudes, longitudes))

In [None]:
# Fresh base map centred on start_location, plus a new, empty cluster layer
# for the markers.
map = folium.Map(zoom_start=20, location=start_location)
marker_cluster = MarkerCluster()

In [None]:
# Pin every remaining pickup point into the cluster layer.
for latitude, longitude in locations:
    folium.Marker([latitude, longitude]).add_to(marker_cluster)

# Cursor read-out shown in the top-right corner of the map; the same
# JavaScript formatter is applied to latitude and longitude.
formatter = "function(num) {return L.Util.formatNum(num, 5);};"
mouse_position = MousePosition(
    position="topright",
    prefix="Lat:",
    separator=" Long: ",
    lng_first=False,
    num_digits=20,
    empty_string="NaN",
    lat_formatter=formatter,
    lng_formatter=formatter,
)

# Attach both layers (mouse position first, then the markers) and render.
mouse_position.add_to(map)
marker_cluster.add_to(map)
map