# 1. Import Data into Pandas DataFrames

In [9]:
# Dependencies and setup
import matplotlib.pyplot as plt
import pandas as pd

In [10]:
# Relative paths of the two input CSV files (city data first, ride data second)
city_data_to_load, ride_data_to_load = (
    "Resources/city_data.csv",
    "Resources/ride_data.csv",
)

In [11]:
# Read the city CSV into a DataFrame, then show it as the cell output
city_data_df = pd.read_csv(filepath_or_buffer=city_data_to_load)
city_data_df

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban
...,...,...,...
115,Bradshawfurt,7,Rural
116,New Ryantown,2,Rural
117,Randallchester,9,Rural
118,Jessicaport,1,Rural


In [14]:
# Read the ride CSV into a DataFrame, then show it as the cell output
ride_data_df = pd.read_csv(filepath_or_buffer=ride_data_to_load)
ride_data_df

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344
...,...,...,...,...
2370,Michaelberg,2019-04-29 17:04:39,13.38,8550365057598
2371,Lake Latoyabury,2019-01-30 00:05:47,20.76,9018727594352
2372,North Jaime,2019-02-10 21:03:50,11.11,2781339863778
2373,West Heather,2019-05-07 19:22:15,44.94,4256853490277


# 2. Exploring Data

## Inspecting City Data DataFrame

In [15]:
# 1. Per-column tally of entries that are present (not missing)
city_data_df.notna().sum()

city            120
driver_count    120
type            120
dtype: int64

In [17]:
# Per-column tally of entries that are missing
city_data_df.isna().sum()

city            0
driver_count    0
type            0
dtype: int64

In [18]:
# 2. Get the data types of each column in the DataFrame
# (the result is a Series indexed by column name, one dtype per column)
city_data_df.dtypes

city            object
driver_count     int64
type            object
dtype: object

In [22]:
# Collect the distinct values found in the "type" column as a Python set.
# (city_data_df["type"].unique() is an alternative that yields the same values as an array.)
city_type = {*city_data_df["type"]}
city_type

{'Rural', 'Suburban', 'Urban'}

In [29]:
# 3. Number of data points (cities) in each city type:
# group by type, keep only the "city" column, and count its entries
city_data_df.groupby(["type"])["city"].count()

type
Rural       18
Suburban    36
Urban       66
Name: city, dtype: int64

## Inspecting Ride Data DataFrame

In [30]:
# 1. Per-column tally of entries that are present (not missing)
ride_data_df.notna().sum()

city       2375
date       2375
fare       2375
ride_id    2375
dtype: int64

In [31]:
# Per-column tally of entries that are missing
ride_data_df.isna().sum()

city       0
date       0
fare       0
ride_id    0
dtype: int64

In [33]:
# 2. Get the data types of each column in the DataFrame
# (the result is a Series indexed by column name, one dtype per column)
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id      int64
dtype: object

In [None]:
# 3. Get the number of data points (rides) for each city type
# ride_data_df has no "type" column of its own, so the city type is looked up
# from city_data_df by city name before grouping. The left merge keeps every ride;
# rides whose city has no match get a missing type and are left out of the group counts.
(
    ride_data_df
    .merge(city_data_df[["city", "type"]], how="left", on="city")
    .groupby("type")["ride_id"]
    .count()
)

## Checking Unique ID Values in Both DataFrames
Before merging, it is important to make sure that the common variable on which the merge is performed uses the same value for the same observation in both datasets. That is, we need to make sure that the city names are identical in both datasets, with no misspellings or alternative spellings of the same city.

In [37]:
# Build the set of distinct city names found in each DataFrame
# (city data first, ride data second)
city_data_unique_id, ride_data_unique_id = (
    set(frame["city"]) for frame in (city_data_df, ride_data_df)
)

In [38]:
# Cities listed in the city data that never appear in the ride data
city_data_unique_id.difference(ride_data_unique_id)

set()

In [39]:
# Cities appearing in the ride data that are not listed in the city data
ride_data_unique_id.difference(city_data_unique_id)

set()

# 3. Merging DataFrames
After making sure that the data in both datasets is clean and the unique merge id variables match, we can proceed with the merge. 

In [43]:
# Merge the data into a single dataset.
# A left merge keeps every row of the ride data, even for a city that
# (hypothetically) does not exist in the city data; such rows would get
# missing values in the columns coming from city_data_df.
# validate="many_to_one" makes pandas raise a MergeError if a city appears more
# than once in city_data_df, which would otherwise silently duplicate rides.
pyber_data_df = pd.merge(
    ride_data_df,
    city_data_df,
    how="left",
    on="city",
    validate="many_to_one",
)
pyber_data_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban
