# 5.2.3 Load the CSV files

### Import the Pandas and Matplotlib libraries with the Pyplot module

In [46]:
# Add Matplotlib inline magic command
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd

### Declare variables that connect to the CSV files in the Resources folder:

In [60]:
# Relative paths to the two source CSV files in the Resources folder.
city_data_to_load, ride_data_to_load = (
    "Resources/city_data.csv",
    "Resources/ride_data.csv",
)

In [61]:
# Load the city CSV into a pandas DataFrame, then preview its first 10 rows.
city_data_df = pd.read_csv(filepath_or_buffer=city_data_to_load)
city_data_df.head(n=10)

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban
5,West Anthony,70,Urban
6,West Angela,48,Urban
7,Martinezhaven,25,Urban
8,Karenberg,22,Urban
9,Barajasview,26,Urban


In [62]:
# Use the os module to inspect the working directory and to build file paths
# that work on any operating system.
import os
# The data files are referenced relative to the notebook's working directory
# (the PyBer_Analysis project folder), under Resources/.

In [63]:
# Get the current working directory; relative paths such as "Resources/..."
# are resolved against it.
cwd = os.getcwd()
cwd

'/Users/Fabiola/Desktop/Data/week5MathPlotLib/class/PyBer_Analysis'

In [64]:
# Build each CSV path with os.path.join (so the separator matches the host OS)
# and read the file into its own DataFrame.
city_data_to_load = os.path.join("Resources", "city_data.csv")
city_data_df = pd.read_csv(city_data_to_load)

ride_data_to_load = os.path.join("Resources", "ride_data.csv")
ride_data_df = pd.read_csv(ride_data_to_load)

In [67]:
# Preview the first 10 rows of the city data DataFrame (already loaded above).
city_data_df.head(10)

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban
5,West Anthony,70,Urban
6,West Angela,48,Urban
7,Martinezhaven,25,Urban
8,Karenberg,22,Urban
9,Barajasview,26,Urban


In [68]:
# Preview the first 10 rows of the ride data DataFrame (already loaded above).
ride_data_df.head(10)

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344
5,South Latoya,2019-03-11 12:26:48,9.52,1994999424437
6,New Paulville,2019-02-27 11:17:56,43.25,793208410091
7,Simpsonburgh,2019-04-26 00:43:24,35.98,111953927754
8,South Karenland,2019-01-08 03:28:48,35.09,7995623208694
9,North Jasmine,2019-03-09 06:26:29,42.81,5327642267789


# 5.2.4 Explore the Data in Pandas

### Inspect the City Data DataFrame
For the city_data_df DataFrame, we need to:

 * Get all the rows that contain null values.
 * Make sure the driver_count column has an integer data type.
 * Find out how many data points there are for each type of city.


In [72]:
# List the column names with the number of non-null entries in each column.
city_data_df.count()

city            120
driver_count    120
type            120
dtype: int64

In [71]:
# Cross-check from the other direction: count the null values in each column.
# A zero for every column means no data is missing.
city_data_df.isna().sum()

city            0
driver_count    0
type            0
dtype: int64

In [73]:
# Get the data type of each column; driver_count needs to be an integer type.
city_data_df.dtypes

city            object
driver_count     int64
type            object
dtype: object

In [74]:
# Finally, check how many data points there are for each type of city.
# Start by listing the distinct city types; the cells that follow count the
# rows that match each type.

# Get the unique values of the type of city.
city_data_df["type"].unique()

array(['Urban', 'Suburban', 'Rural'], dtype=object)

In [75]:
# Get the number of data points from the Urban cities.
# Comparing the column to "Urban" yields a boolean Series; Series.sum() counts
# its True values in one vectorized step instead of iterating element by
# element with the built-in sum().
(city_data_df["type"] == "Urban").sum()

66

In [76]:
# Get the number of data points from the Suburban cities.
# Comparing the column to "Suburban" yields a boolean Series; Series.sum()
# counts its True values in one vectorized step instead of iterating element
# by element with the built-in sum().
(city_data_df["type"] == "Suburban").sum()

36

In [78]:
# Get the number of data points from the Rural cities.
# Comparing the column to "Rural" yields a boolean Series; Series.sum() counts
# its True values in one vectorized step instead of iterating element by
# element with the built-in sum().
(city_data_df["type"] == "Rural").sum()

18

### Inspect the Ride Data DataFrame

For the ride_data_df DataFrame, we need to:
1. Get all the rows that contain null values.
2. Make sure the fare and ride_id columns are numerical data types.

In [79]:
# List the column names with the number of non-null entries in each column.
ride_data_df.count()

city       2375
date       2375
fare       2375
ride_id    2375
dtype: int64

In [80]:
# Count the null values in each column; a zero for every column means no data
# is missing.
ride_data_df.isna().sum()

city       0
date       0
fare       0
ride_id    0
dtype: int64

In [81]:
# Get the data type of each column. fare and ride_id must be numeric so that
# we can perform mathematical operations on them.
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id      int64
dtype: object

# Merge DataFrames

### Before we merge the DataFrames, let's review each DataFrame.
The columns in the city_data_df DataFrame are:

city

driver_count

type

The columns in the ride_data_df are:

city

date

fare

ride_id

In [83]:
# When we merge two DataFrames, we merge on a column that has the same name and holds the same data
# in both DataFrames.
# The how= parameter is "left", "right", "inner", or "outer", depending on how we want to merge the DataFrames.

In [84]:
# Combine the data into a single dataset.
# A left merge keeps every row of ride_data_df and attaches the driver_count
# and type of the matching city from city_data_df. The shared join key "city"
# only needs to be named once.
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on="city")

# Display the first five rows of the merged DataFrame.
pyber_data_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban


# 5.2.5 Commit Your Code