# Pyber Challenge

### 4.3 Loading and Reading CSV files

In [28]:
# Add Matplotlib inline magic command
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as sts

# File to Load (Remember to change these)
city_data_to_load = "Resources/city_data.csv"
ride_data_to_load = "Resources/ride_data.csv"

# Read the City and Ride Data
city_data_df = pd.read_csv(city_data_to_load)
ride_data_df = pd.read_csv(ride_data_to_load)

### Merge the DataFrames

In [2]:
# Combine the data into a single dataset
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on=["city", "city"])

# Display the data table for preview
pyber_data_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban


In [3]:
urban_cities_df = pyber_data_df[pyber_data_df["type"] =="Urban"]
suburban_cities_df = pyber_data_df[pyber_data_df["type"] == "Suburban"]
rural_cities_df = pyber_data_df[pyber_data_df["type"] == "Rural"]

## Deliverable 1: Get a Summary DataFrame 

In [4]:
#  1. Get the total rides for each city type
urban_ride_count = urban_cities_df.groupby(["city"]).count()["ride_id"]
urban_ride_count.head()


city
Amandaburgh        18
Barajasview        22
Carriemouth        27
Christopherfurt    27
Deanville          19
Name: ride_id, dtype: int64

In [5]:
suburban_ride_count = suburban_cities_df.groupby(["city"]).count()["ride_id"]
suburban_ride_count.head()

city
Barronchester    16
Bethanyland      18
Brandonfort      19
Colemanland      22
Davidfurt        17
Name: ride_id, dtype: int64

In [6]:
rural_ride_count = rural_cities_df.groupby(["city"]).count()["ride_id"]
rural_ride_count.head()

city
Bradshawfurt      10
Garzaport          3
Harringtonfort     6
Jessicaport        6
Lake Jamie         6
Name: ride_id, dtype: int64

In [7]:
# 2. Get the total drivers for each city type
urban_driver_sum = urban_cities_df.groupby(["city"]).sum()["driver_count"]
urban_driver_sum.head()

city
Amandaburgh         216
Barajasview         572
Carriemouth        1404
Christopherfurt    1107
Deanville           931
Name: driver_count, dtype: int64

In [8]:
suburban_driver_sum = suburban_cities_df.groupby(["city"]).sum()["driver_count"]
suburban_driver_sum.head()

city
Barronchester    176
Bethanyland      396
Brandonfort      190
Colemanland      506
Davidfurt        391
Name: driver_count, dtype: int64

In [9]:
rural_driver_sum = rural_cities_df.groupby(["city"]).sum()["driver_count"]
rural_driver_sum.head()

city
Bradshawfurt      70
Garzaport         21
Harringtonfort    24
Jessicaport        6
Lake Jamie        24
Name: driver_count, dtype: int64

In [10]:
#  3. Get the total amount of fares for each city type
urban_sum_fare = urban_cities_df.groupby(["city"]).sum()["fare"]
urban_sum_fare.head()

city
Amandaburgh        443.55
Barajasview        557.31
Carriemouth        764.49
Christopherfurt    661.55
Deanville          491.01
Name: fare, dtype: float64

In [11]:
suburban_sum_fare = suburban_cities_df.groupby(["city"]).sum()["fare"]
suburban_sum_fare.head()

city
Barronchester    582.76
Bethanyland      593.21
Brandonfort      673.31
Colemanland      679.68
Davidfurt        543.93
Name: fare, dtype: float64

In [12]:
rural_sum_fare = rural_cities_df.groupby(["city"]).sum()["fare"]
rural_sum_fare.head()

city
Bradshawfurt      400.64
Garzaport          72.37
Harringtonfort    200.82
Jessicaport       216.08
Lake Jamie        206.15
Name: fare, dtype: float64

In [13]:
#  4. Get the average fare per ride for each city type. 
urban_ride_avg = urban_cities_df.groupby(["city"]).mean()["ride_id"]
urban_ride_avg.head()

city
Amandaburgh        5.359336e+12
Barajasview        4.676968e+12
Carriemouth        5.184855e+12
Christopherfurt    4.582307e+12
Deanville          4.901508e+12
Name: ride_id, dtype: float64

In [14]:
suburban_ride_avg = suburban_cities_df.groupby(["city"]).mean()["ride_id"]
suburban_ride_avg.head()

city
Barronchester    4.460796e+12
Bethanyland      5.842550e+12
Brandonfort      5.429539e+12
Colemanland      5.155714e+12
Davidfurt        4.575051e+12
Name: ride_id, dtype: float64

In [15]:
rural_ride_avg = rural_cities_df.groupby(["city"]).mean()["ride_id"]
rural_ride_avg.head()

city
Bradshawfurt      3.735294e+12
Garzaport         5.949150e+12
Harringtonfort    5.726406e+12
Jessicaport       3.622801e+12
Lake Jamie        3.558885e+12
Name: ride_id, dtype: float64

In [16]:
# 5. Get the average fare per driver for each city type. 
urban_avg_fare = urban_cities_df.groupby(["city"]).mean()["fare"]
urban_avg_fare.head()

city
Amandaburgh        24.641667
Barajasview        25.332273
Carriemouth        28.314444
Christopherfurt    24.501852
Deanville          25.842632
Name: fare, dtype: float64

In [17]:
suburban_avg_fare = suburban_cities_df.groupby(["city"]).mean()["fare"]
suburban_avg_fare.head()

city
Barronchester    36.422500
Bethanyland      32.956111
Brandonfort      35.437368
Colemanland      30.894545
Davidfurt        31.995882
Name: fare, dtype: float64

In [18]:
rural_avg_fare = rural_cities_df.groupby(["city"]).mean()["fare"]
rural_avg_fare.head()

city
Bradshawfurt      40.064000
Garzaport         24.123333
Harringtonfort    33.470000
Jessicaport       36.013333
Lake Jamie        34.358333
Name: fare, dtype: float64

In [19]:
#city summaries
urban_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,1625.0,1625.0,1625.0
mean,24.525772,4873485000000.0,36.678154
std,11.738649,2907440000000.0,20.075545
min,4.05,14588100000.0,3.0
25%,14.55,2400244000000.0,22.0
50%,24.64,4711188000000.0,37.0
75%,34.58,7451579000000.0,52.0
max,44.97,9991538000000.0,73.0


In [20]:
suburban_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,625.0,625.0,625.0
mean,30.970128,4971015000000.0,13.712
std,10.677508,2912410000000.0,8.042372
min,12.05,321833700.0,1.0
25%,21.97,2364253000000.0,5.0
50%,30.75,5053221000000.0,16.0
75%,39.83,7551674000000.0,21.0
max,49.96,9917744000000.0,25.0


In [21]:
rural_cities_df.describe()

Unnamed: 0,fare,ride_id,driver_count
count,125.0,125.0,125.0
mean,34.62344,4647746000000.0,4.296
std,14.558046,2887834000000.0,2.691156
min,10.11,26848730000.0,1.0
25%,19.9,2275364000000.0,1.0
50%,37.05,4023962000000.0,4.0
75%,47.07,7118047000000.0,7.0
max,58.55,9990581000000.0,9.0


In [22]:
import numpy as np

In [23]:
#  6. Create a PyBer summary DataFrame. 

pyber_summary_df = pd.DataFrame({
    ["Urban": urban_cities_df,
     "Suburban": suburban_cities_df,
     "Rural": rural_cities_df]})

pyber_summary_df.head()

SyntaxError: invalid syntax (Temp/ipykernel_10016/3113205001.py, line 4)

In [24]:
#  7. Cleaning up the DataFrame. Delete the index name
pyber_summary_df.index.name = None

NameError: name 'pyber_summary_df' is not defined

In [10]:
#  8. Format the columns.


## Deliverable 2.  Create a multiple line plot that shows the total weekly of the fares for each type of city.

In [26]:
# 1. Read the merged DataFrame


In [12]:
# 2. Using groupby() to create a new DataFrame showing the sum of the fares 
#  for each date where the indices are the city type and date.


In [13]:
# 3. Reset the index on the DataFrame you created in #1. This is needed to use the 'pivot()' function.
# df = df.reset_index()


In [14]:
# 4. Create a pivot table with the 'date' as the index, the columns ='type', and values='fare' 
# to get the total fares for each type of city by the date. 


In [15]:
# 5. Create a new DataFrame from the pivot table DataFrame using loc on the given dates, '2019-01-01':'2019-04-29'.



In [16]:
# 6. Set the "date" index to datetime datatype. This is necessary to use the resample() method in Step 8.
# df.index = pd.to_datetime(df.index)

In [17]:
# 7. Check that the datatype for the index is datetime using df.info()


In [18]:
# 8. Create a new DataFrame using the "resample()" function by week 'W' and get the sum of the fares for each week.


In [19]:
# 8. Using the object-oriented interface method, plot the resample DataFrame using the df.plot() function. 

# Import the style from Matplotlib.
from matplotlib import style
# Use the graph style fivethirtyeight.
style.use('fivethirtyeight')

