# *Python: Advanced DataFrames*

## Pandas DataFrames Review

### Import the pandas module as `pd`

In [1]:
import pandas as pd

### Read and View data

In [2]:
pizza_df = pd.read_csv('Data/pizza_with_borough.csv')

In [3]:
pizza_df.columns

Index(['Name', 'location_lat', 'location_lng', 'Date',
       'Date Expanded (times in EST)', 'Year', 'Price as number', 'Price',
       'Style', 'borough', 'street', 'city', 'state', 'country'],
      dtype='object')

In [4]:
pizza_df.describe()

Unnamed: 0,location_lat,location_lng,Year,Price as number
count,466.0,466.0,466.0,466.0
mean,40.718921,-73.94304,2017.373391,2.685343
std,0.063688,0.076291,3.0136,0.803061
min,40.525698,-74.201928,2014.0,1.0
25%,40.686295,-73.989927,2015.0,2.25
50%,40.718706,-73.96431,2016.0,2.75
75%,40.754684,-73.908135,2021.0,3.0
max,40.903661,-73.69983,2022.0,6.53


In [6]:
pizza_df.head(3)

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country
0,Angelos Pizza,40.623254,-73.937922,2022-1014,"Oct 14th 2022, 5:57:51 pm",2022,3.0,$3.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States
1,Ozone Pizzeria,40.680892,-73.842631,2022-1008,"Oct 8th 2022, 6:48:59 pm",2022,3.0,$3.00,Plain,Queens,Liberty Avenue,City of New York,New York,United States
2,Pino Pizza,40.600015,-73.999455,2022-1003,"Oct 3rd 2022, 5:47:23 pm",2022,2.75,$2.75,Plain,Brooklyn,21st Avenue,City of New York,New York,United States


In [7]:
pizza_df.tail(3)

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country
463,Alphonso's Pizzeria & Trattoria - Pizza Shack,40.714338,-73.981708,2014-0810,"Aug 10th 2014, 3:03:13 pm",2014,2.25,$2.25,Plain,Manhattan,Grand Street,City of New York,New York,United States
464,Frank's Pizza,40.61772,-73.93174,2014-0809,"Aug 9th 2014, 6:17:39 pm",2014,2.55,$2.55,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States
465,Bona Pizza,40.640977,-73.95627,2014-0809,"Aug 9th 2014, 1:31:51 pm",2014,2.0,$2.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States


#### **Row Index Slicing**

In [8]:
pizza_df.loc[0:100:10] 

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country
0,Angelos Pizza,40.623254,-73.937922,2022-1014,"Oct 14th 2022, 5:57:51 pm",2022,3.0,$3.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States
10,Ciro Pizza Cafe,40.533289,-74.192775,2022-0812,"Aug 12th 2022, 9:34:49 pm",2022,2.5,$2.50,Plain,Staten Island,Huguenot Avenue,City of New York,New York,United States
20,Enzo's Pizzeria,40.684213,-73.859342,2022-0527,"May 27th 2022, 8:14:56 pm",2022,3.26,$3.26,Plain,Queens,81st Street,City of New York,New York,United States
30,Villa Rustica Ristorante & Pizzeria,40.739964,-73.758249,2022-0416,"Apr 16th 2022, 5:59:19 pm",2022,3.5,$3.50,Plain,Queens,Bell Boulevard,City of New York,New York,United States
40,Sal Pizza,40.704507,-73.919242,2022-0205,"Feb 5th 2022, 10:41:00 pm",2022,2.75,$2.75,Plain,Brooklyn,Wyckoff Avenue,City of New York,New York,United States
50,Sam's Famous Pizza 116th,40.798309,-73.941632,2022-0115,"Jan 15th 2022, 9:00:23 pm",2022,3.0,$3.00,Plain,Manhattan,East 116th Street,City of New York,New York,United States
60,Brothers pizza shop,40.86726,-73.89639,2021-1221,"Dec 21st 2021, 6:37:26 pm",2021,2.5,$2.50,Plain,The Bronx,East Kingsbridge Road,City of New York,New York,United States
70,Joe's Pizza & Pasta,40.760163,-73.731491,2021-1119,"Nov 19th 2021, 2:41:16 pm",2021,3.0,$3.00,Plain,Queens,Marathon Parkway,City of New York,New York,United States
80,La Bona Pizza & Pasta,40.6737,-73.88283,2021-1016,"Oct 16th 2021, 6:58:35 pm",2021,2.75,$2.75,Plain,Brooklyn,Pitkin Avenue,City of New York,New York,United States
90,Belmora Pizza & Restaurant,40.76095,-73.96879,2021-0811,"Aug 11th 2021, 9:32:05 pm",2021,3.2,$3.20,Plain,Manhattan,East 57th Street,City of New York,New York,United States


## **Cleaning Data**

#### **Identifying Nulls**

In [9]:
pizza_df.isnull().sum() # .sum() tells us how many Nulls there are in the dataframe by column

Name                            0
location_lat                    0
location_lng                    0
Date                            0
Date Expanded (times in EST)    0
Year                            0
Price as number                 0
Price                           0
Style                           0
borough                         3
street                          0
city                            4
state                           0
country                         0
dtype: int64

In [10]:
pizza_df.isnull()

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,False,False,False,False,False,False,False,False,False,False,False,False,False,False
462,False,False,False,False,False,False,False,False,False,False,False,False,False,False
463,False,False,False,False,False,False,False,False,False,False,False,False,False,False
464,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
sample_df = 

In [11]:
pizza_df.isnull().sum().sum() # 2 sums gives us a total aggregate of nulls

7

#### **Replacing Nulls**

In [15]:
pizza_df_nonull = pizza_df.fillna(value = 'Not in NY')

In [16]:
pizza_df_nonull.isnull().sum().sum()

0

In [18]:
pizza_df.iloc[74:75]

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country
74,Pizza Amore,40.650537,-73.701697,2021-1105,"Nov 5th 2021, 8:13:20 pm",2021,3.0,$3.00,Plain,,Dubois Avenue,,New York,United States


In [19]:
pizza_df_nonull.iloc[74:75]

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country
74,Pizza Amore,40.650537,-73.701697,2021-1105,"Nov 5th 2021, 8:13:20 pm",2021,3.0,$3.00,Plain,Not in NY,Dubois Avenue,Not in NY,New York,United States


In [21]:
pizza_df_nonull.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466 entries, 0 to 465
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          466 non-null    object 
 1   location_lat                  466 non-null    float64
 2   location_lng                  466 non-null    float64
 3   Date                          466 non-null    object 
 4   Date Expanded (times in EST)  466 non-null    object 
 5   Year                          466 non-null    int64  
 6   Price as number               466 non-null    float64
 7   Price                         466 non-null    object 
 8   Style                         466 non-null    object 
 9   borough                       466 non-null    object 
 10  street                        466 non-null    object 
 11  city                          466 non-null    object 
 12  state                         466 non-null    object 
 13  count

#### **Removing Nulls** 

In [22]:
df_dropped = pizza_df.dropna()

In [23]:
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 462 entries, 0 to 465
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          462 non-null    object 
 1   location_lat                  462 non-null    float64
 2   location_lng                  462 non-null    float64
 3   Date                          462 non-null    object 
 4   Date Expanded (times in EST)  462 non-null    object 
 5   Year                          462 non-null    int64  
 6   Price as number               462 non-null    float64
 7   Price                         462 non-null    object 
 8   Style                         462 non-null    object 
 9   borough                       462 non-null    object 
 10  street                        462 non-null    object 
 11  city                          462 non-null    object 
 12  state                         462 non-null    object 
 13  count

#### **Identifying Duplicates**

In [24]:
pizza_df.duplicated().sum()

2

#### **Extract duplicates**

In [25]:
#are there any duplicate rows? 
pizza_df.loc[pizza_df.duplicated()]  # Show duplicated rows


Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country
34,Marinara Pizza,40.729778,-73.986505,2022-0322,"Mar 22nd 2022, 7:55:54 pm",2022,4.9,$4.90,Pepperoni,Manhattan,2nd Avenue,City of New York,New York,United States
165,Slice of Brooklyn,40.65691,-74.00143,2019-0807,"Aug 7th 2019, 1:41:03 pm",2019,2.25,$2.25,Plain,Kings County,4th Avenue,City of New York,New York,United States


#### **Removing Duplicates**

In [26]:
df_nodupes = pizza_df.drop_duplicates()

In [27]:
df_nodupes.duplicated().sum()

0

# **Transforming data**

#### **Removing columns**

In [28]:
pizza_df.columns

Index(['Name', 'location_lat', 'location_lng', 'Date',
       'Date Expanded (times in EST)', 'Year', 'Price as number', 'Price',
       'Style', 'borough', 'street', 'city', 'state', 'country'],
      dtype='object')

In [29]:
df_lean = pizza_df.drop(columns =['location_lat', 'location_lng', 'country'])
df_lean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466 entries, 0 to 465
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          466 non-null    object 
 1   Date                          466 non-null    object 
 2   Date Expanded (times in EST)  466 non-null    object 
 3   Year                          466 non-null    int64  
 4   Price as number               466 non-null    float64
 5   Price                         466 non-null    object 
 6   Style                         466 non-null    object 
 7   borough                       463 non-null    object 
 8   street                        466 non-null    object 
 9   city                          462 non-null    object 
 10  state                         466 non-null    object 
dtypes: float64(1), int64(1), object(9)
memory usage: 40.2+ KB


## Search and replace

In [68]:
# What should we do with disparate borough names?

In [30]:
pizza_df.borough.unique()

array(['Brooklyn', 'Queens', 'Manhattan', 'The Bronx', 'Staten Island',
       'Kings County', nan, 'Queens County'], dtype=object)

In [32]:
pizza_df = pizza_df.replace('The Bronx','Bronx', regex=True)
pizza_df = pizza_df.replace('Queens County','Queens', regex=True)
pizza_df = pizza_df.replace('Kings County','Brooklyn', regex=True)
pizza_df.borough.unique()

array(['Brooklyn', 'Queens', 'Manhattan', 'Bronx', 'Staten Island', nan],
      dtype=object)

## Subset Dataframe


In [33]:
brooklyn_pizza= pizza_df[pizza_df['borough']=='Brooklyn']

In [34]:
brooklyn_pizza.head(3)

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country
0,Angelos Pizza,40.623254,-73.937922,2022-1014,"Oct 14th 2022, 5:57:51 pm",2022,3.0,$3.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States
2,Pino Pizza,40.600015,-73.999455,2022-1003,"Oct 3rd 2022, 5:47:23 pm",2022,2.75,$2.75,Plain,Brooklyn,21st Avenue,City of New York,New York,United States
7,N & D Pizza,40.600463,-73.943072,2022-0905,"Sep 5th 2022, 4:37:08 pm",2022,3.35,$3.35,Plain,Brooklyn,Avenue U,City of New York,New York,United States


In [36]:
cheap_pizza = pizza_df[ pizza_df["Price as number"] < 4]
cheap_pizza.head(10)

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country
0,Angelos Pizza,40.623254,-73.937922,2022-1014,"Oct 14th 2022, 5:57:51 pm",2022,3.0,$3.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States
1,Ozone Pizzeria,40.680892,-73.842631,2022-1008,"Oct 8th 2022, 6:48:59 pm",2022,3.0,$3.00,Plain,Queens,Liberty Avenue,City of New York,New York,United States
2,Pino Pizza,40.600015,-73.999455,2022-1003,"Oct 3rd 2022, 5:47:23 pm",2022,2.75,$2.75,Plain,Brooklyn,21st Avenue,City of New York,New York,United States
3,La Rondine,40.713335,-73.82941,2022-0924,"Sep 24th 2022, 6:34:19 pm",2022,3.25,$3.25,Plain,Queens,Queens Boulevard,City of New York,New York,United States
4,Rony's Fresh Pizza,40.748251,-73.99235,2022-0915,"Sep 15th 2022, 6:23:16 pm",2022,1.0,$1.00,Plain,Manhattan,West 30th Street,City of New York,New York,United States
5,John & Joe's Pizzeria,40.854562,-73.865882,2022-0909,"Sep 9th 2022, 8:48:44 pm",2022,3.5,$3.50,Plain,Bronx,Lydig Avenue,City of New York,New York,United States
6,Prego's Pizza,40.863129,-73.858511,2022-0909,"Sep 9th 2022, 5:13:31 pm",2022,3.0,$3.00,Plain,Bronx,Mace Avenue,City of New York,New York,United States
7,N & D Pizza,40.600463,-73.943072,2022-0905,"Sep 5th 2022, 4:37:08 pm",2022,3.35,$3.35,Plain,Brooklyn,Avenue U,City of New York,New York,United States
8,Peppinos,40.903661,-73.850467,2022-0818,"Aug 18th 2022, 9:23:01 pm",2022,3.0,$3.00,Plain,Bronx,East 241st Street,City of New York,New York,United States
9,Roccos Pizzeria,40.867634,-73.883605,2022-0813,"Aug 13th 2022, 8:53:26 pm",2022,3.5,$3.50,Plain,Bronx,Bedford Park Boulevard,City of New York,New York,United States


In [37]:
pizza_df['Style'].unique()

array(['Plain', 'Pepperoni', 'Stuffed Crust Plain', 'Jumbo', 'Sicilian',
       'White', 'Grandma', 'Margherita', 'Meatball'], dtype=object)

In [38]:
meal_map= {
    'Plain': 'lunch',
    'Pepperoni': 'lunch',
    'Stuffed Crust Plain': 'lunch',
    'Sicilian': 'dinner',
    'White': 'dinner',
    'Grandma': 'dinner',
    'Margherita': 'dinner',
    'Meatball': 'dinner'
}
pizza_df['Meal']=pizza_df['Style'].map(meal_map)

In [39]:
pizza_df.head(3)

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country,Meal
0,Angelos Pizza,40.623254,-73.937922,2022-1014,"Oct 14th 2022, 5:57:51 pm",2022,3.0,$3.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States,lunch
1,Ozone Pizzeria,40.680892,-73.842631,2022-1008,"Oct 8th 2022, 6:48:59 pm",2022,3.0,$3.00,Plain,Queens,Liberty Avenue,City of New York,New York,United States,lunch
2,Pino Pizza,40.600015,-73.999455,2022-1003,"Oct 3rd 2022, 5:47:23 pm",2022,2.75,$2.75,Plain,Brooklyn,21st Avenue,City of New York,New York,United States,lunch


In [40]:
pizza_df['Tax']=pizza_df['Price as number']*0.06

In [41]:
pizza_df.head(3)

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country,Meal,Tax
0,Angelos Pizza,40.623254,-73.937922,2022-1014,"Oct 14th 2022, 5:57:51 pm",2022,3.0,$3.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States,lunch,0.18
1,Ozone Pizzeria,40.680892,-73.842631,2022-1008,"Oct 8th 2022, 6:48:59 pm",2022,3.0,$3.00,Plain,Queens,Liberty Avenue,City of New York,New York,United States,lunch,0.18
2,Pino Pizza,40.600015,-73.999455,2022-1003,"Oct 3rd 2022, 5:47:23 pm",2022,2.75,$2.75,Plain,Brooklyn,21st Avenue,City of New York,New York,United States,lunch,0.165


## **Pivot Table**

Produce a “pivot” table based on column values. Uses unique values from specified index / columns to form axes of the resulting DataFrame. This function does not support data aggregation, multiple values will result in a MultiIndex in the columns. 

- `pizza_df.pivot_table()` creates a pivot table as a dataframe allowing aggregation and tabulation

[For more information about df.pivot_table() click here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot_table.html)

In [96]:
joined_df.head(1)

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country,Group,1950,1950 - Boro share of NYC total,2020 Population,2020 - Boro share of NYC total
0,Angelos Pizza,40.623254,-73.937922,2022-1014,"Oct 14th 2022, 5:57:51 pm",2022,3.0,$3.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States,Total Population,2738175,34.7,2648452,30.97


There are 2 pivot_table methods.  One is a dataframe method, the other is a more generic Pandas method

In [29]:
# This is the pandas method without columns
pivot_df = pd.pivot_table(df, index='borough', values='Price as number')      
pivot_df

Unnamed: 0_level_0,Price as number
borough,Unnamed: 1_level_1
Bronx,2.881429
Brooklyn,2.620294
Manhattan,2.726489
Queens,2.630341
Staten Island,2.519375


In [30]:
pivot_df = pd.pivot_table(df, index='borough', values='Price as number', aggfunc=('mean', 'median', 'sum', 'std', 'count'))      
pivot_df

Unnamed: 0_level_0,count,mean,median,std,sum
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bronx,35,2.881429,3.0,0.599152,100.85
Brooklyn,136,2.620294,2.75,0.671044,356.36
Manhattan,188,2.726489,2.75,1.010681,512.58
Queens,88,2.630341,2.7,0.556034,231.47
Staten Island,16,2.519375,2.5,0.552877,40.31


In [31]:
# This is the pandas method
pivot_df = pd.pivot_table(df, index='borough', values='Price as number', columns='Style')      
pivot_df

Style,Grandma,Jumbo,Margherita,Meatball,Pepperoni,Plain,Sicilian,Stuffed Crust Plain,White
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bronx,,5.0,,,4.35,2.772727,,,
Brooklyn,,,4.0,4.0,3.37,2.483333,2.5,3.5,4.0
Manhattan,4.6,5.5,3.75,,3.870811,2.387143,,,
Queens,,,,,4.0,2.582976,2.5,,
Staten Island,,,,,,2.433333,3.81,,


In [116]:
# this is the dataframe method, different syntax but same functionality
pivot_df = df.pivot_table('Price as number', index='borough', aggfunc='count')      
pivot_df

Unnamed: 0_level_0,Price as number
borough,Unnamed: 1_level_1
Bronx,35
Brooklyn,136
Manhattan,188
Queens,88
Staten Island,16


In [33]:
#once more with columns
pivot_df = df.pivot_table('Price as number', index='borough', columns='Style')      
pivot_df

Style,Grandma,Jumbo,Margherita,Meatball,Pepperoni,Plain,Sicilian,Stuffed Crust Plain,White
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bronx,,5.0,,,4.35,2.772727,,,
Brooklyn,,,4.0,4.0,3.37,2.483333,2.5,3.5,4.0
Manhattan,4.6,5.5,3.75,,3.870811,2.387143,,,
Queens,,,,,4.0,2.582976,2.5,,
Staten Island,,,,,,2.433333,3.81,,


In [34]:
pd.crosstab(df['borough'], df['Style'])

Style,Grandma,Jumbo,Margherita,Meatball,Pepperoni,Plain,Sicilian,Stuffed Crust Plain,White
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bronx,0,1,0,0,1,33,0,0,0
Brooklyn,0,0,1,1,13,117,1,1,2
Manhattan,2,1,1,0,37,147,0,0,0
Queens,0,0,0,0,3,84,1,0,0
Staten Island,0,0,0,0,0,15,1,0,0


## **Group By**

A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be used to group large amounts of data and compute operations on these groups.

In [35]:
# one group 
group_df = pizza_df.groupby('borough')['Price as number'].mean()
group_df

borough
Bronx            2.881429
Brooklyn         2.620294
Manhattan        2.726489
Queens           2.630341
Staten Island    2.519375
Name: Price as number, dtype: float64

In [36]:
# two groups
df2 = joined_df.groupby(['borough','Style'])['Price as number'].mean()
df2

borough        Style              
Bronx          Jumbo                  5.000000
               Pepperoni              4.350000
               Plain                  2.772727
Brooklyn       Margherita             4.000000
               Meatball               4.000000
               Pepperoni              3.370000
               Plain                  2.483333
               Sicilian               2.500000
               Stuffed Crust Plain    3.500000
               White                  4.000000
Manhattan      Grandma                4.600000
               Jumbo                  5.500000
               Margherita             3.750000
               Pepperoni              3.870811
               Plain                  2.387143
Queens         Pepperoni              4.000000
               Plain                  2.582976
               Sicilian               2.500000
Staten Island  Plain                  2.433333
               Sicilian               3.810000
Name: Price as number, dt

In [83]:
## unstacking gives us the same results as a pivot table with columns
df2.unstack()

Style,Grandma,Jumbo,Margherita,Meatball,Pepperoni,Plain,Sicilian,Stuffed Crust Plain,White
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bronx,,5.0,,,4.35,2.772727,,,
Brooklyn,,,4.0,4.0,3.37,2.483333,2.5,3.5,4.0
Manhattan,4.6,5.5,3.75,,3.870811,2.387143,,,
Queens,,,,,4.0,2.582976,2.5,,
Staten Island,,,,,,2.433333,3.81,,


In [118]:
df3 = joined_df.groupby(['borough','Style'])['Price as number'].describe()
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
borough,Style,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bronx,Jumbo,1.0,5.0,,5.0,5.0,5.0,5.0,5.0
Bronx,Pepperoni,1.0,4.35,,4.35,4.35,4.35,4.35,4.35
Bronx,Plain,33.0,2.772727,0.401966,2.0,2.5,3.0,3.0,3.5
Brooklyn,Margherita,1.0,4.0,,4.0,4.0,4.0,4.0,4.0
Brooklyn,Meatball,1.0,4.0,,4.0,4.0,4.0,4.0,4.0
Brooklyn,Pepperoni,13.0,3.37,0.603904,2.0,3.0,3.25,3.75,4.5
Brooklyn,Plain,117.0,2.483333,0.575244,1.0,2.25,2.5,2.75,5.0
Brooklyn,Sicilian,1.0,2.5,,2.5,2.5,2.5,2.5,2.5
Brooklyn,Stuffed Crust Plain,1.0,3.5,,3.5,3.5,3.5,3.5,3.5
Brooklyn,White,2.0,4.0,0.353553,3.75,3.875,4.0,4.125,4.25


## **Joins**

Join columns with other DataFrame either on index or on a key column. Efficiently join multiple DataFrame objects by index at once by passing a list. Left join by default.

* Uses the first row of the right table as index

[For more information on joins click here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.join.html)

In [42]:
df_NYC_pop = pd.read_json('Data/NYCPopulation.json')
df_NYC_pop.head(3)

Unnamed: 0,Group,borough,1950,1950 - Boro share of NYC total,2020 Population,2020 - Boro share of NYC total
0,Total Population,NYC Total,7891957,100.0,8550971,100.0
1,Total Population,Bronx,1451277,18.39,1446788,16.92
2,Total Population,Brooklyn,2738175,34.7,2648452,30.97


In [80]:
joined_df = pizza_df.join(df_NYC_pop.set_index('borough'), on='borough', how = 'inner')
joined_df.head(3)

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country,Group,1950,1950 - Boro share of NYC total,2020 Population,2020 - Boro share of NYC total
0,Angelos Pizza,40.623254,-73.937922,2022-1014,"Oct 14th 2022, 5:57:51 pm",2022,3.0,$3.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States,Total Population,2738175,34.7,2648452,30.97
2,Pino Pizza,40.600015,-73.999455,2022-1003,"Oct 3rd 2022, 5:47:23 pm",2022,2.75,$2.75,Plain,Brooklyn,21st Avenue,City of New York,New York,United States,Total Population,2738175,34.7,2648452,30.97
7,N & D Pizza,40.600463,-73.943072,2022-0905,"Sep 5th 2022, 4:37:08 pm",2022,3.35,$3.35,Plain,Brooklyn,Avenue U,City of New York,New York,United States,Total Population,2738175,34.7,2648452,30.97



`how` accepts arguments that define how the tables are joined.

- left: use calling frame’s index (or column if on is specified)
- right: use other’s index.
- outer: form union of calling frame’s index (or column if on is specified) with other’s index, and sort it. lexicographically.
- inner: form intersection of calling frame’s index (or column if on is specified) with other’s index, preserving the order of the calling’s one.
- cross: creates the cartesian product from both frames, preserves the order of the left keys.

## **Merge**

- For combining data on common columns
- Re-indexes the result
- Requires common column names
- Side by side merge
- Inner join by default

In [81]:
merged = pd.merge(pizza_df, df_NYC_pop)
merged.head(3)

Unnamed: 0,Name,location_lat,location_lng,Date,Date Expanded (times in EST),Year,Price as number,Price,Style,borough,street,city,state,country,Group,1950,1950 - Boro share of NYC total,2020 Population,2020 - Boro share of NYC total
0,Angelos Pizza,40.623254,-73.937922,2022-1014,"Oct 14th 2022, 5:57:51 pm",2022,3.0,$3.00,Plain,Brooklyn,Flatbush Avenue,City of New York,New York,United States,Total Population,2738175,34.7,2648452,30.97
1,Pino Pizza,40.600015,-73.999455,2022-1003,"Oct 3rd 2022, 5:47:23 pm",2022,2.75,$2.75,Plain,Brooklyn,21st Avenue,City of New York,New York,United States,Total Population,2738175,34.7,2648452,30.97
2,N & D Pizza,40.600463,-73.943072,2022-0905,"Sep 5th 2022, 4:37:08 pm",2022,3.35,$3.35,Plain,Brooklyn,Avenue U,City of New York,New York,United States,Total Population,2738175,34.7,2648452,30.97


#### **Exporting data**

In [96]:
pizza_df.to_csv('Data/pizza_cleaned.csv')

In [97]:
pizza_df.to_excel('Data/pizza_cleaned.xlsx')

In [98]:
pizza_df.to_json(Data/'pizza_cleaned.json')