# **Hypothesis Testing**

# **Imports**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly


#Sklearn preprocessing
from sklearn import preprocessing,set_config
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder,StandardScaler,LabelEncoder

#Scipy
from scipy import stats
from scipy.stats import norm, pearsonr
import statsmodels.api as sm

# Warnings: install a global "ignore" filter so no warnings are shown from here on.
# NOTE(review): a blanket filter also hides deprecation and numerical warnings
# raised by the libraries used below — consider narrowing it to specific categories.
import warnings 
warnings.filterwarnings("ignore")

# Configuring diagrams: ask scikit-learn to display estimators as diagrams
set_config(display = 'diagram')

# **Loading The Data**

In [2]:
# Load the NYC restaurant inspection data into `rd`.
# The path 'rd' is relative, so it is resolved against the current working directory.
rd = pd.read_csv('rd')

In [3]:
# Load the Yelp listings for NYC restaurants into `ydf`.
# The path 'ydf' is relative, so it is resolved against the current working directory.
ydf = pd.read_csv('ydf')

In [4]:
# Sanity-check the inspection data: column names, dtypes and non-null counts
rd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209141 entries, 0 to 209140
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  209141 non-null  int64  
 1   DBA                    208544 non-null  object 
 2   BORO                   209141 non-null  object 
 3   BUILDING               208824 non-null  object 
 4   STREET                 209135 non-null  object 
 5   ZIPCODE                206455 non-null  float64
 6   CUISINE DESCRIPTION    206702 non-null  object 
 7   INSPECTION DATE        209141 non-null  object 
 8   ACTION                 206702 non-null  object 
 9   VIOLATION CODE         205553 non-null  object 
 10  VIOLATION DESCRIPTION  205553 non-null  object 
 11  CRITICAL FLAG          209141 non-null  object 
 12  SCORE                  199209 non-null  float64
 13  GRADE                  102462 non-null  object 
 14  GRADE DATE             93826 non-nul

In [5]:
# Sanity-check the Yelp data: column names, dtypes and non-null counts
ydf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848 entries, 0 to 847
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   alias         848 non-null    object 
 1   name          848 non-null    object 
 2   image_url     848 non-null    object 
 3   url           848 non-null    object 
 4   review_count  848 non-null    int64  
 5   categories    848 non-null    object 
 6   rating        848 non-null    float64
 7   coordinates   848 non-null    object 
 8   transactions  848 non-null    object 
 9   location      848 non-null    object 
dtypes: float64(1), int64(1), object(8)
memory usage: 66.4+ KB


# **Data Cleaning**

In [6]:
# Keep only the inspection rows whose borough is Brooklyn
# (the comparison is case-insensitive because BORO is lower-cased first)
brooklyndf = rd.loc[rd['BORO'].str.lower().eq('brooklyn')]

In [7]:
# Narrow the Brooklyn rows to those whose cuisine description is exactly
# "pizza" (case-insensitive). Descriptions that merely contain the word
# "pizza" alongside other text are NOT matched by this equality test.
pizza_brooklyn_subset = brooklyndf.loc[
    brooklyndf['CUISINE DESCRIPTION'].str.lower().eq('pizza')
]

In [8]:
# Keep only the rows that have a value in the 'GRADE' column (NaN grades are removed)
pizza_brooklyn_subset = pizza_brooklyn_subset[pizza_brooklyn_subset['GRADE'].notna()]

In [9]:
# Take a look at the graded Brooklyn pizza restaurants in the inspection data.
# print() emits pandas' plain-text, truncated representation of the frame.
print(pizza_brooklyn_subset)

           CAMIS                                  DBA      BORO BUILDING  \
143     40574680                         PETE'S PLACE  Brooklyn     5620   
205     50131915                99 CENT SUPREME PIZZA  Brooklyn      297   
515     50037645                          JOE'S PIZZA  Brooklyn      216   
533     41459621  GINO'S BRICK OVEN PIZZA & TRATTORIA  Brooklyn   548550   
607     41273383                    LA BELLA PIZZERIA  Brooklyn     8509   
...          ...                                  ...       ...      ...   
208844  50099918                 JOE & SAL'S PIZZERIA  Brooklyn      353   
208855  50126709                           FINI PIZZA  Brooklyn      305   
208867  50035397                          DANNY PIZZA  Brooklyn     1620   
208871  41196869                          LAYLA JONES  Brooklyn      214   
208885  50102007               ROME TO BROOKLYN PIZZA  Brooklyn      755   

                 STREET  ZIPCODE CUISINE DESCRIPTION INSPECTION DATE  \
143            

In [10]:
# Make the restaurant names in both data sets compatible for the merge:
# the inspection data's DBA values are upper-case, so upper-case the Yelp names too
column_name = 'name'
ydf[column_name] = ydf[column_name].map(lambda raw_name: str(raw_name).upper())

In [11]:
# Inner-join the Brooklyn pizza inspections with the Yelp listings on restaurant name.
# NOTE(review): the name is the only join key, so one inspection row is paired with
# EVERY Yelp listing that shares that name, including listings at other addresses.
# Consider adding a location key (e.g. address or ZIP) or de-duplicating before
# running statistics on the result.
mdf = pizza_brooklyn_subset.merge(
    ydf,
    how='inner',
    left_on='DBA',
    right_on='name',
    indicator=True,
)

# Display the merged dataframe
print(mdf)

        CAMIS                 DBA      BORO BUILDING          STREET  ZIPCODE  \
0    50037645         JOE'S PIZZA  Brooklyn      216  BEDFORD AVENUE  11249.0   
1    50037645         JOE'S PIZZA  Brooklyn      216  BEDFORD AVENUE  11249.0   
2    50037645         JOE'S PIZZA  Brooklyn      216  BEDFORD AVENUE  11249.0   
3    50037645         JOE'S PIZZA  Brooklyn      216  BEDFORD AVENUE  11249.0   
4    50037645         JOE'S PIZZA  Brooklyn      216  BEDFORD AVENUE  11249.0   
..        ...                 ...       ...      ...             ...      ...   
511  50134489  LITTLE ITALY PIZZA  Brooklyn      179  BEDFORD AVENUE  11211.0   
512  50134489  LITTLE ITALY PIZZA  Brooklyn      179  BEDFORD AVENUE  11211.0   
513  50134489  LITTLE ITALY PIZZA  Brooklyn      179  BEDFORD AVENUE  11211.0   
514  50134489  LITTLE ITALY PIZZA  Brooklyn      179  BEDFORD AVENUE  11211.0   
515  50134489  LITTLE ITALY PIZZA  Brooklyn      179  BEDFORD AVENUE  11211.0   

    CUISINE DESCRIPTION INS

In [12]:
# Preview the first rows of the merged inspection + Yelp frame
mdf.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,...,name,image_url,url,review_count,categories,rating,coordinates,transactions,location,_merge
0,50037645,JOE'S PIZZA,Brooklyn,216,BEDFORD AVENUE,11249.0,Pizza,05/08/2023,Violations were cited in the following area(s).,02B,...,JOE'S PIZZA,https://s3-media3.fl.yelpcdn.com/bphoto/0WEXB_...,https://www.yelp.com/biz/joes-pizza-brooklyn-8...,459,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.7168952, 'longitude': -73.9589...","['delivery', 'pickup']","{'address1': '216 Bedford Ave', 'address2': ''...",both
1,50037645,JOE'S PIZZA,Brooklyn,216,BEDFORD AVENUE,11249.0,Pizza,05/08/2023,Violations were cited in the following area(s).,02B,...,JOE'S PIZZA,https://s3-media4.fl.yelpcdn.com/bphoto/FluVgK...,https://www.yelp.com/biz/joes-pizza-new-york-1...,2071,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.7547, 'longitude': -73.98696}","['delivery', 'pickup']","{'address1': '1435 Broadway', 'address2': '', ...",both
2,50037645,JOE'S PIZZA,Brooklyn,216,BEDFORD AVENUE,11249.0,Pizza,05/08/2023,Violations were cited in the following area(s).,02B,...,JOE'S PIZZA,https://s3-media3.fl.yelpcdn.com/bphoto/FKjd9R...,https://www.yelp.com/biz/joes-pizza-new-york-4...,3074,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.73060076, 'longitude': -74.002...",['delivery'],"{'address1': '7 Carmine St', 'address2': '', '...",both
3,50037645,JOE'S PIZZA,Brooklyn,216,BEDFORD AVENUE,11249.0,Pizza,05/08/2023,Violations were cited in the following area(s).,02B,...,JOE'S PIZZA,https://s3-media2.fl.yelpcdn.com/bphoto/QXu6RG...,https://www.yelp.com/biz/joes-pizza-new-york-7...,1075,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.73331, 'longitude': -73.98763}","['delivery', 'pickup']","{'address1': '150 E 14th St', 'address2': '', ...",both
4,50037645,JOE'S PIZZA,Brooklyn,216,BEDFORD AVENUE,11249.0,Pizza,05/08/2023,Violations were cited in the following area(s).,02B,...,JOE'S PIZZA,https://s3-media4.fl.yelpcdn.com/bphoto/sUjegc...,https://www.yelp.com/biz/joes-pizza-new-york-1...,517,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.7101297729083, 'longitude': -7...","['delivery', 'pickup']","{'address1': '124 Fulton St', 'address2': '', ...",both


In [13]:
# Frequency of each inspection score among the merged rows
mdf['SCORE'].value_counts()

SCORE
12.0    92
13.0    76
64.0    50
27.0    41
10.0    35
9.0     35
18.0    30
7.0     18
11.0    17
0.0     16
2.0     14
6.0     13
5.0     10
3.0     10
16.0     8
24.0     8
4.0      7
8.0      7
21.0     6
37.0     5
30.0     5
23.0     4
19.0     3
22.0     3
20.0     3
Name: count, dtype: int64

In [14]:
# Frequency of each inspection grade among the merged rows
mdf['GRADE'].value_counts()

GRADE
A    327
B    102
Z     65
C     11
P      7
N      4
Name: count, dtype: int64

## **Hypothesis Test 1: Try to find a correlation between scores on inspections and the ratings on Yelp**


 - The null hypothesis for this question is that the restaurant inspection score and grade do not significantly affect the Yelp ratings of pizza restaurants in Brooklyn, NY.

 - The alternative hypothesis for this question is that the restaurant inspection score and grade do affect the Yelp ratings of pizza restaurants in Brooklyn, NY.

In [15]:
# Perform a Pearson correlation test between inspection score and Yelp rating.
# Rows missing either value are dropped first so NaNs cannot turn the
# statistic into NaN.
# NOTE(review): mdf holds one row per (inspection row, Yelp listing) name match,
# so the rows are probably not independent observations and the p-value is
# likely optimistic — confirm before drawing conclusions.
ALPHA = 0.05  # significance level used to judge the p-value

score_rating = mdf[['SCORE', 'rating']].dropna()
correlation, p_value = pearsonr(score_rating['SCORE'], score_rating['rating'])

# Print results. The leading "." matters: ".2f" / ".4f" request 2 and 4 decimal
# places, whereas a bare "2f" / "4f" only sets a minimum field width and keeps
# the default 6 decimals.
print(f'Correlation: {correlation:.2f}')
print(f'P-value: {p_value:.4f}')

# Check for significance based on the p-value
if p_value < ALPHA:
    print('The correlation is statistically significant.')
else:
    print('There is no significant correlation.')

Correlation: 0.154898
P-value: 0.000413
The correlation is statistically significant.
