# **Restaurant Inspections New York City**

## **Imports**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
%matplotlib inline
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords

#Sklearn preprocessing
from sklearn import preprocessing,set_config
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder,StandardScaler,LabelEncoder
#Scipy
from scipy import stats
from scipy.stats import norm

#Sklearn Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn import preprocessing, set_config
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,make_column_selector,make_column_transformer
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn.decomposition import PCA


# Warnings 
import warnings 
warnings.filterwarnings("ignore")

# Configuring diagrams
set_config(display = 'diagram')

# **Load Data**

In [2]:
rd = pd.read_csv('https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD')

# **Data Exploration**


In [3]:
# Check to see if data loaded properly:
rd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205586 entries, 0 to 205585
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  205586 non-null  int64  
 1   DBA                    205122 non-null  object 
 2   BORO                   205586 non-null  object 
 3   BUILDING               205200 non-null  object 
 4   STREET                 205582 non-null  object 
 5   ZIPCODE                202881 non-null  float64
 6   PHONE                  205579 non-null  object 
 7   CUISINE DESCRIPTION    203371 non-null  object 
 8   INSPECTION DATE        205586 non-null  object 
 9   ACTION                 203371 non-null  object 
 10  VIOLATION CODE         202237 non-null  object 
 11  VIOLATION DESCRIPTION  202237 non-null  object 
 12  CRITICAL FLAG          205586 non-null  object 
 13  SCORE                  196136 non-null  float64
 14  GRADE                  101750 non-nu

In [4]:
# Check to see if data loaded properly:
rd.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,...,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA,Location Point1
0,50099375,LINGERIE SHOPPE,Queens,2853,STEINWAY ST,11103.0,5168281448,,01/01/1900,,...,,40.765178,-73.913927,401.0,22.0,14700.0,4010807.0,4006630000.0,QN70,
1,50142028,,Queens,3515,DITMARS BLVD,11105.0,7187772188,,01/01/1900,,...,,40.774734,-73.908719,401.0,22.0,11300.0,4016613.0,4008240000.0,QN72,
2,50097252,RELAX,Brooklyn,68A,NEWELL STREET,11222.0,7183891665,,01/01/1900,,...,,40.724538,-73.947218,301.0,33.0,57100.0,3066972.0,3026840000.0,BK76,
3,50118741,Ace’s Pizza,Manhattan,30,ROCKEFELLER PLAZA,10112.0,5167706510,,01/01/1900,,...,,40.758747,-73.978692,105.0,4.0,10400.0,1076262.0,1012658000.0,MN17,
4,50139226,RIZE BROOKLYN,Brooklyn,137,SCHENECTADY AVENUE,11213.0,7189383131,,01/01/1900,,...,,40.674427,-73.93336,308.0,36.0,30900.0,3324618.0,3013540000.0,BK61,


## **Drop Columns**

In [5]:
# Dropping irrelevant of imcomplete columns
rd.drop(['Location Point1', 'PHONE', 'BBL', 'BIN', 'NTA', 'Census Tract', 'Community Board', 'RECORD DATE'], axis=1, inplace=True)

In [6]:
# Checking to see if columns dropped 
rd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205586 entries, 0 to 205585
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  205586 non-null  int64  
 1   DBA                    205122 non-null  object 
 2   BORO                   205586 non-null  object 
 3   BUILDING               205200 non-null  object 
 4   STREET                 205582 non-null  object 
 5   ZIPCODE                202881 non-null  float64
 6   CUISINE DESCRIPTION    203371 non-null  object 
 7   INSPECTION DATE        205586 non-null  object 
 8   ACTION                 203371 non-null  object 
 9   VIOLATION CODE         202237 non-null  object 
 10  VIOLATION DESCRIPTION  202237 non-null  object 
 11  CRITICAL FLAG          205586 non-null  object 
 12  SCORE                  196136 non-null  float64
 13  GRADE                  101750 non-null  object 
 14  GRADE DATE             92110 non-nul

## **Checking for missing data**

In [7]:
# checking missing data in data 
def check_missing_data(df):
    total = df.isnull().sum().sort_values(ascending=True)
    percent = (df.isnull().sum() / df.isnull().count() * 100).sort_values(ascending=True)
    percent = percent.round(2).astype(str) + '%'  # Add the percentage sign
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
missing_data_result = check_missing_data(rd)
print(missing_data_result.head)

<bound method NDFrame.head of                         Total Percent
CAMIS                       0    0.0%
BORO                        0    0.0%
INSPECTION DATE             0    0.0%
CRITICAL FLAG               0    0.0%
STREET                      4    0.0%
Longitude                 257   0.13%
Latitude                  257   0.13%
BUILDING                  386   0.19%
DBA                       464   0.23%
CUISINE DESCRIPTION      2215   1.08%
ACTION                   2215   1.08%
INSPECTION TYPE          2215   1.08%
ZIPCODE                  2705   1.32%
Council District         3323   1.62%
VIOLATION DESCRIPTION    3349   1.63%
VIOLATION CODE           3349   1.63%
SCORE                    9450    4.6%
GRADE                  103836  50.51%
GRADE DATE             113476   55.2%>


## **Clean up Names of Restaurants**

In [8]:
# Function to clean restaurant names
def clean_restaurant_name(name):
    # check if the input is a string 
    if isinstance(name, str):
        # Remove store numbers using regular expression 
        cleaned_name = re.sub(r'#\d+','', name)
        # Remove all numeric characters 
        cleaned_name = re.sub(r'#\d+', '', cleaned_name)
        #Remove leading and trailing whitespace
        cleaned_name = cleaned_name.strip()
        return cleaned_name
    else:
        return name #Return the input unchanged for non-string values
# Apply the clean_restaurant_name function to the 'DBA' column
rd['Cleaned Restaurant Name'] = rd['DBA'].apply(clean_restaurant_name)
# Drop the origianl 'DBA' column if you want 
#rd.drop(columns=['DBA'], inplace = True)

In [17]:
# Inspection Types
rd['INSPECTION TYPE'].value_counts()

Cycle Inspection / Initial Inspection                          112959
Cycle Inspection / Re-inspection                                36080
Pre-permit (Operational) / Initial Inspection                   30463
Pre-permit (Operational) / Re-inspection                         9085
Administrative Miscellaneous / Initial Inspection                4756
Pre-permit (Non-operational) / Initial Inspection                2447
Pre-permit (Operational) / Compliance Inspection                 1513
Cycle Inspection / Reopening Inspection                          1415
Administrative Miscellaneous / Re-inspection                      930
Pre-permit (Operational) / Reopening Inspection                   711
Cycle Inspection / Compliance Inspection                          687
Smoke-Free Air Act / Initial Inspection                           548
Trans Fat / Initial Inspection                                    381
Inter-Agency Task Force / Initial Inspection                      339
Calorie Posting / In