# **Restaurant Inspections New York City**

## **Imports**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
%matplotlib inline
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords

#Sklearn preprocessing
from sklearn import preprocessing,set_config
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder,StandardScaler,LabelEncoder
#Scipy
from scipy import stats
from scipy.stats import norm

#Sklearn Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn import preprocessing, set_config
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,make_column_selector,make_column_transformer
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn.decomposition import PCA


# Silence every warning for the rest of the session (blanket "ignore" filter).
import warnings 
warnings.filterwarnings(action="ignore")

# Have scikit-learn render estimators/pipelines as diagrams when displayed.
set_config(display='diagram')

# **Load Data**

In [2]:
# Download the inspection records as CSV from the City of New York data portal.
# NOTE(review): this is a live endpoint, so the row count presumably changes
# between runs -- confirm before relying on the figures shown in the outputs.
inspections_csv_url = 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD'
rd = pd.read_csv(inspections_csv_url)

# **Data Exploration**


In [3]:
# Sanity check after loading: list every column with its non-null count and dtype.
rd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208029 entries, 0 to 208028
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  208029 non-null  int64  
 1   DBA                    207562 non-null  object 
 2   BORO                   208029 non-null  object 
 3   BUILDING               207653 non-null  object 
 4   STREET                 208023 non-null  object 
 5   ZIPCODE                205315 non-null  float64
 6   PHONE                  208022 non-null  object 
 7   CUISINE DESCRIPTION    205791 non-null  object 
 8   INSPECTION DATE        208029 non-null  object 
 9   ACTION                 205791 non-null  object 
 10  VIOLATION CODE         204640 non-null  object 
 11  VIOLATION DESCRIPTION  204640 non-null  object 
 12  CRITICAL FLAG          208029 non-null  object 
 13  SCORE                  198416 non-null  float64
 14  GRADE                  102267 non-nu

In [4]:
# Preview the first rows to confirm the values were parsed as expected.
rd.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,...,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA,Location Point1
0,41408361,BEKY BAKERY AND COFFEE SHOP,Queens,3778,103 STREET,11368.0,7183970329,Spanish,01/06/2020,No violations were recorded at the time of thi...,...,Administrative Miscellaneous / Initial Inspection,40.752738,-73.864188,403.0,21.0,40500.0,4044240.0,4017680000.0,QN26,
1,50124500,EAT OFF BEAT,Manhattan,75,9 AVENUE,10011.0,9176716197,Other,09/21/2023,Establishment re-opened by DOHMH.,...,Pre-permit (Operational) / Reopening Inspection,40.741869,-74.004713,104.0,3.0,8300.0,1012541.0,1007130000.0,MN13,
2,50142175,UVA NEXT DOOR,Manhattan,1484,2 AVENUE,10075.0,9175262090,,01/01/1900,,...,,40.772284,-73.955805,108.0,5.0,13400.0,1045199.0,1014520000.0,MN31,
3,50107467,CHANCHITOS CAFE,Brooklyn,176,ROCKAWAY AVENUE,11233.0,3477895954,,01/01/1900,,...,,40.677091,-73.910865,316.0,41.0,30100.0,3042586.0,3015670000.0,BK79,
4,50104755,FLIK INTERNATIONAL CORP.,Manhattan,599,LEXINGTON AVENUE,10022.0,7043285090,,01/01/1900,,...,,40.758268,-73.971054,106.0,4.0,10000.0,1036467.0,1013070000.0,MN19,


## **Drop Columns**

In [5]:
# Dropping irrelevant or incomplete columns
unused_columns = [
    'Location Point1', 'PHONE', 'BBL', 'BIN',
    'NTA', 'Census Tract', 'Community Board', 'RECORD DATE',
]
# The drop mutates `rd` in place, so re-running this cell without reloading
# the data raises KeyError (the columns are already gone).
rd.drop(columns=unused_columns, inplace=True)

In [6]:
# Re-run info() to confirm the dropped columns are no longer present.
rd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208029 entries, 0 to 208028
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  208029 non-null  int64  
 1   DBA                    207562 non-null  object 
 2   BORO                   208029 non-null  object 
 3   BUILDING               207653 non-null  object 
 4   STREET                 208023 non-null  object 
 5   ZIPCODE                205315 non-null  float64
 6   CUISINE DESCRIPTION    205791 non-null  object 
 7   INSPECTION DATE        208029 non-null  object 
 8   ACTION                 205791 non-null  object 
 9   VIOLATION CODE         204640 non-null  object 
 10  VIOLATION DESCRIPTION  204640 non-null  object 
 11  CRITICAL FLAG          208029 non-null  object 
 12  SCORE                  198416 non-null  float64
 13  GRADE                  102267 non-null  object 
 14  GRADE DATE             93483 non-nul

## **Checking for missing data**

In [12]:
# Summarise missing values per column
def check_missing_data(rd):
    """Build a per-column summary of missing values.

    Parameters
    ----------
    rd : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``rd``, ordered by ascending number of nulls,
        with two columns: ``Total`` (count of null entries) and ``Percent``
        (share of null entries, rounded to two decimals and rendered as a
        string with a trailing ``%``).
    """
    null_mask = rd.isnull()

    # Absolute number of nulls in each column, fewest first.
    null_counts = null_mask.sum().sort_values(ascending=True)

    # count() on the boolean mask yields the number of rows for each column,
    # so this ratio is the percentage of null entries per column.
    null_share = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=True)
    percent_labels = null_share.round(2).astype(str) + '%'  # e.g. 33.33 -> '33.33%'

    return pd.concat([null_counts, percent_labels], axis=1, keys=['Total', 'Percent'])
# Compute the per-column missing-value summary for the inspections frame.
missing_data_result = check_missing_data(rd)
# Print the summary itself. The previous `print(missing_data_result.head)`
# lacked call parentheses and therefore printed the bound-method repr
# ("<bound method NDFrame.head of ...>") rather than a table. The summary has
# one row per column, so it is small enough to show in full.
print(missing_data_result)

<bound method NDFrame.head of                           Total Percent
CAMIS                         0    0.0%
BORO                          0    0.0%
CRITICAL FLAG                 0    0.0%
INSPECTION DATE               0    0.0%
STREET                        6    0.0%
Longitude                   257   0.12%
Latitude                    257   0.12%
BUILDING                    376   0.18%
Cleaned Restaurant Name     467   0.22%
DBA                         467   0.22%
CUISINE DESCRIPTION        2238   1.08%
INSPECTION TYPE            2238   1.08%
ACTION                     2238   1.08%
ZIPCODE                    2714    1.3%
Council District           3321    1.6%
VIOLATION DESCRIPTION      3389   1.63%
VIOLATION CODE             3389   1.63%
SCORE                      9613   4.62%
GRADE                    105762  50.84%
GRADE DATE               114546  55.06%>


In [13]:
rd.to_csv('rd', index=False)