# **Restaurant Inspections New York City**

## **Imports**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
%matplotlib inline
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords

#Sklearn preprocessing
from sklearn import preprocessing,set_config
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder,StandardScaler,LabelEncoder
#Scipy
from scipy import stats
from scipy.stats import norm

#Sklearn Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn import preprocessing, set_config
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,make_column_selector,make_column_transformer
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn.decomposition import PCA


# Warnings 
import warnings 
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the imported libraries; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Render scikit-learn estimators and pipelines as diagrams in notebook output.
set_config(display = 'diagram')

# **Load Data**

In [2]:
# CSV export of dataset 43nn-pn8j served by data.cityofnewyork.us.
# The file is fetched over the network every time this cell runs.
INSPECTIONS_CSV_URL = 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD'
rd = pd.read_csv(INSPECTIONS_CSV_URL)

# **Data Exploration**


In [3]:
# Sanity-check the load: list every column with its non-null count and dtype.
rd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209141 entries, 0 to 209140
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  209141 non-null  int64  
 1   DBA                    208544 non-null  object 
 2   BORO                   209141 non-null  object 
 3   BUILDING               208824 non-null  object 
 4   STREET                 209135 non-null  object 
 5   ZIPCODE                206455 non-null  float64
 6   PHONE                  209133 non-null  object 
 7   CUISINE DESCRIPTION    206702 non-null  object 
 8   INSPECTION DATE        209141 non-null  object 
 9   ACTION                 206702 non-null  object 
 10  VIOLATION CODE         205553 non-null  object 
 11  VIOLATION DESCRIPTION  205553 non-null  object 
 12  CRITICAL FLAG          209141 non-null  object 
 13  SCORE                  199209 non-null  float64
 14  GRADE                  102462 non-nu

In [4]:
# Preview the first five rows to eyeball the raw values.
rd.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,...,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA,Location Point1
0,50143627,,Manhattan,857,9 AVENUE,10019.0,2127772627,,01/01/1900,,...,,40.767057,-73.986343,104.0,6.0,13900.0,1026848.0,1010650000.0,MN15,
1,50132462,PENDING,Queens,8011,NORTHERN BLVD,11372.0,9293284330,,01/01/1900,,...,,40.755386,-73.88702,403.0,25.0,30903.0,4026898.0,4011760000.0,QN28,
2,40652017,PIZZA D'AMORE,Brooklyn,8949,BAY PARKWAY,11214.0,7182664433,Pizza,10/18/2023,Violations were cited in the following area(s).,...,Cycle Inspection / Initial Inspection,40.595221,-74.000771,311.0,43.0,29400.0,3345607.0,3064910000.0,BK28,
3,50139548,,Manhattan,131,1 AVENUE,10003.0,2016866752,,01/01/1900,,...,,40.727497,-73.98537,103.0,2.0,3800.0,1006333.0,1004490000.0,MN22,
4,50056951,NEUBERGER BERMAN CAFE,Manhattan,1290,AVE AMERICAS,,3472666551,American,04/27/2023,Violations were cited in the following area(s).,...,Cycle Inspection / Initial Inspection,0.0,0.0,,,,,1.0,,


## **Drop Columns**

In [5]:
# Drop columns that are irrelevant to the analysis or incomplete.
# `rd` is rebound instead of mutated in place, and errors='ignore' is passed,
# so this cell stays safe to re-run: with an in-place drop a second execution
# raises KeyError because the columns are already gone.
# Trade-off: errors='ignore' also skips misspelled names silently, so verify
# the result with rd.info() afterwards.
columns_to_drop = ['Location Point1', 'PHONE', 'BBL', 'BIN', 'NTA', 'Census Tract', 'Community Board', 'RECORD DATE']
rd = rd.drop(columns=columns_to_drop, errors='ignore')

In [6]:
# Confirm the drop: the removed columns should no longer be listed.
rd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209141 entries, 0 to 209140
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  209141 non-null  int64  
 1   DBA                    208544 non-null  object 
 2   BORO                   209141 non-null  object 
 3   BUILDING               208824 non-null  object 
 4   STREET                 209135 non-null  object 
 5   ZIPCODE                206455 non-null  float64
 6   CUISINE DESCRIPTION    206702 non-null  object 
 7   INSPECTION DATE        209141 non-null  object 
 8   ACTION                 206702 non-null  object 
 9   VIOLATION CODE         205553 non-null  object 
 10  VIOLATION DESCRIPTION  205553 non-null  object 
 11  CRITICAL FLAG          209141 non-null  object 
 12  SCORE                  199209 non-null  float64
 13  GRADE                  102462 non-null  object 
 14  GRADE DATE             93826 non-nul

## **Checking for missing data**

In [7]:
# Summarise missing values per column
def check_missing_data(rd):
    """Summarise the missing values in each column of a DataFrame.

    Parameters
    ----------
    rd : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``rd``, sorted by ascending null count, with
        two columns: ``Total`` (number of nulls) and ``Percent`` (share of
        rows that are null, rounded to two decimals and formatted as a
        string such as ``'1.17%'``).
    """
    # Scan for nulls and sort only once; the percentage is derived from the
    # already-sorted counts, so both columns share one row order by
    # construction instead of relying on two independent sorts lining up.
    total = rd.isnull().sum().sort_values(ascending=True)
    percent = (total / len(rd) * 100).round(2).astype(str) + '%'  # Add the percentage sign
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data_result = check_missing_data(rd)
# Print the whole summary: it has only one row per column, and the columns
# with the most missing data sort to the bottom. The previous `.head` without
# call parentheses printed the bound method's repr instead of a table.
print(missing_data_result)

<bound method NDFrame.head of                         Total Percent
CAMIS                       0    0.0%
BORO                        0    0.0%
INSPECTION DATE             0    0.0%
CRITICAL FLAG               0    0.0%
STREET                      6    0.0%
Longitude                 260   0.12%
Latitude                  260   0.12%
BUILDING                  317   0.15%
DBA                       597   0.29%
CUISINE DESCRIPTION      2439   1.17%
ACTION                   2439   1.17%
INSPECTION TYPE          2439   1.17%
ZIPCODE                  2686   1.28%
Council District         3223   1.54%
VIOLATION DESCRIPTION    3588   1.72%
VIOLATION CODE           3588   1.72%
SCORE                    9932   4.75%
GRADE                  106679  51.01%
GRADE DATE             115315  55.14%>


In [8]:
# Write the trimmed frame as CSV to a file named 'rd' (no extension) in the
# current working directory, leaving out the index column.
rd.to_csv('rd', index=False)