# Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

In [None]:
sns.set_style("darkgrid")

font = {'size'   : 12}
mpl.rc('font', **font)

In [None]:

from geopy.geocoders import Nominatim

In [None]:
pip install folium

In [None]:
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster
from folium import plugins


In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

def cross_validate(model, n_splits = 10, X = None, y = None):
    """Summarise the k-fold accuracy of `model` as three percentiles.

    The model is refitted on the training part of every fold and scored on
    the held-out part with its own ``score`` method.

    Parameters
    ----------
    model : estimator whose ``fit(X, y)`` returns an object with ``score(X, y)``.
    n_splits : number of folds handed to ``KFold`` (default settings otherwise).
    X, y : optional feature matrix and label array, both indexable by the
        integer index arrays produced by ``KFold.split``. When omitted, the
        notebook-level ``X`` and ``y`` are used, which is what this function
        always did before these parameters existed.

    Returns
    -------
    numpy.ndarray holding the 40th, 50th and 60th percentile of the fold scores.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    features = globals()['X'] if X is None else X
    labels = globals()['y'] if y is None else y

    k_fold = KFold(n_splits = n_splits)
    fold_scores = [
        model.fit(features[train], labels[train]).score(features[test], labels[test])
        for train, test in k_fold.split(features)
    ]

    return np.percentile(fold_scores, [40, 50, 60])

# Processing the Chicago crime data set

Keep only the required columns when loading the data


In [None]:

# Columns to load from the raw export. The names are spelled exactly as
# requested from the file (double space, leading space); the cleaning cells
# further down normalise them.
wanted_columns = [
    'CASE#',
    'DATE  OF OCCURRENCE',
    'BLOCK',
    ' PRIMARY DESCRIPTION',
    'WARD',
    'LATITUDE',
    'LONGITUDE',
]

df = pd.read_csv('crimes.csv', usecols=wanted_columns)
df.head()


# Data Cleaning and processing

In [None]:
df.columns=df.columns.str.strip()

In [None]:
df.head()

In [None]:
# Collapse every run of two or more whitespace characters in the column names
# into a single space. regex=True must be explicit: pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the double space
# in the occurrence-date column name untouched. The raw string avoids the
# invalid "\s" escape warning.
df.columns = df.columns.str.replace(r'\s{2,}', ' ', regex=True)

In [None]:
df.columns = df.columns.str.replace('#', '')
df.columns = df.columns.str.replace(' ', '_')

In [None]:
df.columns = df.columns.str.lower()

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
df['date_of_occurrence'] =  pd.to_datetime(df['date_of_occurrence'], format='%m/%d/%Y %I:%M:%S %p')

In [None]:
df.head()

In [None]:
# Derive calendar features from the parsed occurrence timestamp.
occurred = df['date_of_occurrence']
calendar_features = {
    'hour': occurred.dt.hour,
    'day_name': occurred.dt.day_name(),
    'day': occurred.dt.dayofweek + 1,       # shift dayofweek up by one
    'month_name': occurred.dt.month_name(),
    'month': occurred.dt.month,
    'year': occurred.dt.year,
    'year_month': occurred.dt.to_period('M'),
}
for feature_name, feature_values in calendar_features.items():
    df[feature_name] = feature_values

In [None]:
df.head()

In [None]:
df['zip']=df.block.str.split(' ').str[0]


In [None]:
# Street = every space-separated token of `block` after the first, joined
# with ', '. Series.str.join is used instead of .apply(', '.join) so that a
# missing block stays NaN rather than raising TypeError before the NaN rows
# have been dropped.
df['street'] = df.block.str.split(' ').str[1:].str.join(', ')
df.head()

In [None]:
df.dropna(inplace=True)

In [None]:
# Renumber the rows 0..n-1 so the index has no gaps after rows were dropped.
# The former bare df.reindex() returned an unchanged copy that was discarded,
# so it had no effect.
df.reset_index(drop=True, inplace=True)
df.head()

# Data Visualization of the Crime Data Set

###    Number of crimes per month

In [None]:
# Bar chart of the number of cases in each year-month period.
cases_by_month = df.groupby('year_month').count()
ax = cases_by_month.plot(y='case', kind='bar', figsize=(10,6), width=0.85, colormap='tab20')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Cases')
ax.set_title('Count of cases per month', loc='left', fontsize=18)



### Number of Crimes per day

In [None]:
# Bar chart of the number of cases for each value of the `day` column.
cases_by_day = df.groupby('day').count()
ax = cases_by_day.plot(y='case', kind='bar', width=0.85, figsize=(10,6), colormap='tab20')
ax.set_xlabel('Day')
ax.set_ylabel('Number of Cases')
ax.set_title('Count of cases per Day', loc='left', fontsize=18)

### Number of cases per hour

In [None]:
# Bar chart of the number of cases for each hour of the day.
cases_by_hour = df.groupby('hour').count()
ax = cases_by_hour.plot(y='case', kind='bar', width=0.85, figsize=(10,6), colormap='tab20')
ax.set_xlabel('Hour')
ax.set_ylabel('Number of Cases')
ax.set_title('Count of cases per Hour', loc='left', fontsize=18)

In [None]:
df.columns

In [None]:
df.primary_description

In [None]:
df.primary_description.nunique()

In [None]:
df[['primary_description','case']].groupby('primary_description',as_index=False).count().sort_values('case',ascending=False).head(3)

In [None]:
# Names of the ten crime types with the most cases, most frequent first.
cases_per_crime = df[['primary_description', 'case']].groupby(['primary_description']).count()
ranked_crimes = cases_per_crime.sort_values('case', ascending=False)
top_crimes = ranked_crimes[:10].index.tolist()

In [None]:
# Rank the crime types by case count once, then slice the ranking, instead of
# repeating the same groupby / count / sort pipeline for every list.
crime_ranking = (
    df[['primary_description', 'case']]
    .groupby(['primary_description'])
    .count()
    .sort_values('case', ascending=False)
    .index.tolist()
)
top_three_crimes = crime_ranking[:3]
top_two_crimes = crime_ranking[:2]


In [None]:
df_top_crimes = df[df['primary_description'].isin(top_crimes)].copy()

df_top3_crimes = df[df['primary_description'].isin(top_three_crimes)].copy()

In [None]:
# Stacked area chart of case counts per year-month for the three most
# frequent crime types.
df_top3_crimes[['case', 'primary_description', 'year_month']].pivot_table(
    index='year_month',
    columns='primary_description',
    aggfunc='count').plot(kind='area',
                          stacked=True,
                          figsize=(15, 6),
                          fontsize=12,
                          colormap='tab20')

# Label the axes and title so the chart is self-explanatory.
plt.xlabel('Month')
plt.ylabel('Count of Cases per Month')
plt.title('Count of Cases Per Month', loc='left', fontsize=18)

In [None]:
# Stacked area chart of case counts per hour of day for the three most
# frequent crime types; hour/crime combinations with no cases count as 0.
df_top3_crimes[['case', 'primary_description', 'hour']].pivot_table(
    index='hour',
    columns='primary_description',
    fill_value=0,
    aggfunc='count').plot(kind='area',
                          stacked=True,
                          figsize=(15, 6),
                          fontsize=12,
                          colormap='tab20')

plt.xlabel('Hour of Day')
plt.ylabel('Count of Cases per Hour')
# Title previously ended with a stray ']' character.
plt.title('Count of Cases Per Hour', loc='left', fontsize=18)

# Visualizing Crimes on a Map

In [None]:
# Colours for top ten crimes
colors = [
    'red',
    'blue',
    'gray',
    'orange',
    'beige',
    'green',
    'purple',
    'pink',
    'cadetblue',
    'black'
]

In [None]:
dict_colours = dict(zip(top_crimes, colors))

In [None]:
df_top_crimes['colour'] = df_top_crimes.primary_description.map(dict_colours)

In [None]:
df_top_crimes

In [None]:
df_top_crimes_may = df_top_crimes[df_top_crimes.month_name == 'May']
df_top_crimes_may.to_pickle('crimes_may.pkl')


In [None]:
chicago_latitude = 41.85  
chicago_longitude = -87.75

chicago_map = folium.Map(location=[chicago_latitude, chicago_longitude], zoom_start=11)

chicago_map

In [None]:
# Feature group that collects one circle marker per incident.
incidents = folium.map.FeatureGroup()

# Loop through every row of df_top_crimes_may and add a circle marker,
# coloured by the row's `colour` value, to the incidents feature group.
for lat, lng, col in zip(df_top_crimes_may.latitude, 
                         df_top_crimes_may.longitude, 
                         df_top_crimes_may.colour):
    incidents.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=1, # marker size
            color=col,
            fill=True,
            fill_color=col,
            fill_opacity=0.6
        )
    )

# Add the incidents to the map; as the last expression of the cell, the
# returned value is what the notebook displays.
chicago_map.add_child(incidents)

In [None]:
# Map centred on Chicago that will hold the clustered markers.
chicago_cluster = folium.Map(location=[chicago_latitude, chicago_longitude], zoom_start=11)

mc = MarkerCluster()

# One marker per row of df_top_crimes_may; each popup shows the crime's
# primary description.
for row in df_top_crimes_may.itertuples():
    mc.add_child(folium.Marker(
        location=[row.latitude, row.longitude],
        popup=row.primary_description))

# Attach the cluster and display the map. The earlier mid-cell
# `chicago_cluster` expression was removed because only the last expression
# of a cell is displayed.
chicago_cluster.add_child(mc)
chicago_cluster

In [None]:
from folium import plugins
from folium.plugins import HeatMap

chicago_heatmat = folium.Map(location=[chicago_latitude, chicago_longitude], zoom_start=11)

# [latitude, longitude] pairs for every row of df_top_crimes_may, extracted in
# one vectorised step instead of a row-by-row iterrows() loop.
heat_data = df_top_crimes_may[['latitude', 'longitude']].values.tolist()

# Plot the points on the map. `max_val` is no longer passed: recent folium
# releases compute the maximum intensity automatically and warn when it is
# given, and 1.0 was the old default — confirm against the installed version.
HeatMap(heat_data,
        min_opacity=0.5,
        max_zoom=18,
        radius=15,
        blur=20,
        gradient=None,
        overlay=True).add_to(chicago_heatmat)

# Display the map
chicago_heatmat

In [None]:
# Feature frame: coordinates plus one-hot encodings of hour, weekday name and
# month name, all aligned on the index of df_top_crimes.
df_features = df_top_crimes[['latitude', 'longitude']]
one_hot_blocks = (
    pd.get_dummies(df_top_crimes.hour, prefix='hour'),
    pd.get_dummies(df_top_crimes.day_name),
    pd.get_dummies(df_top_crimes.month_name),
)
for one_hot in one_hot_blocks:
    df_features = df_features.join(one_hot)



In [None]:
# Add the ward as a feature and the crime type as the `crimes` label column,
# each taken as a single column (Series) of df_top_crimes.
df_features['ward'] = df_top_crimes['ward']
df_features['crimes'] = df_top_crimes['primary_description']

In [None]:
df_features.head()

In [None]:
df_features_3 = df_features[df_features['crimes'].isin(top_three_crimes)].copy()
df_features_2 = df_features[df_features['crimes'].isin(top_two_crimes)].copy()

In [None]:
def _scaled_features_and_labels(frame):
    """Split `frame` into standardised features and its `crimes` labels.

    Returns an ``(X, y)`` tuple: ``X`` is every column except `crimes` after
    ``StandardScaler`` standardisation, ``y`` is the `crimes` column as an
    array. `frame` itself is left unmodified.
    """
    labels = frame.crimes.values
    features = frame.drop('crimes', axis=1)
    return preprocessing.StandardScaler().fit_transform(features), labels


# NOTE(review): each scaler is fitted on all rows of its frame, i.e. before
# any cross-validation split is made.
X_10, y_10 = _scaled_features_and_labels(df_features)
X_3, y_3 = _scaled_features_and_labels(df_features_3)
X_2, y_2 = _scaled_features_and_labels(df_features_2)

In [None]:
from datetime import datetime

# cross_validate reads the notebook-level X and y, so point them at the
# ten-crime data set before the sweep.
X = X_10
y = y_10

# Cross-validate a random forest for each tree count in `est` and keep the
# (40th, 50th, 60th) percentile accuracy per setting.
est = range(12, 17)
scores = np.zeros((len(est), 3))
for idx, a in enumerate(est):
    # Timestamped progress line for each setting.
    print('Estimator: ', a, ' ', str(datetime.now()))
    model = RandomForestClassifier(n_estimators = a, max_features = 'sqrt')
    scores[idx, : ] = cross_validate(model, n_splits = 10)

plt.plot(est, scores[ : , 1], 'b')
plt.fill_between(est, scores[ : , 0], scores[:, 2], alpha = 0.1)
plt.legend(('Median', '(40, 60) percentile'))
plt.ylabel('Accuracy')
# The x axis is the number of trees in the forest, not a neighbour count.
plt.xlabel('Number of Estimators')
plt.tight_layout()
plt.show()

Estimator:  12   2020-05-01 17:02:33.516826
Estimator:  13   2020-05-01 17:04:47.655924
Estimator:  14   2020-05-01 17:07:09.145047
Estimator:  15   2020-05-01 17:09:43.529495


In [None]:
import requests

# Fetch the Foursquare "Top Picks" explore page for Chicago. The timeout
# keeps the cell from hanging forever on a stalled connection.
page = requests.get("https://foursquare.com/explore?mode=url&near=Chicago%2C%20IL%2C%20United%20States&nearGeoId=72057594042815334&q=Top%20Picks",
                    timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# just produce an empty venue list.
page.raise_for_status()

from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
# Every <div class="venueDetails"> element on the page.
top_venues = soup.find_all('div', class_='venueDetails')

In [None]:
venue_columns = ['id',
                 'score',
                 'category',
                 'name',
                 'address',
                 'postalcode',
                 'city',
                 'href',
                 'latitude',
                 'longitude']

# Collect one dict per venue and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
venue_rows = []

for venue in top_venues:
    venue_name = venue.find(target="_blank").get_text()
    # A venue without a "venueScore positive" element gets no score instead
    # of raising AttributeError.
    score_tag = venue.find(class_="venueScore positive")
    venue_score = score_tag.get_text() if score_tag is not None else None
    venue_cat = venue.find(class_="categoryName").get_text()
    venue_href = venue.find(class_="venueName").h2.a['href']
    venue_id = venue_href.split('/')[-1]

    # Skip entries whose link carries a promotedTipId instead of a venue id.
    if 'promotedTipId' in venue_id:
        continue

    # Get the properly formatted address and the latitude and longitude.
    # NOTE(review): client_id, client_secret and version are not defined in
    # this part of the notebook; they must be set before this cell runs.
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(
        venue_id,
        client_id,
        client_secret,
        version)

    result = requests.get(url, timeout=30).json()

    print(result)
    location = result['response']['venue']['location']

    venue_rows.append({'id': venue_id,
                       'score': venue_score,
                       'category': venue_cat,
                       'name': venue_name,
                       # .get(): a venue lacking one of these fields gets
                       # None rather than a KeyError that aborts the loop.
                       'address': location.get('address'),
                       'postalcode': location.get('postalCode'),
                       'city': location.get('city'),
                       'href': venue_href,
                       'latitude': location['lat'],
                       'longitude': location['lng']})

df_top_venues = pd.DataFrame(venue_rows, columns=venue_columns)

In [None]:
df_top_venues=pd.read_pickle('top_venues.pkl')
df_top_venues.head()

In [None]:
df_top_venues.shape

In [None]:
df_top_venues['score'] = pd.to_numeric(df_top_venues['score'], errors='coerce').fillna(0)


In [None]:
df_top_venues.head()

In [None]:
restaurants_columns = ['id',
                       'score', 
                       'category', 
                       'categoryID', 
                       'name', 
                       'address',
                       'postalcode',
                       'city',
                       'latitude',
                       'longitude', 
                       'venue_name', 
                       'venue_latitude',
                       'venue_longitude']

df_restaurant = pd.DataFrame(columns=restaurants_columns)

In [None]:
top_venue_lats = df_top_venues['latitude'].values
top_venue_lngs = df_top_venues['longitude'].values

top_venue_names = df_top_venues['name'].values

In [None]:
# Search settings shared by every request.
# NOTE(review): the category id is presumably Foursquare's food/restaurant
# category — confirm.
categoryId = '4d4b7105d754a06374d81259'
radius = 500
limit = 15
search_url_template = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&categoryId={}&radius={}&limit={}'

# Query the venue search endpoint around each top venue and print the raw
# JSON response.
# NOTE(review): the responses are only printed, never stored in df_restaurant.
for ven_name, ven_lat, ven_long in zip(top_venue_names, top_venue_lats, top_venue_lngs):
    url = search_url_template.format(
        client_id,
        client_secret,
        ven_lat,
        ven_long,
        version,
        categoryId,
        radius,
        limit)

    results = requests.get(url).json()
    print(results)


In [None]:
df_restaurant = pd.read_pickle('restaurants.pkl')

In [None]:
df_restaurant.head()

In [None]:
df_restaurant.venue_name.nunique()

In [None]:
df_restaurant.groupby('category')['name'].count().sort_values(ascending=False)[:10]

In [None]:
# The ten (id, address, venue_latitude, venue_longitude, postalcode) groups
# with the highest count of non-null scores, as a list of tuples.
restaurant_keys = ['id', 'address', 'venue_latitude', 'venue_longitude', 'postalcode']
score_counts = df_restaurant[['id', 'score', 'address', 'venue_latitude', 'venue_longitude', 'postalcode']].groupby(restaurant_keys).count()
top_restaurants = score_counts.sort_values('score', ascending=False)[:10].index.tolist()