![USD](usd.png)

### University of San Diego 

### Master of Science, Applied Data Science 

#### Contributors

- Ebad Akhter
- Kevin Baum
- Salvador Sanchez

***

In [93]:
#Libraries
import pandas as pd
from datetime import datetime
import numpy as np
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import requests
import PasswordHolder

# Load Data

In [2]:
#function to download yearly data
def Get_Data(year):
    url = f"https://seshat.datasd.org/pd/pd_calls_for_service_{year}_datasd.csv"
    df = pd.read_csv(url)
    return df

In [3]:
#Current year
current_year = datetime.now().year

In [73]:
#Current Data
current_data = Get_Data(current_year)

#Last Year
last_year_data = Get_Data(current_year-1)

#Year - 2
yearM2_data = Get_Data(current_year-2)

current_data.tail()

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority
31875,E23010034178,2023-01-24 23:40:34,3,0,,GENESEE,AVE,,APPLETON,,T,K,114,2
31876,E23010034179,2023-01-24 23:42:49,3,3800,,ORANGE,AVE,,,,459A,CAN,838,2
31877,E23010034180,2023-01-24 23:43:19,3,1800,,UPAS,ST,,,,FU,K,531,2
31878,E23010034182,2023-01-24 23:44:36,3,1000,,B,ST,,,,1171,CAN,524,1
31879,E23010034185,2023-01-24 23:45:15,3,0,,COLLEGE,AVE,,MONTEZUMA,,T,K,327,2


# EDA

In [5]:
current_data.describe()

Unnamed: 0,day_of_week,address_number_primary,address_dir_intersecting,address_sfx_intersecting,beat,priority
count,30350.0,30350.0,0.0,0.0,30350.0,30350.0
mean,3.892389,3381.046787,,,492.391993,2.103295
std,1.984939,3384.459204,,,238.326237,1.213443
min,1.0,0.0,,,-1.0,0.0
25%,2.0,600.0,,,313.0,1.0
50%,4.0,2800.0,,,521.0,2.0
75%,6.0,4800.0,,,627.0,3.0
max,7.0,53500.0,,,937.0,9.0


In [6]:
last_year_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499256 entries, 0 to 499255
Data columns (total 14 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   incident_num               499256 non-null  object 
 1   date_time                  499256 non-null  object 
 2   day_of_week                499256 non-null  int64  
 3   address_number_primary     499256 non-null  int64  
 4   address_dir_primary        23135 non-null   object 
 5   address_road_primary       499187 non-null  object 
 6   address_sfx_primary        455019 non-null  object 
 7   address_dir_intersecting   0 non-null       float64
 8   address_road_intersecting  85382 non-null   object 
 9   address_sfx_intersecting   0 non-null       float64
 10  call_type                  498640 non-null  object 
 11  disposition                497161 non-null  object 
 12  beat                       499256 non-null  int64  
 13  priority                   49

In [70]:
#Expore Data
def Explore_Date(df):
    #Clean Data
    df = df.dropna(how='all', axis=1)
    
    #Initial table
    freqDF = pd.DataFrame(columns=['Feature',
                                   'Mode',
                                   'Mode Freq.',
                                   'Mode %',
                                   '2nd Mode',
                                   '2nd Mode Freq.',
                                   '2nd Mode %'])
    for col in df.columns:
        freq = df[col].value_counts()
        freqdf = freq.to_frame()
        fRow = freqdf.iloc[0]
        secRow = freqdf.iloc[1]
        fPrct = fRow[0] / len(df[col])
        secPrct = secRow[0] / len(df[col])
        try:
            mode1 = int(fRow.name)
        except:
            mode1 = fRow.name
        try:
            mode2 = int(secRow.name)
        except:
            mode2 = secRow.name
        data = {'Feature':col,
                'Mode':mode1,
                'Mode Freq.':fRow[0],
                'Mode %':fPrct,\
                '2nd Mode':mode2,
                '2nd Mode Freq.':secRow[0],
                '2nd Mode %':secPrct}
        freqDF.loc[len(freqDF)] = data

    freqDF = freqDF.set_index('Feature')

    #Nulls, Counts, Cardinality
    NUllFeatures = round(df.isnull().sum() / df.shape[0],4)\
          .sort_values(ascending=False)
    Count = df.count()
    uni = df.nunique()

    #Formating
    NUllFeatures.to_frame(name="% Miss.")
    Count.to_frame(name="Count")
    uni.to_frame()
    result = pd.concat([Count, NUllFeatures,uni], axis=1)
    result.columns =["Count","% Miss.","Card."]
    result = pd.concat([result, freqDF], axis=1)
    result = result.style.format({'% Miss.': "{:.1%}",
                         'Mode %': "{:.0%}",
                         '2nd Mode %': "{:.0%}",
                         'Count': "{:,}",
                         'Card.': "{:,}",
                         'Mode Freq.': "{:,}",
                        '2nd Mode Freq.': "{:,}"})
    return result

In [75]:
Explore_Date(current_data)

Unnamed: 0,Count,% Miss.,Card.,Mode,Mode Freq.,Mode %,2nd Mode,2nd Mode Freq.,2nd Mode %
incident_num,31880,0.0%,31880,E23010000001,1,0%,E23010022833,1,0%
date_time,31880,0.0%,31621,2023-01-20 11:21:40,3,0%,2023-01-12 11:13:25,3,0%
day_of_week,31880,0.0%,7,3,5598,18%,2,4936,15%
address_number_primary,31880,0.0%,200,0,5615,18%,1400,738,2%
address_dir_primary,1587,95.0%,8,S,532,2%,W,496,2%
address_road_primary,31876,0.0%,3559,IMPERIAL,703,2%,05TH,471,1%
address_sfx_primary,29111,8.7%,26,ST,9408,30%,AVE,7445,23%
address_road_intersecting,5208,83.7%,1277,MARKET,125,0%,IMPERIAL,122,0%
call_type,31857,0.1%,211,415,2427,8%,SELENF,1983,6%
disposition,31769,0.4%,14,K,16722,52%,O,4601,14%


In [72]:
Explore_Date(last_year_data)

Unnamed: 0,Count,% Miss.,Card.,Mode,Mode Freq.,Mode %,2nd Mode,2nd Mode Freq.,2nd Mode %
incident_num,499256,0.0%,499256,E22010000001,1,0%,E22080042092,1,0%
date_time,499256,0.0%,495214,2022-10-25 10:09:39,3,0%,2022-09-21 13:23:37,3,0%
day_of_week,499256,0.0%,7,5,76734,15%,6,74910,15%
address_number_primary,499256,0.0%,233,0,92027,18%,1400,11347,2%
address_dir_primary,23135,95.4%,8,W,8001,2%,S,7227,1%
address_road_primary,499187,0.0%,10076,IMPERIAL,8701,2%,05TH,8389,2%
address_sfx_primary,455019,8.9%,40,ST,146772,29%,AVE,116831,23%
address_road_intersecting,85382,82.9%,4556,MARKET,2263,0%,UNIVERSITY,2213,0%
call_type,498640,0.1%,238,415,42668,9%,CW,24684,5%
disposition,497161,0.4%,19,K,259055,52%,O,71580,14%


In [74]:
Explore_Date(yearM2_data)

Unnamed: 0,Count,% Miss.,Card.,Mode,Mode Freq.,Mode %,2nd Mode,2nd Mode Freq.,2nd Mode %
incident_num,568947,0.0%,568947,E21010000001,1,0%,E21080040927,1,0%
date_time,568947,0.0%,563769,2021-09-11 17:49:41,3,0%,2021-12-12 16:39:26,3,0%
day_of_week,568947,0.0%,7,6,84636,15%,5,83508,15%
address_number_primary,568947,0.0%,237,0,102341,18%,4000,10980,2%
address_dir_primary,27114,95.2%,9,W,10033,2%,S,8669,2%
address_road_primary,568832,0.0%,10353,IMPERIAL,10651,2%,05TH,9082,2%
address_sfx_primary,521939,8.3%,40,ST,171706,30%,AVE,135702,24%
address_road_intersecting,95028,83.3%,4750,MARKET,2686,0%,UNIVERSITY,2500,0%
call_type,568510,0.1%,243,415,51777,9%,SELENF,37704,7%
disposition,567355,0.3%,18,K,296902,52%,O,86013,15%


# Data Prep

In [9]:
def AddressNumberStr(df):
    df['address_number_primary_str'] = df['address_number_primary'].astype(str)
    df.address_number_primary_str.replace('0', np.nan, inplace=True)
    return df

In [22]:
def AddressField(df,City,State):
    df['Address'] = df[['address_number_primary_str',
                        'address_dir_primary',
                        'address_road_primary',
                        'address_sfx_primary']].apply(lambda x: ' '.join(x.dropna()), axis=1)
    df['Address'] = df['Address'] + ' ' + City +', ' + State
    return df

In [27]:
Address_Data = AddressNumberStr(current_data)
Address_Data = AddressField(current_data,'San Diego','California')

In [30]:
Address_Data.tail()

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority,address_number_primary_str,Address
30345,E23010032562,2023-01-23 23:35:53,2,0,,HOITT,ST,,ISLAND,,T,K,515,2,,"HOITT ST San Diego, California"
30346,E23010032563,2023-01-23 23:38:24,2,400,,HOTEL CIRCLE SOUTH,,,,,5150,CAN,623,2,400.0,"400 HOTEL CIRCLE SOUTH San Diego, California"
30347,E23010032565,2023-01-23 23:39:02,2,600,,32ND,ST,,,,T,K,516,2,600.0,"600 32ND ST San Diego, California"
30348,E23010032567,2023-01-23 23:44:30,2,13100,,SEA KNOLL,CT,,,,586,O,934,4,13100.0,"13100 SEA KNOLL CT San Diego, California"
30349,E23010032570,2023-01-23 23:49:32,2,0,,E,ST,,16TH,,1131,K,521,1,,"E ST San Diego, California"


# Geo Location Data Google Lat and Lonf

In [111]:
def extract_lat_long_via_address(address):
    lat, lng, zipcode  = None, None, None
    api_key = GOOGLE_API_KEY
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    endpoint = f"{base_url}?address={address}&key={api_key}"
    r = requests.get(endpoint)
    if r.status_code not in range(200, 299):
        #error
        return None, None
    try:
        #found
        results = r.json()['results'][0]
        lat = results['geometry']['location']['lat']
        lng = results['geometry']['location']['lng']
        zipcode = results['address_components'][-1]['long_name']
        
    except:
        pass
    return lat, lng, zipcode

def enrich_with_geocoding_api(row):
    column_name = 'Address'
    address_value = row[column_name]
    address_lat, address_lng, address_zip = extract_lat_long_via_address(address_value)
    row['lat'] = address_lat
    row['lng'] = address_lng
    row['zipcode'] = address_zip
    
    return row

In [118]:
Last2000 = Address_Data.tail(2000)
Last10 = Address_Data.tail(10)

In [134]:
Last10_Geo = Last10.apply(enrich_with_geocoding_api, axis=1)

In [136]:
Last10_Geo

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority,address_number_primary_str,Address,lat,lng,zipcode
30340,E23010032551,2023-01-23 23:19:57,2,5900,,UNIVERSITY,AVE,,,,T,K,822,2,5900.0,"5900 UNIVERSITY AVE San Diego, California",32.750251,-117.070386,92115
30341,E23010032552,2023-01-23 23:20:53,2,1900,,EL CAJON,BLV,,,,911,CAN,624,1,1900.0,"1900 EL CAJON BLV San Diego, California",32.755362,-117.144998,1005
30342,E23010032558,2023-01-23 23:29:24,2,0,,GARNET,AVE,,INGRAHAM,,1150,K,122,2,,"GARNET AVE San Diego, California",32.801114,-117.234831,United States
30343,E23010032560,2023-01-23 23:32:57,2,8800,,NAVAJO,RD,,,,911,CAN,324,1,8800.0,"8800 NAVAJO RD San Diego, California",32.802984,-117.008199,92119
30344,E23010032561,2023-01-23 23:35:23,2,13200,,LUCKETT,CT,,,,1153,K,934,2,13200.0,"13200 LUCKETT CT San Diego, California",32.959768,-117.206969,3225
30345,E23010032562,2023-01-23 23:35:53,2,0,,HOITT,ST,,ISLAND,,T,K,515,2,,"HOITT ST San Diego, California",32.70956,-117.132944,92102
30346,E23010032563,2023-01-23 23:38:24,2,400,,HOTEL CIRCLE SOUTH,,,,,5150,CAN,623,2,400.0,"400 HOTEL CIRCLE SOUTH San Diego, California",32.759391,-117.177008,92108
30347,E23010032565,2023-01-23 23:39:02,2,600,,32ND,ST,,,,T,K,516,2,600.0,"600 32ND ST San Diego, California",32.711728,-117.125265,3302
30348,E23010032567,2023-01-23 23:44:30,2,13100,,SEA KNOLL,CT,,,,586,O,934,4,13100.0,"13100 SEA KNOLL CT San Diego, California",32.957585,-117.205701,3201
30349,E23010032570,2023-01-23 23:49:32,2,0,,E,ST,,16TH,,1131,K,521,1,,"E ST San Diego, California",32.714783,-117.139686,United States


In [98]:
Last10.to_csv('Last10.csv')

In [119]:
Last2000_Geo = Last2000.apply(enrich_with_geocoding_api, axis=1)

In [121]:
Last2000_Geo.to_csv('Last2000_Geo.csv')

In [122]:
Last2000_Geo

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority,address_number_primary_str,Address,lat,lng,zipcode
28350,E23010030360,2023-01-22 11:16:00,1,4200,,ETA,ST,,,,459A,U,443,3,4200,"4200 ETA ST San Diego, California",32.686686,-117.103788,4329
28351,E23010030361,2023-01-22 11:17:20,1,6400,,LAKE KATHLEEN,AVE,,,,11-7,K,324,1,6400,"6400 LAKE KATHLEEN AVE San Diego, California",32.798486,-117.023010,3133
28352,E23010030362,2023-01-22 11:18:59,1,0,,CAPE MAY,AVE,,ABBOTT,,586,O,614,4,,"CAPE MAY AVE San Diego, California",32.747257,-117.245409,92107
28353,E23010030363,2023-01-22 11:19:20,1,3500,,EUCLID,AVE,,,,602,K,833,2,3500,"3500 EUCLID AVE San Diego, California",32.742169,-117.092341,2926
28354,E23010030364,2023-01-22 11:23:00,1,1000,,FIESTA ISLAND,RD,,,,1151,O,123,2,1000,"1000 FIESTA ISLAND RD San Diego, California",32.780353,-117.221882,8402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30345,E23010032562,2023-01-23 23:35:53,2,0,,HOITT,ST,,ISLAND,,T,K,515,2,,"HOITT ST San Diego, California",32.709560,-117.132944,92102
30346,E23010032563,2023-01-23 23:38:24,2,400,,HOTEL CIRCLE SOUTH,,,,,5150,CAN,623,2,400,"400 HOTEL CIRCLE SOUTH San Diego, California",32.759391,-117.177008,92108
30347,E23010032565,2023-01-23 23:39:02,2,600,,32ND,ST,,,,T,K,516,2,600,"600 32ND ST San Diego, California",32.711728,-117.125265,3302
30348,E23010032567,2023-01-23 23:44:30,2,13100,,SEA KNOLL,CT,,,,586,O,934,4,13100,"13100 SEA KNOLL CT San Diego, California",32.957585,-117.205701,3201


# Geo Location Data Zipcode

In [128]:
locator = Nominatim(user_agent='myGeocoder')

geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
df['location_geo'] = df['Address'].apply(geocode)
df['point'] = df['location_geo'].apply(lambda loc: tuple(loc.point) if loc else (0, 0, 0))
df[['latitude', 'longitude', 'altitude']] = pd.DataFrame(df['point'].tolist(),index=df.index)                                                         

In [152]:
def get_zipcode(df, geolocator, lat_field, lon_field):
    try:
        location = geolocator.reverse((df[lat_field], df[lon_field]))
        result = location.raw['address']['postcode']
    except:
        result = None
    return result


In [124]:
df = Last10.copy()

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority,address_number_primary_str,Address
30340,E23010032551,2023-01-23 23:19:57,2,5900,,UNIVERSITY,AVE,,,,T,K,822,2,5900.0,"5900 UNIVERSITY AVE San Diego, California"
30341,E23010032552,2023-01-23 23:20:53,2,1900,,EL CAJON,BLV,,,,911,CAN,624,1,1900.0,"1900 EL CAJON BLV San Diego, California"
30342,E23010032558,2023-01-23 23:29:24,2,0,,GARNET,AVE,,INGRAHAM,,1150,K,122,2,,"GARNET AVE San Diego, California"
30343,E23010032560,2023-01-23 23:32:57,2,8800,,NAVAJO,RD,,,,911,CAN,324,1,8800.0,"8800 NAVAJO RD San Diego, California"
30344,E23010032561,2023-01-23 23:35:23,2,13200,,LUCKETT,CT,,,,1153,K,934,2,13200.0,"13200 LUCKETT CT San Diego, California"
30345,E23010032562,2023-01-23 23:35:53,2,0,,HOITT,ST,,ISLAND,,T,K,515,2,,"HOITT ST San Diego, California"
30346,E23010032563,2023-01-23 23:38:24,2,400,,HOTEL CIRCLE SOUTH,,,,,5150,CAN,623,2,400.0,"400 HOTEL CIRCLE SOUTH San Diego, California"
30347,E23010032565,2023-01-23 23:39:02,2,600,,32ND,ST,,,,T,K,516,2,600.0,"600 32ND ST San Diego, California"
30348,E23010032567,2023-01-23 23:44:30,2,13100,,SEA KNOLL,CT,,,,586,O,934,4,13100.0,"13100 SEA KNOLL CT San Diego, California"
30349,E23010032570,2023-01-23 23:49:32,2,0,,E,ST,,16TH,,1131,K,521,1,,"E ST San Diego, California"


In [154]:
zipcodes = Last10_Geo.apply(get_zipcode,
                            axis=1,
                            geolocator=locator,
                            lat_field='lat',
                            lon_field='lng'
                           )

In [155]:
zipcodes

30340    92115
30341    92104
30342    92109
30343    92119
30344    92130
30345    92102
30346    92103
30347    92102
30348    92130
30349    92102
dtype: object

In [None]:
Last2000_Geo['Zipcodes'] = Last2000_Geo.apply(get_zipcode,
                            axis=1,
                            geolocator=locator,
                            lat_field='lat',
                            lon_field='lng'
                           )

In [None]:
Last2000_Geo.head()