In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Read the crime records; the first CSV column becomes the row index.
CRIMES_CSV = 'TransformGainesville_CrimesALL-Refactored.csv'
df = pd.read_csv(CRIMES_CSV, index_col=0)

# Preview the first rows.
df.head()

Unnamed: 0,ID,CFS,CFS_Type,Classification,offenseDate,offenseHour,offenseDOW,address,latitude,longitude,date,month,day,year,fullDate,DOW,time,moonPhase
0,221009267,Domestic Aggravated Battery,Battery,Person,2021-07-04 21:24:00,21,Sunday,100 BLK NW 39TH AVE,29.688534,-82.326069,,,,,,,,
1,221009608,Domestic Aggravated Battery,Battery,Person,2021-07-11 22:54:00,22,Sunday,4200 BLK SW 21ST PL,29.632687,-82.387148,,,,,,,,
2,221009391,Domestic Aggravated Battery,Battery,Person,2021-07-07 19:12:00,19,Wednesday,1200 BLK SE 19TH TER,29.640249,-82.29939,,,,,,,,
3,221009308,Domestic Aggravated Battery,Battery,Person,2021-07-06 07:26:00,7,Tuesday,1000 BLK SW 62ND BLVD,29.641625,-82.398242,,,,,,,,
4,221011388,Domestic Aggravated Battery,Battery,Person,2021-08-16 17:25:00,17,Monday,100 BLK NW 39TH AVE,29.688534,-82.326069,,,,,,,,


In [5]:
# Rename "offenseDate" to "Date", the column name later used as the join key.
df = df.rename(columns={'offenseDate': 'Date'})
df.head()

Unnamed: 0,ID,CFS,CFS_Type,Classification,Date,offenseHour,offenseDOW,address,latitude,longitude,date,month,day,year,fullDate,DOW,time,moonPhase
0,221009267,Domestic Aggravated Battery,Battery,Person,2021-07-04 21:24:00,21,Sunday,100 BLK NW 39TH AVE,29.688534,-82.326069,,,,,,,,
1,221009608,Domestic Aggravated Battery,Battery,Person,2021-07-11 22:54:00,22,Sunday,4200 BLK SW 21ST PL,29.632687,-82.387148,,,,,,,,
2,221009391,Domestic Aggravated Battery,Battery,Person,2021-07-07 19:12:00,19,Wednesday,1200 BLK SE 19TH TER,29.640249,-82.29939,,,,,,,,
3,221009308,Domestic Aggravated Battery,Battery,Person,2021-07-06 07:26:00,7,Tuesday,1000 BLK SW 62ND BLVD,29.641625,-82.398242,,,,,,,,
4,221011388,Domestic Aggravated Battery,Battery,Person,2021-08-16 17:25:00,17,Monday,100 BLK NW 39TH AVE,29.688534,-82.326069,,,,,,,,


In [6]:
# Parse the offense timestamps. The raw values carry a time of day
# (e.g. "2021-07-04 21:24:00"), so the format has to describe it as well:
# a date-only format does not cover the whole string and is rejected by
# strict format matching ("unconverted data remains").
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S')

In [7]:
# Keep only the calendar date (datetime.date objects); the time of day is discarded.
df = df.assign(Date=pd.to_datetime(df['Date']).dt.date)

In [8]:
# Visual Crossing Timeline API request: daily `datetime` and `moonphase` values
# for Gainesville, FL from 2018-01-01 through 2021-12-31.
# The API key is a credential, so it is read from the VISUAL_CROSSING_API_KEY
# environment variable instead of being stored in the notebook. Any key that was
# previously committed in this cell should be treated as leaked and rotated.
api_key = os.environ.get('VISUAL_CROSSING_API_KEY', '')
if not api_key:
    # Warnings are silenced globally in this notebook, so print to stay visible.
    print('VISUAL_CROSSING_API_KEY is not set; the weather request is expected to fail.')

url = (
    'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/'
    'Gainesville,FL/2018-01-01/2021-12-31'
    f'?unitGroup=us&key={api_key}&include=days&elements=datetime,moonphase'
)


In [9]:
# Fetch the moon-phase data. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (rejected key,
# exceeded quota, ...) here instead of as a confusing JSON decoding error later.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [10]:
# Decode the JSON response body and display it.
# NOTE(review): the name `json` shadows the standard-library `json` module for
# the rest of the notebook. Later cells reference this name, so a rename has to
# be applied to all of them together.
json = r.json()
json

{'queryCost': 1461,
 'latitude': 29.652,
 'longitude': -82.3228,
 'resolvedAddress': 'Gainesville, FL, United States',
 'address': 'Gainesville,FL',
 'timezone': 'America/New_York',
 'tzoffset': -5.0,
 'days': [{'datetime': '2018-01-01', 'moonphase': 0.5},
  {'datetime': '2018-01-02', 'moonphase': 0.5},
  {'datetime': '2018-01-03', 'moonphase': 0.52},
  {'datetime': '2018-01-04', 'moonphase': 0.54},
  {'datetime': '2018-01-05', 'moonphase': 0.58},
  {'datetime': '2018-01-06', 'moonphase': 0.63},
  {'datetime': '2018-01-07', 'moonphase': 0.68},
  {'datetime': '2018-01-08', 'moonphase': 0.73},
  {'datetime': '2018-01-09', 'moonphase': 0.78},
  {'datetime': '2018-01-10', 'moonphase': 0.83},
  {'datetime': '2018-01-11', 'moonphase': 0.88},
  {'datetime': '2018-01-12', 'moonphase': 0.92},
  {'datetime': '2018-01-13', 'moonphase': 0.95},
  {'datetime': '2018-01-14', 'moonphase': 0.98},
  {'datetime': '2018-01-15', 'moonphase': 0.99},
  {'datetime': '2018-01-16', 'moonphase': 1.0},
  {'dateti

In [11]:
# List the top-level keys of the decoded response; the per-day records are under 'days'.
json.keys()

dict_keys(['queryCost', 'latitude', 'longitude', 'resolvedAddress', 'address', 'timezone', 'tzoffset', 'days'])

In [12]:
# Build a frame with one row per day from the list of daily records.
daily_records = json['days']
moonphases_df = pd.DataFrame(data=daily_records)
moonphases_df

Unnamed: 0,datetime,moonphase
0,2018-01-01,0.50
1,2018-01-02,0.50
2,2018-01-03,0.52
3,2018-01-04,0.54
4,2018-01-05,0.58
...,...,...
1456,2021-12-27,0.78
1457,2021-12-28,0.83
1458,2021-12-29,0.88
1459,2021-12-30,0.93


In [13]:
# Show the raw per-day records: a list of dicts with 'datetime' and 'moonphase' keys.
json['days']

[{'datetime': '2018-01-01', 'moonphase': 0.5},
 {'datetime': '2018-01-02', 'moonphase': 0.5},
 {'datetime': '2018-01-03', 'moonphase': 0.52},
 {'datetime': '2018-01-04', 'moonphase': 0.54},
 {'datetime': '2018-01-05', 'moonphase': 0.58},
 {'datetime': '2018-01-06', 'moonphase': 0.63},
 {'datetime': '2018-01-07', 'moonphase': 0.68},
 {'datetime': '2018-01-08', 'moonphase': 0.73},
 {'datetime': '2018-01-09', 'moonphase': 0.78},
 {'datetime': '2018-01-10', 'moonphase': 0.83},
 {'datetime': '2018-01-11', 'moonphase': 0.88},
 {'datetime': '2018-01-12', 'moonphase': 0.92},
 {'datetime': '2018-01-13', 'moonphase': 0.95},
 {'datetime': '2018-01-14', 'moonphase': 0.98},
 {'datetime': '2018-01-15', 'moonphase': 0.99},
 {'datetime': '2018-01-16', 'moonphase': 1.0},
 {'datetime': '2018-01-17', 'moonphase': 0.0},
 {'datetime': '2018-01-18', 'moonphase': 0.01},
 {'datetime': '2018-01-19', 'moonphase': 0.03},
 {'datetime': '2018-01-20', 'moonphase': 0.05},
 {'datetime': '2018-01-21', 'moonphase': 0.0

In [14]:
#json['latitude']
#json['longitude']


In [15]:
# Bin edges that split the 0..1 moon-phase scale into four equal quarters:
# 0.0, 0.25, 0.5, 0.75, 1.0
bins = [quarter / 4 for quarter in range(5)]

In [16]:
# Bucket each day's moon phase into the quarter-cycle intervals defined by `bins`.
# include_lowest=True is required: pd.cut intervals are open on the left by
# default, so days with moonphase == 0.0 (present in the API data) would fall
# outside every bin and silently become NaN.
moonphases_df['moonPhases_cat'] = pd.cut(
    moonphases_df['moonphase'], bins, include_lowest=True
)
moonphases_df.head()


Unnamed: 0,datetime,moonphase,moonPhases_cat
0,2018-01-01,0.5,"(0.25, 0.5]"
1,2018-01-02,0.5,"(0.25, 0.5]"
2,2018-01-03,0.52,"(0.5, 0.75]"
3,2018-01-04,0.54,"(0.5, 0.75]"
4,2018-01-05,0.58,"(0.5, 0.75]"


In [17]:
# Integer label for each quarter of the lunar cycle. On the moonphase scale
# 0 = new moon, 0.25 = first quarter, 0.5 = full moon, 0.75 = last quarter
# (NOTE(review): scale taken from the weather provider's docs - confirm):
#   1 = new moon to first quarter    [0, 0.25]
#   2 = first quarter to full moon   (0.25, 0.5]
#   3 = full moon to last quarter    (0.5, 0.75]
#   4 = last quarter to new moon     (0.75, 1]
labels = [1, 2, 3, 4]

# include_lowest=True keeps moonphase == 0.0 in bin 1; with the default
# left-open first interval those days would be labelled NaN.
moonphases_df['moonPhases'] = pd.cut(
    moonphases_df['moonphase'], bins, labels=labels, include_lowest=True
)
moonphases_df.head()

Unnamed: 0,datetime,moonphase,moonPhases_cat,moonPhases
0,2018-01-01,0.5,"(0.25, 0.5]",2
1,2018-01-02,0.5,"(0.25, 0.5]",2
2,2018-01-03,0.52,"(0.5, 0.75]",3
3,2018-01-04,0.54,"(0.5, 0.75]",3
4,2018-01-05,0.58,"(0.5, 0.75]",3


In [18]:
# Keep only the date key and the phase label. .copy() makes this an independent
# frame, so the rename and column assignments in the following cells act on it
# directly instead of on a slice of moonphases_df (which raises
# SettingWithCopyWarning and may not behave as intended).
new_moonphasesdf = moonphases_df[['datetime', 'moonPhases']].copy()
new_moonphasesdf

Unnamed: 0,datetime,moonPhases
0,2018-01-01,2
1,2018-01-02,2
2,2018-01-03,3
3,2018-01-04,3
4,2018-01-05,3
...,...,...
1456,2021-12-27,4
1457,2021-12-28,4
1458,2021-12-29,4
1459,2021-12-30,4


In [19]:
# Rename "datetime" to "Date" so the key matches the crime frame's column name.
new_moonphasesdf = new_moonphasesdf.rename(columns={'datetime': 'Date'})


In [20]:
# Parse the YYYY-MM-DD date strings into pandas timestamps.
parsed_dates = pd.to_datetime(new_moonphasesdf['Date'], format='%Y-%m-%d')
new_moonphasesdf['Date'] = parsed_dates

In [21]:
# Reduce the timestamps to plain datetime.date objects, the same
# representation used for df['Date'].
date_only = pd.to_datetime(new_moonphasesdf['Date']).dt.date
new_moonphasesdf['Date'] = date_only

In [22]:
# Inspect the moon-phase frame after the rename and date conversion.
new_moonphasesdf

Unnamed: 0,Date,moonPhases
0,2018-01-01,2
1,2018-01-02,2
2,2018-01-03,3
3,2018-01-04,3
4,2018-01-05,3
...,...,...
1456,2021-12-27,4
1457,2021-12-28,4
1458,2021-12-29,4
1459,2021-12-30,4


In [23]:
# List the crime frame's columns; per the output, 'date ' and 'time ' carry a trailing space.
df.columns

Index(['ID', 'CFS', 'CFS_Type', 'Classification', 'Date', 'offenseHour',
       'offenseDOW', 'address', 'latitude', 'longitude', 'date ', 'month',
       'day', 'year', 'fullDate', 'DOW', 'time ', 'moonPhase'],
      dtype='object')

In [24]:
# Preview the crime frame again; 'Date' now holds calendar dates without a time of day.
df.head()

Unnamed: 0,ID,CFS,CFS_Type,Classification,Date,offenseHour,offenseDOW,address,latitude,longitude,date,month,day,year,fullDate,DOW,time,moonPhase
0,221009267,Domestic Aggravated Battery,Battery,Person,2021-07-04,21,Sunday,100 BLK NW 39TH AVE,29.688534,-82.326069,,,,,,,,
1,221009608,Domestic Aggravated Battery,Battery,Person,2021-07-11,22,Sunday,4200 BLK SW 21ST PL,29.632687,-82.387148,,,,,,,,
2,221009391,Domestic Aggravated Battery,Battery,Person,2021-07-07,19,Wednesday,1200 BLK SE 19TH TER,29.640249,-82.29939,,,,,,,,
3,221009308,Domestic Aggravated Battery,Battery,Person,2021-07-06,7,Tuesday,1000 BLK SW 62ND BLVD,29.641625,-82.398242,,,,,,,,
4,221011388,Domestic Aggravated Battery,Battery,Person,2021-08-16,17,Monday,100 BLK NW 39TH AVE,29.688534,-82.326069,,,,,,,,


In [25]:
# Columns that are not needed downstream; latitude and longitude are kept.
# 'date ' and 'time ' include a trailing space, matching the names shown by df.columns.
unused_cols = [
    'CFS', 'offenseHour', 'offenseDOW', 'address',
    'date ', 'month', 'day', 'year',
    'fullDate', 'DOW', 'time ', 'moonPhase',
]
df = df.drop(columns=unused_cols)

df.head(25)


Unnamed: 0,ID,CFS_Type,Classification,Date,latitude,longitude
0,221009267,Battery,Person,2021-07-04,29.688534,-82.326069
1,221009608,Battery,Person,2021-07-11,29.632687,-82.387148
2,221009391,Battery,Person,2021-07-07,29.640249,-82.29939
3,221009308,Battery,Person,2021-07-06,29.641625,-82.398242
4,221011388,Battery,Person,2021-08-16,29.688534,-82.326069
5,221011524,Battery,Person,2021-08-19,29.631246,-82.319771
6,221012057,Battery,Person,2021-08-28,29.704114,-82.372561
7,221012231,Battery,Person,2021-08-31,29.684413,-82.305793
8,221012341,Battery,Person,2021-09-02,29.696642,-82.384909
9,221013249,Battery,Person,2021-09-19,29.616533,-82.367391


In [35]:
# Attach each day's moon-phase label to the crime records by matching on Date.
moon_by_date = new_moonphasesdf.set_index('Date')
new_df = df.join(moon_by_date, on='Date')
new_df.head(50)

Unnamed: 0,ID,CFS_Type,Classification,Date,latitude,longitude,moonPhases
0,221009267,Battery,Person,2021-07-04,29.688534,-82.326069,4.0
1,221009608,Battery,Person,2021-07-11,29.632687,-82.387148,1.0
2,221009391,Battery,Person,2021-07-07,29.640249,-82.29939,4.0
3,221009308,Battery,Person,2021-07-06,29.641625,-82.398242,4.0
4,221011388,Battery,Person,2021-08-16,29.688534,-82.326069,2.0
5,221011524,Battery,Person,2021-08-19,29.631246,-82.319771,2.0
6,221012057,Battery,Person,2021-08-28,29.704114,-82.372561,3.0
7,221012231,Battery,Person,2021-08-31,29.684413,-82.305793,4.0
8,221012341,Battery,Person,2021-09-02,29.696642,-82.384909,4.0
9,221013249,Battery,Person,2021-09-19,29.616533,-82.367391,2.0
