In [1]:
# Mount Google Drive into the Colab filesystem; the CSV paths used by the
# cells below all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import glob
import pandas as pd
# Work from the folder holding the source CSVs: the relative glob pattern used
# to collect the files is resolved against the current working directory.
os.chdir("/content/drive/MyDrive/csv")

In [3]:
# Collect every CSV in the current working directory.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them:
# this keeps the row order of the combined frame (and therefore the row labels
# shown in later outputs) stable from one run to the next.
extension = 'csv'
all_filenames = sorted(glob.glob('*.{}'.format(extension)))

In [4]:
# Combine all files in the list into a single frame.
if not all_filenames:
    # pd.concat on an empty list fails with a vague "No objects to concatenate";
    # say where we looked instead.
    raise ValueError("No CSV files found to combine in {}".format(os.getcwd()))
# ignore_index gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own row labels.
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames], ignore_index=True)
# Export to csv; create the output folder first so the write does not fail
# when the 'final' directory does not exist yet.
os.makedirs("/content/drive/MyDrive/csv/final", exist_ok=True)
combined_csv.to_csv("/content/drive/MyDrive/csv/final/combined_csv.csv", index=False)

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Reload the combined file from disk, print its (rows, columns) shape and
# show the first rows.
combined_csv = pd.read_csv('/content/drive/MyDrive/csv/final/combined_csv.csv')
print(combined_csv.shape)
combined_csv.head()

(8104, 14)


Unnamed: 0.1,Unnamed: 0,City,State,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,4,Anchorage,AK,287731,3581,32,540,621,2388,12261,1692,9038,1531,93
1,5,Bethel,AK,6544,130,1,47,3,79,132,20,84,28,12
2,6,Bristol Bay Borough,AK,852,2,0,0,0,2,20,5,8,7,0
3,7,Cordova,AK,2150,0,0,0,0,0,7,1,6,0,0
4,8,Craig,AK,1313,7,0,0,0,7,20,5,12,3,0


### Feature Engineering
Create a crime percentage column:
- total crime (violent crime + property crime) divided by population, multiplied by 100

In [7]:
# A population of zero cannot be used as the denominator of a rate,
# so list the rows that report one.
combined_csv.loc[combined_csv['Population'].eq(0.0)]

Unnamed: 0.1,Unnamed: 0,City,State,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
3920,7,Baltimore City,MD,0,0,0,0,0,0,0,0,0,0,0
6023,77,Sunriver,OR,0,1,0,0,0,1,54,9,43,2,0


In [8]:
# Drop the zero-population rows. Take an explicit copy so the column
# assignments in the following cells write to an independent frame rather than
# to a slice of the original (avoids pandas' SettingWithCopyWarning).
combined_csv = combined_csv[combined_csv.Population != 0].copy()

In [9]:
# Crime is broken down into violent crime and property crime; these two total
# columns are the ones summed for the overall crime figure.
# Arson is reported in conjunction with property or violent crime depending on
# the nature, so it is left out of this list.
columns = ['Violent crime', 'Property crime']

In [10]:
# Cast the two total columns to integers before they are summed.
combined_csv[columns] = combined_csv[columns].astype(int)

In [11]:
def sum_frame_by_column(frame, new_col_name, list_of_cols_to_sum, divideby):
    """Add a percentage column to ``frame`` and return that same frame.

    The columns named in ``list_of_cols_to_sum`` are cast to int and summed
    row-wise, divided by the int-cast ``divideby`` column, multiplied by 100
    and rounded to two decimals. The result is stored under ``new_col_name``.

    The frame is modified in place; the returned object is the input frame.
    """
    row_totals = frame[list_of_cols_to_sum].astype(int).sum(axis=1)
    denominator = frame[divideby].astype(int)
    percentage = (row_totals / denominator) * 100
    frame[new_col_name] = percentage.round(2)
    return frame

In [12]:
# Adds 'Crime Percentage' to combined_csv in place and displays the returned frame.
# NOTE(review): 'Violent crime' and 'Property crime' are overwritten from their
# sub-category columns in later cells, but this percentage is not recomputed
# afterwards — confirm the totals always agree, or move this call after them.
sum_frame_by_column(combined_csv, 'Crime Percentage', columns, 'Population' )

Unnamed: 0.1,Unnamed: 0,City,State,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson,Crime Percentage
0,4,Anchorage,AK,287731,3581,32,540,621,2388,12261,1692,9038,1531,93,5.51
1,5,Bethel,AK,6544,130,1,47,3,79,132,20,84,28,12,4.00
2,6,Bristol Bay Borough,AK,852,2,0,0,0,2,20,5,8,7,0,2.58
3,7,Cordova,AK,2150,0,0,0,0,0,7,1,6,0,0,0.33
4,8,Craig,AK,1313,7,0,0,0,7,20,5,12,3,0,2.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8099,31,Sheridan,WY,17895,9,0,4,0,5,369,75,278,16,3,2.11
8100,32,Thermopolis,WY,2830,13,0,0,0,13,34,7,22,5,0,1.66
8101,33,Torrington,WY,6709,13,0,4,1,8,48,8,40,0,0,0.91
8102,34,Wheatland,WY,3544,7,0,1,0,6,72,24,45,3,0,2.23


In [13]:
# Sub-category columns that are added together for the violent-crime total.
violent_crimes = [
    'Murder and nonnegligent manslaughter',
    'Rape',
    'Robbery',
    'Aggravated assault',
]

In [14]:
# Recompute the violent-crime total as the row-wise sum of its sub-categories.
combined_csv['Violent crime']= combined_csv[violent_crimes].sum(axis=1)

In [15]:
# Sub-category columns that are added together for the property-crime total.
property_crimes = [
    'Burglary',
    'Larceny- theft',
    'Motor vehicle theft',
]

In [16]:
# Recompute the property-crime total as the row-wise sum of its sub-categories.
combined_csv['Property crime']= combined_csv[property_crimes].sum(axis=1)

In [17]:
# Range of the crime percentage. Use the vectorized Series reductions instead
# of the Python built-ins, which iterate element by element and give
# order-dependent results if a NaN is present. float() keeps the plain-float
# tuple display.
float(combined_csv['Crime Percentage'].min()), float(combined_csv['Crime Percentage'].max())

(0.0, 2562.5)

In [18]:
# Just wanted to see the type of crime for such a high percentage.
# Select the row(s) at the column maximum instead of comparing against a
# hard-coded float, so the cell still finds the outlier if the input data changes.
combined_csv.loc[combined_csv['Crime Percentage'] == combined_csv['Crime Percentage'].max()]

Unnamed: 0.1,Unnamed: 0,City,State,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson,Crime Percentage
794,71,Lakeside,CO,8,0,0,0,0,0,205,1,202,2,0,2562.5


In [19]:
def crime_rating(row):
    """Map a row's 'Crime Percentage' to a level-of-concern label.

    Bands:
        0 to 25 (inclusive)      -> 'Low'
        above 25, up to 50       -> 'Medium'
        above 50, up to 75       -> 'High'
        anything else            -> 'Very High'

    The bands are contiguous. Percentages are rounded to two decimals, so
    fractional values such as 25.5 or 50.01 occur; they are rated 'Medium' /
    'High' instead of slipping through gaps between integer bounds into
    'Very High'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a numeric value under the key 'Crime Percentage'.

    Returns
    -------
    str
        One of 'Low', 'Medium', 'High', 'Very High'.
    """
    pct = row['Crime Percentage']
    if 0 <= pct <= 25:
        return 'Low'
    if 25 < pct <= 50:
        return 'Medium'
    if 50 < pct <= 75:
        return 'High'
    # Above 75, plus any value outside the bands (negative or NaN), which
    # keeps the original fall-through label.
    return 'Very High'

In [20]:
# Label every row with its level of concern by applying crime_rating row by row.
combined_csv['Crime Rating'] = combined_csv.apply(crime_rating, axis=1)

In [21]:
# Save the frame, now including 'Crime Percentage' and 'Crime Rating', without
# writing the row index to the file.
combined_csv.to_csv("/content/drive/MyDrive/csv/final/fbi_crime_uscities.csv", index=False)

In [22]:
# Read the exported file back in and show its first rows.
fbi_crime_uscities = pd.read_csv('/content/drive/MyDrive/csv/final/fbi_crime_uscities.csv')
fbi_crime_uscities.head()

Unnamed: 0.1,Unnamed: 0,City,State,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson,Crime Percentage,Crime Rating
0,4,Anchorage,AK,287731,3581,32,540,621,2388,12261,1692,9038,1531,93,5.51,Low
1,5,Bethel,AK,6544,130,1,47,3,79,132,20,84,28,12,4.0,Low
2,6,Bristol Bay Borough,AK,852,2,0,0,0,2,20,5,8,7,0,2.58,Low
3,7,Cordova,AK,2150,0,0,0,0,0,7,1,6,0,0,0.33,Low
4,8,Craig,AK,1313,7,0,0,0,7,20,5,12,3,0,2.06,Low


In [23]:
# Rows where the reported crime count is at least as large as the population.
fbi_crime_uscities.loc[fbi_crime_uscities['Crime Percentage'].ge(100)]

Unnamed: 0.1,Unnamed: 0,City,State,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson,Crime Percentage,Crime Rating
456,190,Industry,CA,201,72,0,1,41,30,1180,156,899,125,7,622.89,Very High
697,431,Vernon,CA,112,27,0,1,12,14,485,133,272,80,5,457.14,Very High
734,11,Black Hawk,CO,128,8,0,0,0,8,211,4,200,7,0,171.09,Very High
794,71,Lakeside,CO,8,0,0,0,0,0,205,1,202,2,0,2562.5,Very High
4645,431,Teterboro,NJ,68,0,0,0,0,0,102,0,99,3,0,150.0,Very High


In [24]:
# Number of rows with a crime percentage of 100 or more.
int(fbi_crime_uscities['Crime Percentage'].ge(100).sum())

5