In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_rows', 1000)

In [2]:
df = pd.read_csv('Crime_Data_from_2020_to_Present.csv')

In [3]:
df.head(10)

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,190326475,03/01/2020 12:00:00 AM,03/01/2020 12:00:00 AM,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,AA,Adult Arrest,510.0,998.0,,,1900 S LONGWOOD AV,,34.0375,-118.3506
1,200106753,02/09/2020 12:00:00 AM,02/08/2020 12:00:00 AM,1800,1,Central,182,1,330,BURGLARY FROM VEHICLE,...,IC,Invest Cont,330.0,998.0,,,1000 S FLOWER ST,,34.0444,-118.2628
2,200320258,11/11/2020 12:00:00 AM,11/04/2020 12:00:00 AM,1700,3,Southwest,356,1,480,BIKE - STOLEN,...,IC,Invest Cont,480.0,,,,1400 W 37TH ST,,34.021,-118.3002
3,200907217,05/10/2023 12:00:00 AM,03/10/2020 12:00:00 AM,2037,9,Van Nuys,964,1,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),...,IC,Invest Cont,343.0,,,,14000 RIVERSIDE DR,,34.1576,-118.4387
4,220614831,08/18/2022 12:00:00 AM,08/17/2020 12:00:00 AM,1200,6,Hollywood,666,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,1900 TRANSIENT,,34.0944,-118.3277
5,231808869,04/04/2023 12:00:00 AM,12/01/2020 12:00:00 AM,2300,18,Southeast,1826,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,9900 COMPTON AV,,33.9467,-118.2463
6,230110144,04/04/2023 12:00:00 AM,07/03/2020 12:00:00 AM,900,1,Central,182,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,1100 S GRAND AV,,34.0415,-118.262
7,220314085,07/22/2022 12:00:00 AM,05/12/2020 12:00:00 AM,1110,3,Southwest,303,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,2500 S SYCAMORE AV,,34.0335,-118.3537
8,231309864,04/28/2023 12:00:00 AM,12/09/2020 12:00:00 AM,1400,13,Newton,1375,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,1300 E 57TH ST,,33.9911,-118.2521
9,211904005,12/31/2020 12:00:00 AM,12/31/2020 12:00:00 AM,1220,19,Mission,1974,2,624,BATTERY - SIMPLE ASSAULT,...,IC,Invest Cont,624.0,,,,9000 CEDROS AV,,34.2336,-118.4535


In [4]:
df.columns

Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME',
       'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes',
       'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc',
       'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1',
       'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT',
       'LON'],
      dtype='object')

In [5]:
df.duplicated().sum()

0

In [6]:
assault_crimes = ['BATTERY - SIMPLE ASSAULT',
'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
'INTIMATE PARTNER - SIMPLE ASSAULT',
'INTIMATE PARTNER - AGGRAVATED ASSAULT',
'CHILD ABUSE (PHYSICAL) - SIMPLE ASSAULT',
'OTHER ASSAULT',
'CHILD ABUSE (PHYSICAL) - AGGRAVATED ASSAULT',
'ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER']

In [7]:
df_assault = df[df['Crm Cd Desc'].isin(assault_crimes)==True].copy() # We consider only Assault crimes

In [8]:
df_assault = df_assault.reset_index()

In [9]:
columns_to_drop = df[['AREA','Part 1-2','Status','DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'Crm Cd 1','Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4','Mocodes','Crm Cd','Weapon Used Cd','Weapon Desc','Cross Street','LOCATION', 'LAT','LON','Rpt Dist No']]
df_assault.drop(columns=columns_to_drop, inplace = True)

In [10]:
df_assault.drop(columns=['index'])

Unnamed: 0,AREA NAME,Crm Cd Desc,Vict Age,Vict Sex,Vict Descent,Premis Cd,Premis Desc,Status Desc
0,Mission,BATTERY - SIMPLE ASSAULT,26,M,H,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",Invest Cont
1,Wilshire,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",31,F,O,101.0,STREET,Adult Arrest
2,West LA,BATTERY - SIMPLE ASSAULT,24,F,O,501.0,SINGLE FAMILY DWELLING,Invest Cont
3,West LA,INTIMATE PARTNER - SIMPLE ASSAULT,42,F,H,501.0,SINGLE FAMILY DWELLING,Invest Cont
4,Southwest,INTIMATE PARTNER - SIMPLE ASSAULT,25,F,B,501.0,SINGLE FAMILY DWELLING,Invest Cont
...,...,...,...,...,...,...,...,...
196497,Central,BATTERY - SIMPLE ASSAULT,60,M,B,903.0,MTA - RED LINE - 7TH AND METRO CENTER,Invest Cont
196498,Harbor,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",32,M,W,101.0,STREET,Invest Cont
196499,Foothill,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",36,M,H,101.0,STREET,Invest Cont
196500,Olympic,BATTERY - SIMPLE ASSAULT,80,F,O,124.0,BUS STOP,Invest Cont


### We use One-Hot Encoding to encode the categorical data representing the 21 LA Patrol Divisions.

In [12]:
df_assault['AREA NAME'].unique()

array(['Mission', 'Wilshire', 'West LA', 'Southwest', 'N Hollywood',
       'Central', 'West Valley', 'Hollywood', 'Olympic', '77th Street',
       'Devonshire', 'Harbor', 'Pacific', 'Northeast', 'Topanga',
       'Van Nuys', 'Newton', 'Hollenbeck', 'Foothill', 'Rampart',
       'Southeast'], dtype=object)

In [13]:
df_assault.isna().sum()

index            0
AREA NAME        0
Crm Cd Desc      0
Vict Age         0
Vict Sex        24
Vict Descent    26
Premis Cd        1
Premis Desc     49
Status Desc      0
dtype: int64

### Since there are no missing values in the AREA NAME column, we can perform One Hot Encoding directly without having to impute any missing values.

In [15]:
enc = OneHotEncoder()
enc.fit(df_assault[['AREA NAME']])
one_hot = enc.transform(df_assault[['AREA NAME']]).toarray()
df_assault[['Mission', 'Wilshire', 'West LA', 'Southwest', 'N Hollywood',
       'Central', 'West Valley', 'Hollywood', 'Olympic', '77th Street',
       'Devonshire', 'Harbor', 'Pacific', 'Northeast', 'Topanga',
       'Van Nuys', 'Newton', 'Hollenbeck', 'Foothill', 'Rampart',
       'Southeast']]=one_hot

In [16]:
df_assault = df_assault.dropna(subset=['Premis Cd']) # Since there is only 1 NaN value in the Premis Cd column, we choose to drop that row

In [17]:
df_assault['Vict Sex'].unique() # Check what values are in the victim sex column

array(['M', 'F', 'X', nan, 'H'], dtype=object)

### For this data set, there should only be two options for the victim's sex: M or F. So, we replace all other labels with NaN.

In [19]:
df_assault.loc[:,'Vict Sex']=df_assault['Vict Sex'].replace({'X':np.nan,'H':np.nan,'-':np.nan}) # Change all unkown values ('X' entries)
df_assault['Vict Sex'].value_counts()

Vict Sex
F    98775
M    95715
Name: count, dtype: int64

In [20]:
ip_crimes = ['INTIMATE PARTNER - SIMPLE ASSAULT',
'INTIMATE PARTNER - AGGRAVATED ASSAULT',]

In [21]:
df[df['Crm Cd Desc'].isin(ip_crimes)==True]['Vict Age'].unique()

array([42, 25, 29, 59, 43, 26, 10, 33, 22, 28, 20, 18, 21, 50, 35, 16, 34,
       23, 38, 40, 45, 30, 31, 27, 24, 61, 37, 54, 32, 48, 57, 39, 49, 52,
       46, 55, 19, 63, 56, 47, 81, 62, 41, 36,  0, 44, 51, 73, 60, 17, 53,
       76, 78, 66, 65, 58, 77, 64, 74, 71, 79, 68, 80, 70, 69, 12, 67, 15,
       13, 83, 75, 98, 99, 14,  4, 82,  9, 96, 85, 72, 87, 84, 88,  7,  8,
       86, 89,  3, 11, 93,  2,  5,  6, 97, 94, -1], dtype=int64)

### It's reasonable to think that some of the ages of the victims of intimate partner violence are not accurately reported. This is because several ages are listed as 0, 2, 3, 4, etc. Therefore, we replace these values with NaN and impute them later.

In [23]:
ccrime = ['CHILD ABUSE (PHYSICAL) - AGGRAVATED ASSAULT','CHILD ABUSE (PHYSICAL) - SIMPLE ASSAULT']

In [24]:
# We replace all victim ages less than 18 for crimes other than child abuse with NaN.
condition = (df_assault['Vict Age'] < 19) & (~df_assault['Crm Cd Desc'].isin(ccrime))
df_assault.loc[condition, 'Vict Age'] = np.nan

### We now check that the ages for victims of child abuse make sense.

In [26]:
df[df['Crm Cd Desc'].isin(ccrime)==True]['Vict Age'].value_counts()

Vict Age
15    419
16    377
14    374
13    344
12    321
17    284
11    255
7     218
8     216
9     210
10    203
0     196
5     178
6     175
4     143
3     107
2      82
18      8
45      3
30      3
25      3
31      3
29      3
19      3
99      2
20      2
51      2
21      2
23      2
60      2
32      2
39      2
41      2
22      1
54      1
38      1
27      1
53      1
37      1
55      1
36      1
40      1
94      1
89      1
93      1
34      1
63      1
26      1
Name: count, dtype: int64

### Again, we observe that not all ages makes sense; an 18 year old is no longer a minor and therefore, would not be a victim of child abuse. Hence, we replace all child abuse victim ages greater than 17 with NaN.

In [28]:
condition = (df_assault['Vict Age'] > 17) & (df_assault['Crm Cd Desc'].isin(ccrime))
df_assault.loc[condition, 'Vict Age'] = np.nan

In [29]:
df_assault['Vict Descent'].value_counts()

Vict Descent
H    99106
B    46557
W    31402
O    12106
A     3910
X     2935
K      261
F       86
I       35
G       16
C       14
J       14
U        9
V        9
Z        6
P        5
S        3
D        1
Name: count, dtype: int64

In [30]:
df_assault.loc[df_assault['Vict Descent']=='X','Vict Descent'] = np.nan # 'X' values are unkowns; we impute these values.

In [31]:
df_assault['Vict Descent'].value_counts() # Check to see that there are no X values.

Vict Descent
H    99106
B    46557
W    31402
O    12106
A     3910
K      261
F       86
I       35
G       16
C       14
J       14
U        9
V        9
Z        6
P        5
S        3
D        1
Name: count, dtype: int64

In [32]:
df_assault.isna().sum()

index               0
AREA NAME           0
Crm Cd Desc         0
Vict Age        13887
Vict Sex         2011
Vict Descent     2961
Premis Cd           0
Premis Desc        48
Status Desc         0
Mission             0
Wilshire            0
West LA             0
Southwest           0
N Hollywood         0
Central             0
West Valley         0
Hollywood           0
Olympic             0
77th Street         0
Devonshire          0
Harbor              0
Pacific             0
Northeast           0
Topanga             0
Van Nuys            0
Newton              0
Hollenbeck          0
Foothill            0
Rampart             0
Southeast           0
dtype: int64

In [33]:
df_assault = df_assault.dropna(subset=['Premis Desc'])

In [34]:
private_property_labels = {'SINGLE FAMILY DWELLING', 'MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)', 'OTHER BUSINESS', 'RESTAURANT/FAST FOOD','DRIVEWAY', 'HOTEL', 'VEHICLE, PASSENGER/TRUCK', 'YARD (RESIDENTIAL/BUSINESS)', 'OTHER RESIDENCE', 'MOTEL','MINI-MART', 'NURSING/CONVALESCENT/RETIREMENT HOME', 'GARAGE/CARPORT', 'TRANSIENT ENCAMPMENT', 'LAUNDROMAT','BAR/COCKTAIL/NIGHTCLUB', 'LIQUOR STORE', 'TRANSITIONAL HOUSING/HALFWAY HOUSE', 'GROUP HOME', 'NIGHT CLUB (OPEN EVENINGS ONLY)','MOBILE HOME/TRAILERS/CONSTRUCTION TRAILERS/RV\'S/MOTORHOME', 'PARKING UNDERGROUND/BUILDING', 'DRUG STORE','DEPARTMENT STORE', 'COFFEE SHOP (STARBUCKS, COFFEE BEAN, PEET\'S, ETC.)', 'CONDOMINIUM/TOWNHOUSE','OFFICE BUILDING/OFFICE', 'MEDICAL/DENTAL OFFICES', 'AUTO REPAIR SHOP', 'HEALTH SPA/GYM','DISCOUNT STORE (99 CENT,DOLLAR,ETC.)', 'SINGLE RESIDENCE OCCUPANCY (SRO\'S) LOCATIONS', 'PORCH, RESIDENTIAL','BAR/SPORTS BAR (OPEN DAY & NIGHT)', 'PROJECT/TENEMENT/PUBLIC HOUSING', 'CLOTHING STORE', 'SPECIALTY SCHOOL/OTHER','WAREHOUSE', 'THEATRE/MOVIE', 'PATIO*', 'CAR WASH', 'APARTMENT/CONDO COMMON LAUNDRY ROOM', 'CELL PHONE STORE','DRIVE THRU*', 'BEAUTY/BARBER SHOP', 'BANK', 'ELEVATOR', 'DIY CENTER (LOWE\'S, HOME DEPOT, OSH, CONTRACTORS WAREHOUSE)','SHOPPING MALL (COMMON AREA)', 'PRIVATE SCHOOL/PRESCHOOL', 'AUTO SUPPLY STORE*', 'SHORT-TERM VACATION RENTAL','SEX ORIENTED/BOOK STORE/STRIP CLUB/GENTLEMAN\'S CLUB', 'THE GROVE', 'JEWELRY STORE', 'MEMBERSHIP STORE (COSTCO, SAMS CLUB)*','FRAT HOUSE/SORORITY/DORMITORY', 'CONVENTION CENTER', 'BOWLING ALLEY*', 'FURNITURE STORE', 'NURSERY/FLOWER SHOP','FOSTER HOME BOYS OR GIRLS*', 'STORAGE SHED', 'BEAUTY SUPPLY STORE', 'AUTOMATED TELLER MACHINE (ATM)', 'MASSAGE PARLOR','TATTOO PARLOR*', 'CEMETARY*', 'BALCONY*', 'CHECK CASHING*', 'SYNAGOGUE/TEMPLE', 'MOSQUE*', 'PAWN SHOP', 'MUSEUM','GUN/SPORTING GOODS', 'TV/RADIO/APPLIANCE', 'TAXI', 'MORTUARY', 'VALET', 'PET STORE', 'DAY CARE/CHILDREN*','PAWN SHOP', 'GUN/SPORTING GOODS', 'MANUFACTURING COMPANY', 'ABORTION CLINIC/ABORTION FACILITY*', 'HIGH-RISE BUILDING','DELVERY SERVICE (FED EX, UPS, COURIERS, COURIER SERVICE)*', 'BOOK STORE', 'ABORTION CLINIC/ABORTION FACILITY*','CATERING/ICE CREAM TRUCK', 'GARMENT MANUFACTURER', 'METHADONE CLINIC', 'GARMENT MANUFACTURER', 'TRUCK, COMMERCIAL','ENERGY PLANT/FACILITY', 'RECORD-CD MUSIC/COMPUTER GAME STORE', 'MAIL BOX', 'OIL REFINERY', 'OPTICAL OFFICE INSIDE STORE OR SUPERMARKET*','CLEANER/LAUNDROMAT', 'GOLF COURSE*', 'AMUSEMENT PARK*', 'STUDIO (FILM/PHOTOGRAPHIC/MUSIC)', 'FOSTER HOME BOYS OR GIRLS*','HOUSEBOAT', 'VETERINARIAN/ANIMAL HOSPITAL', 'SWAP MEET'
}

In [35]:
df_assault['Premis Desc'] = df_assault['Premis Desc'].apply(lambda x: 'public_property' if x in private_property_labels else 'private_property')
df_assault.drop(columns=['index','AREA NAME','Crm Cd Desc','Premis Cd','Status Desc'],inplace=True)

In [36]:
labels = list(df_assault)
labels[1],labels[-1]=labels[-1],labels[1]
df_assault = df_assault[labels]
df_assault

Unnamed: 0,Vict Age,Southeast,Vict Descent,Premis Desc,Mission,Wilshire,West LA,Southwest,N Hollywood,Central,...,Harbor,Pacific,Northeast,Topanga,Van Nuys,Newton,Hollenbeck,Foothill,Rampart,Vict Sex
0,26.0,0.0,H,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M
1,31.0,1.0,O,private_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,F
2,24.0,0.0,O,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,F
3,42.0,0.0,H,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,F
4,25.0,0.0,B,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196497,60.0,0.0,B,private_property,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M
196498,32.0,0.0,W,private_property,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M
196499,36.0,0.0,H,private_property,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M
196500,80.0,0.0,O,private_property,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,F


In [37]:
label_encoder = LabelEncoder()
df_assault['Premis Desc_Encoded'] = label_encoder.fit_transform(df_assault['Premis Desc'])
df_assault

Unnamed: 0,Vict Age,Southeast,Vict Descent,Premis Desc,Mission,Wilshire,West LA,Southwest,N Hollywood,Central,...,Pacific,Northeast,Topanga,Van Nuys,Newton,Hollenbeck,Foothill,Rampart,Vict Sex,Premis Desc_Encoded
0,26.0,0.0,H,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,1
1,31.0,1.0,O,private_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,F,0
2,24.0,0.0,O,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,F,1
3,42.0,0.0,H,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,F,1
4,25.0,0.0,B,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,F,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196497,60.0,0.0,B,private_property,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,0
196498,32.0,0.0,W,private_property,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,0
196499,36.0,0.0,H,private_property,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,0
196500,80.0,0.0,O,private_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,F,0


In [38]:
df_assault['Vict Descent'].unique()

array(['H', 'O', 'B', 'A', 'W', 'K', 'F', nan, 'I', 'V', 'G', 'Z', 'D',
       'U', 'J', 'C', 'S', 'P'], dtype=object)

In [39]:
df_assault['Vict Descent'].value_counts()

Vict Descent
H    99092
B    46538
W    31397
O    12099
A     3909
K      261
F       86
I       35
G       16
C       14
J       14
U        9
V        9
Z        6
P        5
S        3
D        1
Name: count, dtype: int64

In [40]:
df_assault.isna().sum()

Vict Age               13885
Southeast                  0
Vict Descent            2959
Premis Desc                0
Mission                    0
Wilshire                   0
West LA                    0
Southwest                  0
N Hollywood                0
Central                    0
West Valley                0
Hollywood                  0
Olympic                    0
77th Street                0
Devonshire                 0
Harbor                     0
Pacific                    0
Northeast                  0
Topanga                    0
Van Nuys                   0
Newton                     0
Hollenbeck                 0
Foothill                   0
Rampart                    0
Vict Sex                2009
Premis Desc_Encoded        0
dtype: int64

In [41]:
df_assault.tail(10)

Unnamed: 0,Vict Age,Southeast,Vict Descent,Premis Desc,Mission,Wilshire,West LA,Southwest,N Hollywood,Central,...,Pacific,Northeast,Topanga,Van Nuys,Newton,Hollenbeck,Foothill,Rampart,Vict Sex,Premis Desc_Encoded
196492,,1.0,H,private_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,0
196493,5.0,0.0,H,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,F,1
196494,,0.0,H,private_property,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,0
196495,,0.0,H,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,M,1
196496,70.0,0.0,W,public_property,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,1
196497,60.0,0.0,B,private_property,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,0
196498,32.0,0.0,W,private_property,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,0
196499,36.0,0.0,H,private_property,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,0
196500,80.0,0.0,O,private_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,F,0
196501,70.0,0.0,W,private_property,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,F,0


In [42]:
# Prepare the columns with missing values for imputation
columns_to_encode = ['Vict Descent', 'Vict Sex','Vict Age']

for col in columns_to_encode:
    le = LabelEncoder()
    nan_mask = df_assault[col].isna()
    df_assault.loc[~nan_mask, col] = le.fit_transform(df_assault.loc[~nan_mask, col])
    
df_assault.isna().sum() # Check which columns need imputed

Vict Age               13885
Southeast                  0
Vict Descent            2959
Premis Desc                0
Mission                    0
Wilshire                   0
West LA                    0
Southwest                  0
N Hollywood                0
Central                    0
West Valley                0
Hollywood                  0
Olympic                    0
77th Street                0
Devonshire                 0
Harbor                     0
Pacific                    0
Northeast                  0
Topanga                    0
Van Nuys                   0
Newton                     0
Hollenbeck                 0
Foothill                   0
Rampart                    0
Vict Sex                2009
Premis Desc_Encoded        0
dtype: int64

In [43]:
df_assault.drop('Premis Desc',axis=1,inplace=True)

In [44]:
imputer = KNNImputer(n_neighbors=10)
imputed_data = imputer.fit_transform(df_assault)
df_temp = pd.DataFrame(imputed_data, columns=df_assault.columns)
df_temp = df_temp.round()
df_assault['Vict Sex'] = df_temp['Vict Sex']
df_assault['Vict Descent'] = df_temp['Vict Descent']
df_assault['Vict Age'] = df_temp['Vict Age']

In [45]:
df_assault['Vict Sex'].value_counts()

Vict Sex
0.0    99924
1.0    96480
Name: count, dtype: int64

In [46]:
df_assault.isna().sum()

Vict Age               49
Southeast               0
Vict Descent           49
Mission                 0
Wilshire                0
West LA                 0
Southwest               0
N Hollywood             0
Central                 0
West Valley             0
Hollywood               0
Olympic                 0
77th Street             0
Devonshire              0
Harbor                  0
Pacific                 0
Northeast               0
Topanga                 0
Van Nuys                0
Newton                  0
Hollenbeck              0
Foothill                0
Rampart                 0
Vict Sex               49
Premis Desc_Encoded     0
dtype: int64

In [47]:
df_assault['Vict Sex'].isna().sum()

49

In [48]:
df_assault['Vict Sex'].unique()

array([ 1.,  0., nan])

In [49]:
df_assault['Vict Descent'].unique()

array([ 6., 10.,  1.,  0., 15.,  9.,  4.,  5.,  8.,  7.,  2.,  3., 14.,
       11., 16., 12., 13., nan])

In [51]:
df_assault['Vict Age'].unique()

array([24., 29., 22., 40., 23., 27., 57., 41., 38., 31., 54., 28.,  4.,
       20., 50., 11., 37., 79., 26., 46., 47., 18., 19., 48., 33., 34.,
        9., 21., 32., 36., 30., 43.,  2., 13., 12.,  3., 25., 44.,  5.,
       52., 45., 60., 51., 59.,  0., 35., 42., 88.,  6., 55., 14., 15.,
       76., 39., 17., 56., 10., 66., 58., 64., 53., 68., 83., 61., 63.,
       77., 49., 67., 94.,  8., 75., 62., 65., 74., 86., 73., 70., 71.,
       72., 69., 80.,  7., 16., 92.,  1., 85., 81., 87., 82., 78., 89.,
       91., 84., 97., 96., 90., 93., 95., 98., nan])

In [53]:
# Drop all remaining NaN values

df_assault = df_assault.dropna()

### We want to apply one-hot encoding to the Vict Sex column.

In [55]:
df_assault = df_assault.copy()
df_assault.loc[:, 'Vict Sex'] = df_assault['Vict Sex'].replace('Vict Sex', 'Female')

In [56]:
df_assault['Male']=df_assault['Vict Sex']

In [57]:
df_assault

Unnamed: 0,Vict Age,Southeast,Vict Descent,Mission,Wilshire,West LA,Southwest,N Hollywood,Central,West Valley,...,Northeast,Topanga,Van Nuys,Newton,Hollenbeck,Foothill,Rampart,Vict Sex,Premis Desc_Encoded,Male
0,24.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0
1,29.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2,22.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0
3,40.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0
4,23.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196448,58.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0
196449,30.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0
196450,34.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0
196451,78.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0


In [58]:
# Switch order of columns so that the Male and Vict Sex columns are side-by-side
labels = list(df_assault)
labels[-2],labels[-1]=labels[-1],labels[-2]
df_assault = df_assault[labels]
df_assault

Unnamed: 0,Vict Age,Southeast,Vict Descent,Mission,Wilshire,West LA,Southwest,N Hollywood,Central,West Valley,...,Northeast,Topanga,Van Nuys,Newton,Hollenbeck,Foothill,Rampart,Vict Sex,Male,Premis Desc_Encoded
0,24.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
1,29.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,22.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
3,40.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
4,23.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196448,58.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
196449,30.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
196450,34.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
196451,78.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [59]:
df_assault['Vict Sex'] = df_assault['Male'].apply(lambda x: 1.0 if x==0.0 else 0.0)

In [60]:
df_assault.rename(columns= {'Vict Sex': 'Female'}, inplace=True)

In [61]:
df_assault

Unnamed: 0,Vict Age,Southeast,Vict Descent,Mission,Wilshire,West LA,Southwest,N Hollywood,Central,West Valley,...,Northeast,Topanga,Van Nuys,Newton,Hollenbeck,Foothill,Rampart,Female,Male,Premis Desc_Encoded
0,24.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,29.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,22.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
3,40.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
4,23.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196448,58.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
196449,30.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
196450,34.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
196451,78.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
