1. Import

In [1]:
import pandas as pd
# Load the raw crime records from the CSV export.
data = pd.read_csv('Crime_Data_from_2020_to_Present.csv')
print(len(data))

# Data-quality overview: count of fully duplicated rows and per-column missing values.
duplicate_rows = data.duplicated().sum()
missing_values = data.isnull().sum()
print("重复行数：", duplicate_rows)

# Print one line per column: name, dtype, number of missing values.
# The header uses narrower pads (18/6) than the rows (20/10) — presumably to
# compensate for the wider CJK glyphs in the header text.
print("{:<18} {:<6} {}".format("字段名称", "字段类型", "缺失值数量"))
row_template = "{:<20} {:<10} {}"
for col_name, col_dtype, col_missing in zip(data.columns, data.dtypes, missing_values):
    print(row_template.format(col_name, str(col_dtype), str(col_missing)))

# Last expression of the cell: rich preview of the first rows.
data.head()

852950
重复行数： 0
字段名称               字段类型   缺失值数量
division_number      int64      0
date_reported        object     0
date_occurred        object     0
area                 int64      0
area_name            object     0
reporting_district   int64      0
part                 int64      0
crime_code           int64      0
crime_description    object     0
modus_operandi       object     118311
victim_age           int64      0
victim_sex           object     112606
victim_descent       object     112614
premise_code         float64    10
premise_description  object     518
weapon_code          float64    556202
weapon_description   object     556202
status               object     0
status_description   object     0
crime_code_1         float64    11
crime_code_2         float64    790429
crime_code_3         float64    850837
crime_code_4         float64    852888
location             object     0
cross_street         object     717289
latitude             float64    0
longitude           

Unnamed: 0,division_number,date_reported,date_occurred,area,area_name,reporting_district,part,crime_code,crime_description,modus_operandi,...,status,status_description,crime_code_1,crime_code_2,crime_code_3,crime_code_4,location,cross_street,latitude,longitude
0,10304468,2020-01-08,2020-01-08 22:30:00,3,Southwest,377,2,624,BATTERY - SIMPLE ASSAULT,0444 0913,...,AO,Adult Other,624.0,,,,1100 W 39TH PL,,34.0141,-118.2978
1,190101086,2020-01-02,2020-01-01 03:30:00,1,Central,163,2,624,BATTERY - SIMPLE ASSAULT,0416 1822 1414,...,IC,Invest Cont,624.0,,,,700 S HILL ST,,34.0459,-118.2545
2,200110444,2020-04-14,2020-02-13 12:00:00,1,Central,155,2,845,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,1501,...,AA,Adult Arrest,845.0,,,,200 E 6TH ST,,34.0448,-118.2474
3,191501505,2020-01-01,2020-01-01 17:30:00,15,N Hollywood,1543,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),0329 1402,...,IC,Invest Cont,745.0,998.0,,,5400 CORTEEN PL,,34.1685,-118.4019
4,191921269,2020-01-01,2020-01-01 04:15:00,19,Mission,1998,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0329,...,IC,Invest Cont,740.0,,,,14400 TITUS ST,,34.2198,-118.4468


2. Clean Data

In [2]:
# Extract the columns needed downstream and fill their missing values.
# Returns three DataFrames: the working set (feature columns, label columns and
# the human-readable description columns), the features, and the labels.
def get_usefulData_feature_label(data):
    """Split the raw crime table into working-set, feature and label frames.

    Missing values are replaced with the string ``'Unknown'`` in object/string
    columns and with ``-1`` in integer/float columns. Columns of any other
    dtype (e.g. datetime) are left untouched.

    Side effects (kept for backward compatibility with existing callers):
    ``data`` itself is modified — ``date_occurred`` is converted to datetime
    and the derived columns ``month_day`` and ``specific_time`` are added.

    Args:
        data: DataFrame holding at least the columns selected below
            (``date_occurred``, ``area``, ``victim_age``, ...).

    Returns:
        A tuple ``(useful_data, feature, label)`` of independent DataFrames.

    Raises:
        ValueError: if any of the three frames still contains a null value
            after filling (e.g. a missing ``date_occurred``).
    """
    useful_columns = [
        'date_occurred', 'month_day', 'specific_time', 'area', 'area_name',
        'victim_age', 'victim_sex', 'victim_descent', 'latitude', 'longitude',
        'crime_code', 'crime_description', 'premise_code', 'premise_description',
        'weapon_code', 'weapon_description',
    ]
    feature_columns = [
        'month_day', 'area', 'victim_age', 'victim_sex', 'victim_descent',
        'latitude', 'longitude',
    ]
    label_columns = ['specific_time', 'crime_code', 'premise_code', 'weapon_code']

    def fill_the_blank(frame):
        """Return a copy of ``frame`` with text nulls -> 'Unknown' and numeric nulls -> -1."""
        filled = frame.copy()
        for column in filled.columns:
            series = filled[column]
            # Assign the result back instead of calling fillna(inplace=True) on
            # the selected column: the in-place form acts on a temporary object
            # and is a no-op under pandas Copy-on-Write.
            if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
                filled[column] = series.fillna('Unknown')
            elif pd.api.types.is_integer_dtype(series) or pd.api.types.is_float_dtype(series):
                filled[column] = series.fillna(-1)
        return filled

    def check(frame):
        """Return True when ``frame`` contains no null values at all."""
        return frame.isnull().sum().sum() == 0

    # Convert the occurrence timestamp and derive its month-day and time-of-day parts.
    data['date_occurred'] = pd.to_datetime(data['date_occurred'])
    data['month_day'] = data['date_occurred'].dt.strftime('%m-%d')  # month-day
    data['specific_time'] = data['date_occurred'].dt.strftime('%H:%M:%S')  # hour:minute:second

    # 1. Select the relevant columns and 2. fill their missing values.
    useful_data = fill_the_blank(data[useful_columns])
    feature = fill_the_blank(data[feature_columns])
    label = fill_the_blank(data[label_columns])

    # 3. Only hand the frames back when no null value is left.
    if check(useful_data) and check(feature) and check(label):
        print("空值已处理")
        return useful_data, feature, label

    raise ValueError("Some values are not valid.")


# Build the working set, the feature frame and the label frame from the raw crime table.
(useful_data, feature, label) = get_usefulData_feature_label(data)

# Preview the first five feature rows.
feature.head(n=5)

空值已处理


Unnamed: 0,month_day,area,victim_age,victim_sex,victim_descent,latitude,longitude
0,01-08,3,36,F,B,34.0141,-118.2978
1,01-01,1,25,M,H,34.0459,-118.2545
2,02-13,1,0,X,X,34.0448,-118.2474
3,01-01,15,76,F,W,34.1685,-118.4019
4,01-01,19,31,X,X,34.2198,-118.4468


In [3]:
# Preview the first five label rows.
label.head(n=5)

Unnamed: 0,specific_time,crime_code,premise_code,weapon_code
0,22:30:00,624,501.0,400.0
1,03:30:00,624,102.0,500.0
2,12:00:00,845,726.0,-1.0
3,17:30:00,745,502.0,-1.0
4,04:15:00,740,409.0,-1.0
