1. Import

In [32]:
import pandas as pd
# Load the raw crime records from the CSV export.
data = pd.read_csv('Crime_Data_from_2020_to_Present.csv')
print(len(data))

# Quick data-quality profile: number of fully duplicated rows and the
# number of missing values in each column.
duplicate_rows = data.duplicated().sum()
missing_values = data.isnull().sum()
print("重复行数：", duplicate_rows)

# Header line, then one line per column: name, dtype, missing-value count.
# The header deliberately uses narrower pads than the rows.
row_template = "{:<20} {:<10} {}"
print("{:<18} {:<6} {}".format("字段名称", "字段类型", "缺失值数量"))
for column_name, column_dtype, n_missing in zip(data.columns, data.dtypes, missing_values):
    print(row_template.format(column_name, str(column_dtype), str(n_missing)))
# data.head()

852950
重复行数： 0
字段名称               字段类型   缺失值数量
division_number      int64      0
date_reported        object     0
date_occurred        object     0
area                 int64      0
area_name            object     0
reporting_district   int64      0
part                 int64      0
crime_code           int64      0
crime_description    object     0
modus_operandi       object     118311
victim_age           int64      0
victim_sex           object     112606
victim_descent       object     112614
premise_code         float64    10
premise_description  object     518
weapon_code          float64    556202
weapon_description   object     556202
status               object     0
status_description   object     0
crime_code_1         float64    11
crime_code_2         float64    790429
crime_code_3         float64    850837
crime_code_4         float64    852888
location             object     0
cross_street         object     717289
latitude             float64    0
longitude           

2. Clean Data

In [34]:
def get_usefulData_feature_label(data):
    """Extract the working subsets of the raw crime DataFrame.

    Three independent copies are taken from ``data`` and their missing
    values are filled: text columns get the string ``'Unknown'`` and
    integer/float columns get ``-1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw crime data. Must contain every column listed in
        ``useful_columns`` below. Its ``date_occurred`` column is converted
        to datetime **in place** on the caller's frame (existing behaviour,
        kept because code outside this function may rely on it).

    Returns
    -------
    tuple of (useful_data, feature, label)
        ``useful_data``: feature and label columns plus the human-readable
        name/description columns; ``feature``: the feature columns only;
        ``label``: the ``crime_code``, ``premise_code`` and ``weapon_code``
        columns.

    Raises
    ------
    ValueError
        If any of the three frames still contains a null after filling
        (for example a ``NaT`` in ``date_occurred``, which is not filled).
    """
    useful_columns = [
        'date_occurred', 'area', 'area_name', 'victim_age', 'victim_sex',
        'victim_descent', 'latitude', 'longitude', 'crime_code',
        'crime_description', 'premise_code', 'premise_description',
        'weapon_code', 'weapon_description',
    ]
    feature_columns = [
        'date_occurred', 'area', 'victim_age', 'victim_sex',
        'victim_descent', 'latitude', 'longitude',
    ]
    label_columns = ['crime_code', 'premise_code', 'weapon_code']

    def fill_the_blank(frame):
        """Fill missing values of ``frame`` in place, column by column."""
        types = pd.api.types
        for column in frame.columns:
            dtype = frame[column].dtype
            # Assign the filled column back instead of calling
            # ``frame[column].fillna(..., inplace=True)``: an inplace call on
            # a column selection is deprecated and has no effect on the frame
            # when pandas Copy-on-Write is active.
            if types.is_object_dtype(dtype) or types.is_string_dtype(dtype):
                frame[column] = frame[column].fillna('Unknown')
            elif types.is_integer_dtype(dtype) or types.is_float_dtype(dtype):
                frame[column] = frame[column].fillna(-1)

    def check(frame):
        """Return True when ``frame`` contains no null values at all."""
        return frame.isnull().sum().sum() == 0

    # Convert the occurrence timestamp to datetime (written back to the input frame).
    data['date_occurred'] = pd.to_datetime(data['date_occurred'])
    print(data['date_occurred'].dtype)

    # 1. Extract the relevant columns as independent copies.
    useful_data = data[useful_columns].copy()
    feature = data[feature_columns].copy()
    label = data[label_columns].copy()

    # 2. Fill the missing values.
    for frame in (useful_data, feature, label):
        fill_the_blank(frame)

    # 3. Hand the frames back only when no nulls are left.
    if check(useful_data) and check(feature) and check(label):
        print("空值已处理")
        return useful_data, feature, label

    raise ValueError("Some values are not valid.")


# Build the three working frames from the raw data: the combined frame with
# description columns, the feature frame, and the label frame.
useful_data, feature, label = get_usefulData_feature_label(data)

# Preview the first rows of the feature frame as the cell output.
feature.head()
    



datetime64[ns]
空值已处理


Unnamed: 0,date_occurred,area,victim_age,victim_sex,victim_descent,latitude,longitude
0,2020-01-08 22:30:00,3,36,F,B,34.0141,-118.2978
1,2020-01-01 03:30:00,1,25,M,H,34.0459,-118.2545
2,2020-02-13 12:00:00,1,0,X,X,34.0448,-118.2474
3,2020-01-01 17:30:00,15,76,F,W,34.1685,-118.4019
4,2020-01-01 04:15:00,19,31,X,X,34.2198,-118.4468
