1. Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import nbformat
from plotly.subplots import make_subplots

%config InlineBackend.figure_format = 'retina'

# Load the raw crime records from the CSV export.
data = pd.read_csv('Crime_Data_from_2020_to_Present.csv')
print(len(data))

# Dataset-level and per-column quality summary.
duplicate_rows = data.duplicated().sum()  # number of fully duplicated rows
missing_values = data.isnull().sum()  # missing-value count per column
unique_values = data.nunique()  # distinct-value count per column

print("重复行数：", duplicate_rows)
# Header labels are: column name, dtype, missing count, distinct count.
print("{:<18} {:<6} {:<6} {}".format("字段名称", "字段类型", "缺失值数量", "类别数量"))
row_template = "{:<20} {:<10} {:<10} {}"
for column, dtype, n_missing, n_unique in zip(
    data.columns, data.dtypes, missing_values, unique_values
):
    print(row_template.format(column, str(dtype), str(n_missing), str(n_unique)))

data.head()

852950
重复行数： 0
字段名称               字段类型   缺失值数量  类别数量
division_number      int64      0          852950
date_reported        object     0          1434
date_occurred        object     0          305841
area                 int64      0          21
area_name            object     0          21
reporting_district   int64      0          1206
part                 int64      0          2
crime_code           int64      0          138
crime_description    object     0          138
modus_operandi       object     118311     282656
victim_age           int64      0          103
victim_sex           object     112606     5
victim_descent       object     112614     20
premise_code         float64    10         313
premise_description  object     518        306
weapon_code          float64    556202     79
weapon_description   object     556202     79
status               object     0          6
status_description   object     0          6
crime_code_1         float64    11         140
crime_cod

Unnamed: 0,division_number,date_reported,date_occurred,area,area_name,reporting_district,part,crime_code,crime_description,modus_operandi,...,status,status_description,crime_code_1,crime_code_2,crime_code_3,crime_code_4,location,cross_street,latitude,longitude
0,10304468,2020-01-08,2020-01-08 22:30:00,3,Southwest,377,2,624,BATTERY - SIMPLE ASSAULT,0444 0913,...,AO,Adult Other,624.0,,,,1100 W 39TH PL,,34.0141,-118.2978
1,190101086,2020-01-02,2020-01-01 03:30:00,1,Central,163,2,624,BATTERY - SIMPLE ASSAULT,0416 1822 1414,...,IC,Invest Cont,624.0,,,,700 S HILL ST,,34.0459,-118.2545
2,200110444,2020-04-14,2020-02-13 12:00:00,1,Central,155,2,845,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,1501,...,AA,Adult Arrest,845.0,,,,200 E 6TH ST,,34.0448,-118.2474
3,191501505,2020-01-01,2020-01-01 17:30:00,15,N Hollywood,1543,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),0329 1402,...,IC,Invest Cont,745.0,998.0,,,5400 CORTEEN PL,,34.1685,-118.4019
4,191921269,2020-01-01,2020-01-01 04:15:00,19,Mission,1998,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0329,...,IC,Invest Cont,740.0,,,,14400 TITUS ST,,34.2198,-118.4468


2. Clean Data

In [2]:
# Parse the occurrence timestamp and split it into calendar/time features.
data["date_occurred"] = pd.to_datetime(data["date_occurred"])
occurred = data["date_occurred"].dt
for part in ("month", "day", "hour", "minute"):
    data[part] = getattr(occurred, part)

# Drop the columns that are not needed. The *_description columns
# (crime, premise, weapon, status) are deliberately kept.
unused_columns = [
    "date_occurred",
    "division_number",
    "date_reported",
    "area_name",
    "reporting_district",
    "part",
    "modus_operandi",
    "crime_code_1",
    "crime_code_2",
    "crime_code_3",
    "crime_code_4",
    "cross_street",
]
data.drop(columns=unused_columns, inplace=True)

# Drop every row that still contains a missing value.
# NOTE(review): per the recorded outputs this shrinks the data from 852950 to
# 296425 rows; weapon_code / weapon_description alone are missing in 556202
# rows, so most of the discarded records are those without a weapon entry.
data.dropna(inplace=True)
print(data.isnull().sum().sum())
print("Number of rows after removing missing values:", data.shape[0])

data.head()


0
Number of rows after removing missing values: 296425


Unnamed: 0,area,crime_code,crime_description,victim_age,victim_sex,victim_descent,premise_code,premise_description,weapon_code,weapon_description,status,status_description,location,latitude,longitude,month,day,hour,minute
0,3,624,BATTERY - SIMPLE ASSAULT,36,F,B,501.0,SINGLE FAMILY DWELLING,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AO,Adult Other,1100 W 39TH PL,34.0141,-118.2978,1,8,22,30
1,1,624,BATTERY - SIMPLE ASSAULT,25,M,H,102.0,SIDEWALK,500.0,UNKNOWN WEAPON/OTHER WEAPON,IC,Invest Cont,700 S HILL ST,34.0459,-118.2545,1,1,3,30
5,1,121,"RAPE, FORCIBLE",25,F,H,735.0,NIGHT CLUB (OPEN EVENINGS ONLY),500.0,UNKNOWN WEAPON/OTHER WEAPON,IC,Invest Cont,700 S BROADWAY,34.0452,-118.2534,1,1,0,30
10,1,330,BURGLARY FROM VEHICLE,29,M,A,101.0,STREET,306.0,ROCK/THROWN OBJECT,IC,Invest Cont,15TH,34.0359,-118.2648,1,4,22,0
11,1,930,CRIMINAL THREATS - NO WEAPON DISPLAYED,35,M,O,108.0,PARKING LOT,511.0,VERBAL THREAT,IC,Invest Cont,800 N ALAMEDA ST,34.0615,-118.2412,1,5,9,55


3. Data Overview

In [3]:
# Group by `area` and count the crimes in each one; every area is plotted at
# the mean latitude/longitude of its records.
district_crime_counts = (
    data.groupby('area')
    .agg(
        latitude=('latitude', 'mean'),
        longitude=('longitude', 'mean'),
        counts=('area', 'count'),
    )
    .reset_index()
)

# Marker size and colour both encode the number of crimes in the area.
map_options = dict(
    lat="latitude",
    lon="longitude",
    size="counts",
    color="counts",
    hover_name="area",
    color_continuous_scale="jet",
    hover_data=["counts", "latitude", "longitude"],
    zoom=9,
    height=750,
    width=1_200,
    title="Map of LA Crime Counts by District",
)
fig = px.scatter_mapbox(district_crime_counts, **map_options)
fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [4]:
# Relationship between crime counts and month / day of month.
daily_crime_counts = (
    data.groupby(["month", "day"]).size().reset_index(name="crime_count")
)
# Rows are days of the month, columns are months; (month, day) pairs that do
# not occur in the data are left as NaN by the pivot.
pivot_table = daily_crime_counts.pivot(
    index="day", columns="month", values="crime_count"
)

monthly_crime_counts = data["month"].value_counts().sort_index()

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Daily Crime Frequency by Month", "Monthly Crime Distribution"),
)

# Left panel: one line per month, labelled with the month number.
for month in pivot_table.columns:
    fig.add_trace(
        go.Scatter(
            x=pivot_table.index,
            y=pivot_table[month],
            mode="lines",
            name=str(month),
        ),
        row=1,
        col=1,
    )

# Right panel: total crimes per month. The bar trace is kept out of the legend;
# otherwise it is listed under an auto-generated "trace N" label next to the
# month lines, which the legend is meant to identify.
fig.add_trace(
    go.Bar(
        x=monthly_crime_counts.index,
        y=monthly_crime_counts.values,
        marker_color="dodgerblue",
        name="Monthly total",
        showlegend=False,
    ),
    row=1,
    col=2,
)

fig.update_layout(
    height=600,
    width=1200,
    template="plotly_white",
    showlegend=True,
    legend_title_text="Month",
)
day_ticks = list(range(1, 32))
fig.update_xaxes(
    title_text="Day",
    row=1,
    col=1,
    tickmode="array",
    tickvals=day_ticks,
    ticktext=day_ticks,
)
fig.update_xaxes(title_text="Month", row=1, col=2)
fig.update_yaxes(title_text="Number of Crimes", row=1, col=1)
fig.update_yaxes(title_text="Number of Crimes", row=1, col=2)

fig.show()

In [5]:
# Relationship between crime counts and hour of the day.
hourly_crime_counts = data["hour"].value_counts().sort_index()

bar_options = dict(
    x=hourly_crime_counts.index,
    y=hourly_crime_counts.values,
    labels={"x": "Hour of the Day (0-23)", "y": "Number of Crimes"},
    color_discrete_sequence=["dodgerblue"],
)
fig = px.bar(**bar_options)

# Title and theme; a single series needs no legend.
fig.update_layout(
    title="Crime Distribution by Hour of the Day",
    template="plotly_white",
    showlegend=False,
)

# Force one labelled tick per hour.
hour_ticks = list(range(24))
fig.update_xaxes(
    tickmode="array",
    tickvals=hour_ticks,
    ticktext=list(map(str, hour_ticks)),
)

fig.show()

In [6]:
# Distribution of victim age.
histogram_layout = dict(
    title_text="Distribution of Victim Age",
    xaxis_title_text="Victim Age",
    yaxis_title_text="Frequency",
    bargap=0.2,
    template="plotly_white",
)

fig = px.histogram(
    data,
    x="victim_age",
    nbins=30,
    color_discrete_sequence=["dodgerblue"],
)
fig.update_layout(**histogram_layout)
fig.show()

In [7]:
# Replace non-positive victim ages with values sampled from the observed
# positive ages, so the imputed values follow the empirical age distribution.
# A seeded generator keeps the imputation reproducible across kernel restarts,
# and a single vectorised draw replaces the per-row `apply`.
AGE_IMPUTATION_SEED = 42
age_rng = np.random.default_rng(AGE_IMPUTATION_SEED)

valid_age_values = data.loc[data["victim_age"] > 0, "victim_age"].to_numpy()
invalid_age_mask = data["victim_age"] <= 0
n_invalid_ages = int(invalid_age_mask.sum())
if n_invalid_ages:
    # Raises ValueError if there are no positive ages to sample from.
    data.loc[invalid_age_mask, "victim_age"] = age_rng.choice(
        valid_age_values, size=n_invalid_ages
    )

# Sanity check: every age is positive after imputation.
assert (data["victim_age"] > 0).all()

# Re-plot the age distribution after imputation.
fig = px.histogram(
    data, x="victim_age", nbins=30, color_discrete_sequence=["dodgerblue"]
)

fig.update_layout(
    title_text="Distribution of Victim Age",
    xaxis_title_text="Victim Age",
    yaxis_title_text="Frequency",
    bargap=0.2,
    template="plotly_white",
)

fig.show()

In [8]:
# Distribution of victim sex and victim descent.
victim_sex_data = data["victim_sex"].value_counts()
victim_descent_data = data["victim_descent"].value_counts()
total_cases = victim_descent_data.sum()

fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "bar"}]])

# Pie chart of victim sex.
fig.add_trace(
    go.Pie(
        labels=victim_sex_data.index,
        values=victim_sex_data,
        title="Victim Sex Distribution",
        textinfo="label+percent",
        insidetextorientation="radial",
    ),
    row=1,
    col=1,
)

# Horizontal bar chart of victim descent. Sorting ascending places the largest
# category last, which a horizontal bar chart draws at the top.
victim_descent_data = victim_descent_data.sort_values(ascending=True)

fig.add_trace(
    go.Bar(
        x=victim_descent_data.values,
        y=victim_descent_data.index,
        orientation="h",
        marker_color="dodgerblue",
        # Label each bar with its count and its share of all cases.
        text=[
            f"{count} ({count/total_cases:.2%})" for count in victim_descent_data.values
        ],
        textposition="outside",
    ),
    row=1,
    col=2,
)

fig.update_layout(
    title_text="Victim Sex and Descent Distribution",
    template="plotly_white",
    showlegend=False,
    height=600,
)

# The bars are horizontal: counts run along x, descent codes along y.
fig.update_xaxes(title_text="Number of Cases", row=1, col=2)
fig.update_yaxes(title_text="Victim Descent", row=1, col=2)

fig.show()

In [9]:
# Distribution of crime descriptions, weapons, premises and case status.

# Top 10 most common crime descriptions (excluding 'Unknown').
top_crimes = (
    data.loc[data["crime_description"] != "Unknown", "crime_description"]
    .value_counts()
    .head(10)
)

# Top 10 most common weapons (excluding 'Unknown', 'UNKNOWN WEAPON/OTHER WEAPON').
excluded_weapons = ["Unknown", "UNKNOWN WEAPON/OTHER WEAPON"]
top_weapons = (
    data.loc[~data["weapon_description"].isin(excluded_weapons), "weapon_description"]
    .value_counts()
    .head(10)
)

# Top 10 most common premises (excluding 'Unknown').
top_premises = (
    data.loc[data["premise_description"] != "Unknown", "premise_description"]
    .value_counts()
    .head(10)
)

# Most common status descriptions (excluding 'Unknown'), capped at 10.
top_status = (
    data.loc[data["status_description"] != "Unknown", "status_description"]
    .value_counts()
    .head(10)
)

# One panel per ranking: (subplot title, counts, bar colour), top to bottom.
panels = [
    ("Top 10 Crime Descriptions", top_crimes, "dodgerblue"),
    ("Top 10 Weapons Used in Crimes", top_weapons, "coral"),
    ("Top 10 Premises", top_premises, "mediumseagreen"),
    ("Top 10 Status", top_status, "red"),
]

# Figure with four stacked subplots.
fig = make_subplots(
    rows=4,
    cols=1,
    subplot_titles=tuple(panel_title for panel_title, _, _ in panels),
)

# Horizontal bar chart for each ranking.
for row_number, (_, counts, bar_color) in enumerate(panels, start=1):
    fig.add_trace(
        go.Bar(
            x=counts.values,
            y=counts.index,
            orientation="h",
            marker_color=bar_color,
        ),
        row=row_number,
        col=1,
    )

fig.update_layout(
    height=1600,
    showlegend=False,
    template="plotly_white",
    title_text="Top 10 Crime Descriptions, Weapons Used, Premises in Crimes and Status",
)

for row_number in range(1, len(panels) + 1):
    # Reverse each y-axis so the highest count is drawn at the top.
    fig.update_yaxes(autorange="reversed", row=row_number, col=1)
    fig.update_xaxes(title_text="Number of Cases", row=row_number, col=1)

fig.show()

4. Algorithms

(1) MLPTest

In [None]:
# MLPTest
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Work on a copy so the label-encoded columns do not overwrite `data`.
data_1 = data.copy()
print(len(data))

# The free-text descriptions are dropped here; the matching *_code / status
# columns remain in the frame.
data_1.drop(
    columns=[
        "crime_description",
        "premise_description",
        "weapon_description",
        "status_description",
    ],
    inplace=True,
)
print(data.isnull().sum().sum())
print(data_1.isnull().sum().sum())

# Label-encode each categorical column with one shared encoder, recording an
# {encoded index: original value} mapping for every column. After the loop
# `le` is left fitted on "status", the last column encoded.
le = LabelEncoder()
label_mappings = {}
for column in (
    "victim_sex",
    "victim_descent",
    "weapon_code",
    "premise_code",
    "crime_code",
    "status",
):
    data_1[column] = le.fit_transform(data_1[column])
    label_mappings[column] = dict(enumerate(le.classes_))

mapping_sex = label_mappings["victim_sex"]
mapping_descent = label_mappings["victim_descent"]
mapping_weapon = label_mappings["weapon_code"]
mapping_premise_code = label_mappings["premise_code"]
mapping_crime_code = label_mappings["crime_code"]
mapping_status = label_mappings["status"]


def check(data):
    """Return whether ``data`` is free of missing values.

    Args:
        data: A pandas DataFrame.

    Returns:
        A truthy value when the total number of null cells is zero,
        a falsy value otherwise.
    """
    missing_total = data.isna().sum().sum()
    return missing_total == 0


# Further processing: split features/labels and convert them to tensors.
def get_train_test_dataset(df_feature,df_label):
    """Split features and labels 80/20 and return them as PyTorch tensors.

    Args:
        df_feature: Array-like of feature rows; converted to a float array.
        df_label: Array-like of integer class labels, one per feature row.

    Returns:
        A tuple ``(features_train, features_test, labels_train, labels_test)``.
        The feature tensors are float32 and the label tensors are LongTensors.
    """
    feature_matrix = np.array(df_feature).astype(float)
    label_vector = np.array(df_label)

    # Fixed random_state so the split is the same on every run.
    x_train, x_test, y_train, y_test = train_test_split(
        feature_matrix, label_vector, test_size=0.2, random_state=42
    )

    return (
        torch.tensor(x_train).float(),
        torch.tensor(x_test).float(),
        torch.LongTensor(y_train),
        torch.LongTensor(y_test),
    )


# Placeholders; both are overwritten below once the null check passes.
feature=[]
label=[]

# Refuse to continue if the encoded frame still contains missing values.
if not check(data_1):
    raise ValueError("数据集存在空值")

feature_columns = [
    "month",
    "day",
    "area",
    "victim_age",
    "victim_sex",
    "victim_descent",
    "latitude",
    "longitude",
]
feature = data_1[feature_columns].copy()

# Prediction target.
# NOTE(review): the original note here listed "hour", "crime_code",
# "premise_code", "weapon_code", "status" - presumably other candidate
# columns; confirm with the author.
label = data_1["status"].copy()

# Build the training and test tensors.
features_train, features_test, labels_train, labels_test = get_train_test_dataset(feature,label)


# Multi-layer perceptron model.
class ImprovedMLP(nn.Module):
    """Feed-forward classifier built from a configurable stack of hidden blocks.

    Each hidden block is ``Linear -> ReLU -> BatchNorm1d -> Dropout``. The
    output layer returns raw logits; no softmax is applied.

    Args:
        input_dim: Number of input features.
        hidden_dims: Sequence of hidden-layer widths. May be empty, in which
            case the model is a single linear layer from input to output.
        output_dim: Number of output classes (logits).
        dropout_rate: Dropout probability used after every hidden block.
    """

    def __init__(self, input_dim, hidden_dims, output_dim, dropout_rate):
        super().__init__()

        self.hidden_layers = nn.ModuleList()
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            self.hidden_layers.append(nn.Linear(prev_dim, hidden_dim))
            self.hidden_layers.append(nn.ReLU())
            self.hidden_layers.append(nn.BatchNorm1d(hidden_dim))
            self.hidden_layers.append(nn.Dropout(dropout_rate))
            prev_dim = hidden_dim

        # Output layer. `prev_dim` is the width of the last hidden layer, or
        # `input_dim` when `hidden_dims` is empty; indexing `hidden_dims[-1]`
        # would raise IndexError in that case.
        self.output_layer = nn.Linear(prev_dim, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)`` for input ``x``."""
        for layer in self.hidden_layers:
            x = layer(x)
        return self.output_layer(x)
    

# ### Create the MLP model instance

# Seed torch so weight initialisation, dropout and DataLoader shuffling are
# reproducible across kernel restarts.
torch.manual_seed(42)

# Hidden-layer widths and output dimension depend on the target label.
# Settings noted for the other candidate targets:
#   weapon_code:  hidden_dims = [16, 32, 64, 64],        output_dim = 79
#   premise_code: hidden_dims = [16, 32, 64, 128, 256],  output_dim = 305
#   crime_code:   hidden_dims = [16, 32, 64, 128],       output_dim = 138

# status
hidden_dims = [12, 16, 12, 8]
output_dim = 6  # number of output classes

# Take the input width from the feature tensor, so it cannot drift from the
# selected feature columns.
input_dim = features_train.shape[1]
learn_rate = 0.01  # learning rate
dropout_rate = 0.2  # dropout probability
model = ImprovedMLP(input_dim, hidden_dims, output_dim,dropout_rate)

# Every encoded label must be a valid class index for the output layer.
assert int(torch.cat([labels_train, labels_test]).max()) < output_dim, (
    "output_dim is smaller than the number of label classes"
)

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate)

# Training dataset and loader (reshuffled every epoch).
train_dataset = TensorDataset(features_train, labels_train)
batch_size = 500  # mini-batch size
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test dataset and loader (fixed order).
test_dataset = TensorDataset(features_test, labels_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training loop: one pass over the training set per epoch, followed by an
# evaluation pass over the test set.
num_epochs = 100  # number of training epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for vectors, labels in train_dataloader:

        optimizer.zero_grad()

        logits = model(vectors)

        loss = criterion(logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch training losses.
    train_loss = total_loss / len(train_dataloader)

    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    # Evaluation needs no gradients; without no_grad() every forward pass
    # would build an autograd graph and waste memory and time.
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            logits = model(inputs)

            loss = criterion(logits, labels)
            total_loss += loss.item()
            predicted = logits.argmax(dim=1)

            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    # Mean per-batch test loss and overall accuracy.
    test_loss = total_loss / len(test_dataloader)
    test_accuracy = correct / total

    # Report progress every 29 epochs (epoch index 1, 30, 59, 88) and always
    # for the final epoch, so the end-of-training metrics are shown.
    if epoch%29 == 1 or epoch == num_epochs - 1:
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {train_loss} - TestLoss: {test_loss:.4f} - Accuracy: {test_accuracy:.2f}")

