## 4.3 问题三

### 4.3.1 分析与设计

本作品从行业、一级行政区划、学历要求和经验要求四个维度分析职位的薪酬待遇。为了确定上述维度对于薪酬待遇的影响程度，本作品使用决策树模型进行分析。分析结果表明，上述维度对薪酬待遇影响程度由高到低依次为：学历要求，行业，经验要求，一级行政区划。

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from joblib import dump
from math import sqrt
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import re
import numpy as np
from math import ceil
from collections import defaultdict

In [18]:
df = pd.read_csv("data/final/FixedSalaryFinal.csv")

In [19]:
df.head(5)

Unnamed: 0,_id,city,salary,experience,education,company,companyType,salary_type,salary_min,salary_max,salary_base,province,is_outlier
0,0021d67ace2b055296445ba037bb8014vw,B383,4-6K·15薪,EKk,GP,company_453698,type_uTAWZv,B15,60000,90000,15,B,False
1,0021d67ace2b055296445ba037bb8014vw,D672,4-5K,EdD,GP,company_419070,type_uTAWZv,M,48000,60000,12,D,False
2,0021d67ace2b055296445ba037bb8014vw,D832,2-4K,EdD,GP,company_592101,type_uTAWZv,M,24000,48000,12,D,False
3,0021d67ace2b055296445ba037bb8014vw,E027,3-4K,EdD,GP,company_415791,type_uTAWZv,M,36000,48000,12,E,False
4,0021d67ace2b055296445ba037bb8014vw,E727,3-4K,EdD,Gy,company_822152,type_lwxCGv,M,36000,48000,12,E,False


### 预处理

In [20]:
df.sort_values(by=["province", "city"], inplace=True)
df["new_city_code"] = None

current_code = 0
previous_province = None
previous_citycode = None
for index, row in df.iterrows():
    if row["province"] != previous_province:
        current_code = 0
    if index:
        if previous_citycode != row["city"]:
            current_code += 1

    df.at[index, "new_city_code"] = current_code
    previous_province = row["province"]
    previous_citycode = row["city"]

In [21]:
counter = defaultdict(int)
for index, row in df.iterrows():
    company_type = row["companyType"]
    first_ = company_type[5:6]
    counter[first_] += 1

    df.at[index, "companyType_first"] = first_

In [22]:
df

Unnamed: 0,_id,city,salary,experience,education,company,companyType,salary_type,salary_min,salary_max,salary_base,province,is_outlier,new_city_code,companyType_first
435,02120b8a907537fee739216562384f27Lf,A050,5-9K,EdD,GI,company_495997,type_ZcPYrP,M,60000,108000,12,A,False,1,Z
714,0267d6bf768486fee9e135bb99d78821Km,A050,10-15K,Eqh,GI,company_824862,type_fFlUqG,M,120000,180000,12,A,False,1,f
3410,04c26f94437c683d665b894696218457go,A050,5-10K,EdD,GP,company_797398,type_ruszQK,M,60000,120000,12,A,False,1,r
3538,050716a1e3496566de850636d9e7eb8dUa,A050,3-4K,EdD,GP,company_779366,type_kCfcsh,M,36000,48000,12,A,False,1,k
3856,0508d7be415a54eba97a968f8dacd55chL,A050,5-10K,EKk,GP,company_522422,type_fFlUqG,M,60000,120000,12,A,False,1,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154876,edb8f555c04b654d9b005d76b63ffc76rZ,Z885,5-6K,EdD,GP,company_601372,type_qGLNuz,M,60000,72000,12,Z,False,15,q
155868,efaa2b8bd2a6b5ba1400b8f9d5a4c302ub,Z885,4-6K,EdD,Go,company_782797,type_XqvUzw,M,48000,72000,12,Z,False,15,X
159443,f59c44a5dd839c3438d134bd4c30a69bJx,Z885,4-9K,EdD,GI,company_682974,type_lOdYUb,M,48000,108000,12,Z,False,15,l
160742,f693da772b6ba497215104c0b3c3f571hM,Z885,4-5K,EdD,GP,company_144052,type_PCcrub,M,48000,60000,12,Z,False,15,P


In [23]:
data = df[df["is_outlier"]]

## Model

In [24]:
categorical_cols = [
    "_id",
    "experience",
    "education",
    "province",
    "companyType_first",
    "companyType",
]
integer_col = ["new_city_code"]

onehot_encoder = OneHotEncoder(handle_unknown="ignore")
label_encoder = LabelEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", onehot_encoder, categorical_cols),
    ],
    remainder="passthrough",
)

### 数据集划分

In [25]:
X = data[categorical_cols]
y = data["salary_min"]

* Origin

In [26]:
# # 划分训练集和测试集
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # X_train = X_test = X
# # y_train = y_test = y

In [29]:
test_size = 0.2

X_train_list, y_train_list = [], []
X_test_list, y_test_list = [], []
X_columns = categorical_cols + integer_col
y_column = "salary_max"

grouped = data.groupby(["_id", "companyType_first", "province"])

rec_test_samples = []
for name, group in grouped:
    group_size = len(group)
    test_samples = max(1, ceil(test_size * group_size))
    rec_test_samples.append(test_samples)

    if test_samples == group_size:
        X_test_list.append(group[X_columns])
        y_test_list.append(group[y_column])
        X_train_list.append(group[X_columns])
        y_train_list.append(group[y_column])
    else:
        test_indices = group.sample(n=test_samples).index
        train_indices = group.index.difference(test_indices)

        X_test_list.append(group.loc[test_indices, X_columns])
        y_test_list.append(group.loc[test_indices, y_column])

        X_train_list.append(group.loc[train_indices, X_columns])
        y_train_list.append(group.loc[train_indices, y_column])

X_train = pd.concat(X_train_list, ignore_index=True)
y_train = pd.concat(y_train_list, ignore_index=True)
X_test = pd.concat(X_test_list, ignore_index=True)
y_test = pd.concat(y_test_list, ignore_index=True)

ValueError: No objects to concatenate

### 训练

In [15]:
model = DecisionTreeRegressor(random_state=10)

pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

we heave test the following rondom_state:
- 42
- 10(_id, cpT_f, province)+categorical_cols = ['_id' ,'experience', 'education', 'province', 'companyType_first', 'companyType'] + integer_col = ['new_city_code'] 

In [16]:
pipeline.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [None]:
onehot_feature_names = pipeline[:-1].get_feature_names_out()

In [None]:
y_pred = pipeline.predict(X_test)
y_trainpred = pipeline.predict(X_train)

In [None]:
results_df = pd.DataFrame({"y_pred": y_pred, "y_test": y_test})  # 预测值列  # 真实值列

results_train = pd.DataFrame(
    {
        "y_pred": y_trainpred,
        "y_train": y_train,
    }
)

In [None]:
results_df.head(60)

## Overview

### 模型信息查看

In [None]:
model.feature_importances_

In [None]:
dfx = pd.DataFrame(model.feature_importances_)
dfx

### 结果可视化

In [None]:
mse = mean_squared_error(y_pred, y_test)
mse_train = mean_squared_error(y_trainpred, y_train)

show = [mse, mse_train]
show

[6029831358.9854765, 148836859.96051523]

[5195698107.9117565, 100187161.25651084]

[1882556360.7046676, 73682938.51298605]

----

[3642066377.8527784, 304594929.26954085]


In [None]:
allDataLength = len(y_pred)
indices = np.arange(1000)

# plt.figure(figsize=(10, 6))
# plt.bar(indices, results_df["y_test"].head(300), width=0.4, color='g', align='center')
# plt.bar(indices + 0.4, results_df["y_pred"].head(300), width=0.4, color='b', align='center')

# plt.show()

In [None]:
results_df["delta"] = results_df["y_pred"] - results_df["y_test"]

plt.figure(figsize=(10, 6))
plt.bar(indices, results_df["delta"].head(1000), width=0.4, color="r", align="center")
plt.show()

In [None]:
indices = np.arange(1000)

results_train["delta"] = results_train["y_pred"] - results_train["y_train"]

plt.figure(figsize=(10, 6))
plt.bar(
    indices, results_train["delta"].head(1000), width=0.4, color="r", align="center"
)

In [None]:
num_features_used_by_model = model.tree_.n_features
if len(onehot_feature_names) < num_features_used_by_model:
    print("特征名称列表不完整，请添加缺失的特征名称。")

In [None]:
dfx["feature"] = onehot_feature_names
dfx.columns = ["value", "feature"]

In [None]:
dfx_sorted = dfx.sort_values(by=["value"], ascending=False)
dfx_sorted.head(50)

In [None]:
counter = defaultdict(int)

features = ["_id", "education", "experience", "province", "new_city_code"]
for index, row in dfx.iterrows():
    for feature in features:
        if feature in row["feature"]:
            counter[feature] += row["value"]

    if "companyType_first" in row["feature"]:
        counter["companyType_first"] += row["value"]
    elif "companyType" in row["feature"]:
        counter["companyType"] += row["value"]

result = pd.DataFrame.from_dict(counter, orient="index", columns=["count"]).rename_axis(
    "Feature"
)
result.columns = ["calc"]

In [None]:
plt.figure(figsize=(15, 10))

colors = ["#f0d9e6", "#e6c2d9", "#d9a9cc", "#cca0bf"]

painter_name = ["experience", "education", "province", "companyType"]
painter = [
    result.calc["experience"],
    result.calc["education"],
    result.calc["province"],
    result.calc["companyType"],
]
painterSum = 0
for x in painter:
    painterSum += x

for i in range(len(painter)):
    painter[i] = painter[i] / painterSum

painter

In [None]:
plt.figure(figsize=(15, 10))

colors = ["#f0d9e6", "#e6c2d9", "#cca0bf", "#a5708c"]

pie = plt.pie(painter, autopct="%1.3f%%", startangle=140, colors=colors)
plt.legend(pie[0], painter_name, loc="best")

plt.show()

In [None]:
plt.figure(figsize=(15, 10))

colors = ["#f0d9e6", "#e6c2d9", "#d9a9cc", "#cca0bf", "#bf88a5", "#a5708c"]

pie = plt.pie(result["count"], autopct="%1.3f%%", startangle=140, colors=colors)
legend_list = result.index.to_list()
legend_list[-1] = "city"
legend_list[0] = "job_titles"
plt.legend(pie[0], legend_list, loc="best")

plt.show()

In [None]:
legend_list

In [None]:
plt.figure(figsize=(15, 10))
plot_tree(model, feature_names=onehot_feature_names, filled=True)
plt.show()