In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

%config InlineBackend.figure_format = 'svg'

In [2]:
# Load the job postings from the project-relative CSV and preview the first rows.
job = pd.read_csv('data/job.csv')
job.head()

Unnamed: 0,job_name,company_name,companytype_text,providesalary_text,workarea_text,jobwelf,updatedate,attribute_text
0,战略专家,美团,上市公司,,北京-朝阳区,,10-14,"['北京-朝阳区', '5-7年经验', '招1人']"
1,45352-研发项目管理组长,深圳市腾讯计算机系统有限公司,民营公司,,深圳,,10-14,"['深圳', '3-4年经验', '本科', '招1人']"
2,策略专员-账号,腾讯科技（北京）有限公司,民营公司,1-2万/月,北京-海淀区,六险一金 餐饮补贴 交通补贴 专业培训 年终奖金 班次津贴,10-14,"['北京-海淀区', '1年经验', '本科', '招8人']"
3,销售助理,新浪上海,合资,6-8千/月,上海-静安区,五险一金 补充公积金 节日福利 定期体检,10-14,"['上海-静安区', '无需经验', '本科', '招若干人']"
4,城市经理,美团,上市公司,3-4.5万/月,天津,,10-14,"['天津', '5-7年经验', '大专', '招1人']"


In [3]:
# Show dtypes and non-null counts per column to spot missing values.
job.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_name            2500 non-null   object
 1   company_name        2500 non-null   object
 2   companytype_text    2500 non-null   object
 3   providesalary_text  2496 non-null   object
 4   workarea_text       2500 non-null   object
 5   jobwelf             2083 non-null   object
 6   updatedate          2500 non-null   object
 7   attribute_text      2500 non-null   object
dtypes: object(8)
memory usage: 156.4+ KB


## 数据清洗

In [4]:
# Rows with no salary text are to be removed: collect their index labels.
null_index = job.index[job.providesalary_text.isna()]

In [5]:
# Remove the rows whose salary text is missing; `job` is modified in place.
job.drop(null_index, inplace=True)

In [6]:
# Re-check the non-null counts after dropping the rows without salary text.
job.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2496 entries, 2 to 2499
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_name            2496 non-null   object
 1   company_name        2496 non-null   object
 2   companytype_text    2496 non-null   object
 3   providesalary_text  2496 non-null   object
 4   workarea_text       2496 non-null   object
 5   jobwelf             2082 non-null   object
 6   updatedate          2496 non-null   object
 7   attribute_text      2496 non-null   object
dtypes: object(8)
memory usage: 175.5+ KB


In [7]:
# Give the rows a fresh 0..n-1 index after the drop. Because drop=True is not
# passed, the previous labels are kept as a new `index` column.
job.reset_index(inplace=True)

In [None]:
# List the distinct salary strings to see which formats need handling.
job.providesalary_text.unique()

In [9]:
# Drop outliers: a few postings quote pay per day or per hour and must be removed.
# Collect the positional row numbers of those postings.
delete_index = [
    position
    for position, salary in enumerate(job.providesalary_text.tolist())
    if salary.endswith(('天', '小时'))
]
delete_index

[880, 1902, 2047, 2052, 2251, 2371]

In [10]:
# Inspect the rows flagged for removal (positional lookup).
job.iloc[delete_index, :]

Unnamed: 0,index,job_name,company_name,companytype_text,providesalary_text,workarea_text,jobwelf,updatedate,attribute_text
880,884,运营助理（***，实习）,上海风许网络科技有限公司,民营公司,150元/天,上海-闵行区,五险一金 交通补贴 餐饮补贴 专业培训 年终奖金 弹性工作,10-14,"['上海-闵行区', '无需经验', '大专', '招2人']"
1902,1906,客服专员（调度team）深夜班,建顺信息科技（广州）有限公司,外资（非欧美）,26元/小时,广州-天河区,弹性工作 专业培训,10-14,"['广州-天河区', '无需经验', '中专', '招2人']"
2047,2051,产品修图,成都部落窝科技有限公司,民营公司,120元/小时,成都-成华区,绩效奖金 弹性工作,10-14,"['成都-成华区', '2年经验', '大专', '招2人']"
2052,2056,电商平台上架产品,珠海市鲜茗叶茶业有限公司,创业公司,110元/天,中山-坦洲镇,,10-14,"['中山-坦洲镇', '无需经验', '招1人']"
2251,2255,天猫直播客服,优盘联队（苏州）电子商务有限公司,民营公司,25元/小时,苏州-工业园区,,10-14,"['苏州-工业园区', '无需经验', '高中', '招5人']"
2371,2375,售前客服,极易电商,民营公司,120元/天,苏州-吴中区,五险一金 员工旅游 绩效奖金 弹性工作 定期体检,10-14,"['苏州-吴中区', '无需经验', '招40人']"


In [11]:
# Drop the day/hour-rate rows in place.
# NOTE(review): `delete_index` holds enumerate() positions while drop() takes index
# labels; the two coincide only if the index was reset to 0..n-1 beforehand — confirm
# the reset cell has run before this one.
job.drop(delete_index, inplace=True)

## 数据预处理

In [14]:
# Add the `province` column, seeded with the part of the work area before the first '-'.
job['province'] = job.workarea_text.map(lambda area: area.partition('-')[0])

In [15]:
# Load the province/city lookup table and strip the leading spaces from the
# values of its ' 城市' column.
province_city = pd.read_csv('data/province_city.csv')
province_city[' 城市'] = province_city[' 城市'].map(lambda city: city.lstrip(' '))

In [16]:
# Remove the leading space from the city column's name ('省份' is already clean).
province_city.rename(columns={' 城市': '城市'}, inplace=True)

In [17]:
# Map each city name to its province; a later row wins if a city name repeats.
province_city_dict = dict(zip(province_city['城市'], province_city['省份']))

In [18]:
def replace_province(x):
    """Translate a city name into the province it belongs to.

    Values that already end in '省', and the four names '北京', '上海', '重庆'
    and '天津', are returned unchanged. Anything else is looked up in the
    module-level ``province_city_dict``.

    Raises:
        KeyError: if ``x`` is not a key of ``province_city_dict``.
    """
    already_province = x.endswith('省') or x in ('北京', '上海', '重庆', '天津')
    return x if already_province else province_city_dict[x]

In [19]:
# Replace each city in `province` with the province returned by replace_province.
job['province'] = job.province.map(replace_province)

In [107]:
def handle_province(x):
    """Return ``x`` with any trailing '省' characters removed.

    Names that do not end in '省' are returned unchanged.
    """
    return x.rstrip('省') if x.endswith('省') else x

In [108]:
# Normalise the province column by removing the '省' suffix.
job['province'] = job.province.map(handle_province)

In [109]:
# Preview the frame after the province clean-up.
job.head()

Unnamed: 0,index,job_name,company_name,companytype_text,providesalary_text,workarea_text,jobwelf,updatedate,attribute_text,province,salary
0,2,策略专员-账号,腾讯科技（北京）有限公司,民营公司,1-2万/月,北京-海淀区,六险一金 餐饮补贴 交通补贴 专业培训 年终奖金 班次津贴,10-14,"['北京-海淀区', '1年经验', '本科', '招8人']",北京,15.0
1,3,销售助理,新浪上海,合资,6-8千/月,上海-静安区,五险一金 补充公积金 节日福利 定期体检,10-14,"['上海-静安区', '无需经验', '本科', '招若干人']",上海,7.0
2,4,城市经理,美团,上市公司,3-4.5万/月,天津,,10-14,"['天津', '5-7年经验', '大专', '招1人']",天津,37.5
3,5,国企高薪诚聘电子商务,太平洋保险在线服务科技有限公司山东分公司,国企,4-8千/月,淄博,五险一金 交通补贴 餐饮补贴 年终奖金 绩效奖金 企业年金 商业保险 周末双休 职工餐厅 带薪培训,10-14,"['淄博', '无需经验', '大专', '招20人']",山东,6.0
4,6,市场营销经理,网易,上市公司,2-3.5万/月,上海-浦东新区,五险一金 包吃 包三餐 带薪年假 交通补贴 定期体检 商业保险 年终奖金,10-14,"['上海-浦东新区', '5-7年经验', '本科', '招1人']",上海,27.5


In [82]:
def handle_salary(x):
    """Convert a salary range string into its midpoint in thousands per month.

    Supported format: ``"<low>-<high><unit>/<period>"`` where ``unit`` is
    '千' (thousand) or '万' (ten thousand) and ``period`` is '月' (month) or
    '年' (year), e.g. ``'6-8千/月'``, ``'1-2万/月'``, ``'12-24万/年'``.

    Yearly figures are divided by 12 so every result is on the same monthly
    scale. Previously they were silently read as thousands per month.

    Args:
        x: salary text such as ``'3-4.5万/月'``.

    Returns:
        float: midpoint of the range, in thousands per month.

    Raises:
        ValueError: if the text is not a supported range (for example a
            per-day or per-hour rate, or a missing upper bound).
    """
    amount, _, period = x.strip().partition('/')
    # The last character of the amount is the magnitude unit.
    unit_factor = {'千': 1, '万': 10}.get(amount[-1:])
    low_text, dash, high_text = amount[:-1].partition('-')
    if unit_factor is None or not dash or period not in ('月', '年'):
        raise ValueError(f'unsupported salary format: {x!r}')

    min_salary = float(low_text) * unit_factor
    max_salary = float(high_text) * unit_factor
    average = (min_salary + max_salary) / 2
    # Bring yearly pay onto the monthly scale used by every other row.
    return average / 12 if period == '年' else average

In [88]:
# Add the per-posting average salary column computed by handle_salary.
job['salary'] = job.providesalary_text.map(handle_salary)

In [111]:
# Count postings per province as a two-column frame: `index` holds the province
# name and `province` holds the count. The column names are set explicitly
# because the defaults produced by value_counts().reset_index() changed in
# pandas 2.0 (to `province` / `count`), which would break later lookups of
# temp['index'] and temp['province'].
temp = (
    job.province.value_counts()
    .rename_axis('index')
    .reset_index(name='province')
)
temp

Unnamed: 0,index,province
0,广东,1084
1,上海,369
2,浙江,242
3,江苏,197
4,湖北,116
5,四川,90
6,北京,53
7,湖南,48
8,安徽,42
9,河南,32


## 数据透视和可视化

### 岗位标题词云

In [124]:
# Read the stop-word file: one entry per line, newline characters stripped.
with open('data/stopword.txt', 'r', encoding='utf-8') as file:
    stopword_list = []
    for line in file:
        stopword_list.append(line.strip('\n'))

In [125]:
import jieba

# Join the titles with a newline rather than an empty string, so the end of one
# title and the start of the next cannot be merged into a single token.
content = '\n'.join(job['job_name'])
# Tokenise the combined text.
words = jieba.lcut(content)
# Use a set for the stop-word test (a list is scanned linearly for every token),
# and drop the whitespace-only tokens produced by the separators.
stopword_set = set(stopword_list)
words = [word for word in words if word.strip() and word not in stopword_set]

In [129]:
# 绘制词云图
from wordcloud import WordCloud
from PIL import Image

txt = ' '.join(words)
# Open the mask image in a context manager so its file handle is released once
# the pixels have been copied into the array.
with Image.open('image/China_map.jpg') as mask_image:
    mask = np.array(mask_image)
# Build the word-cloud object.
# NOTE(review): the wordcloud docs say width/height are ignored when a mask is
# supplied — confirm whether these two arguments have any effect here.
wc = WordCloud(
    font_path='font/SimHei.ttf',
    mask=mask,
    width=1200,
    height=800,
    background_color='white',
    max_words=100
)
wc.generate(txt)
wc.to_file('result1.png')

<wordcloud.wordcloud.WordCloud at 0x1e105d4c2b0>

<img src="result1.png">

### 岗位热力地图

In [113]:
from pyecharts import options as opts
from pyecharts.charts import Map
from pyecharts.faker import Faker
from pyecharts.globals import ThemeType

# Piecewise buckets for the visual map, built from consecutive boundaries;
# the final bucket has no upper bound.
piece_bounds = [0, 30, 60, 100, 150, 300, 400]
pieces = [{"min": low, "max": high} for low, high in zip(piece_bounds, piece_bounds[1:])]
pieces.append({"min": piece_bounds[-1]})

# [name, value] pairs taken from the `index` and `province` columns of `temp`.
province_counts = [[name, count] for name, count in zip(temp['index'], temp['province'])]

# NOTE(review): the name `map` shadows the builtin; it is kept unchanged so any
# other reference to it keeps working.
map = (
    Map(init_opts=opts.InitOpts(theme=ThemeType.WONDERLAND))
    .add("", province_counts, "china")
    .set_global_opts(
        title_opts=opts.TitleOpts(title="岗位地区分布"),
        visualmap_opts=opts.VisualMapOpts(max_=400, is_piecewise=True, pieces=pieces),
    )
)

map.render_notebook()

In [137]:
# Top-10 provinces by mean salary, rounded to two decimals.
# DataFrame.round replaces the element-wise applymap(round) call, which is
# deprecated in recent pandas releases.
temp = (
    pd.pivot_table(job, index='province', values='salary', aggfunc='mean')
    .nlargest(10, 'salary')
    .round(2)
)

In [141]:
from pyecharts import options as opts
from pyecharts.charts import Bar


# Value axis for the chart: fixed 0-15 range in steps of 3, labels suffixed with " k".
salary_axis = opts.AxisOpts(
    name="平均薪资",
    type_="value",
    min_=0,
    max_=15,
    interval=3,
    axislabel_opts=opts.LabelOpts(formatter="{value} k"),
    axistick_opts=opts.AxisTickOpts(is_show=True),
    splitline_opts=opts.SplitLineOpts(is_show=True),
)

# One bar per row of `temp`: index values on the x axis, `salary` on the y axis.
b = (
    Bar()
    .add_xaxis(temp.index.tolist())
    .add_yaxis("", temp.salary.tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="省份平均薪资Top-10", subtitle=""),
        yaxis_opts=salary_axis,
    )
)
b.render_notebook()


In [240]:
# The five provinces with the most postings (count of companytype_text per province).
temp = (
    pd.pivot_table(job, index='province', values='companytype_text', aggfunc='count')
    .nlargest(5, columns='companytype_text')
)
temp

Unnamed: 0_level_0,companytype_text
province,Unnamed: 1_level_1
广东,1084
上海,369
浙江,242
江苏,197
湖北,116


In [241]:
# Keep only the postings whose province appears in temp's index.
temp1 = job[job['province'].isin(temp.index)]

In [242]:
# Posting counts by province (rows) and company type (columns) for the selected
# provinces; combinations with no postings become 0.
temp = pd.pivot_table(
    temp1,
    index='province',
    columns='companytype_text',
    values='company_name',
    aggfunc='count',
).fillna(0)
temp

companytype_text,上市公司,事业单位,创业公司,合资,国企,外企代表处,外资（欧美）,外资（非欧美）,民营公司,非营利组织
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
上海,21.0,0.0,4.0,26.0,6.0,1.0,11.0,20.0,280.0,0.0
广东,28.0,1.0,33.0,28.0,11.0,1.0,13.0,30.0,938.0,1.0
江苏,9.0,0.0,4.0,13.0,5.0,0.0,6.0,6.0,154.0,0.0
浙江,4.0,0.0,7.0,3.0,1.0,0.0,3.0,5.0,219.0,0.0
湖北,7.0,0.0,3.0,1.0,3.0,0.0,2.0,3.0,97.0,0.0


In [243]:
# Turn the counts into each row's share of its row total, rounded to 3 decimals.
# DataFrame.round replaces the element-wise applymap(round) call, which is
# deprecated in recent pandas releases.
temp = temp.div(temp.sum(axis=1), axis=0).round(3)

In [244]:
# Display the per-province company-type shares.
temp

companytype_text,上市公司,事业单位,创业公司,合资,国企,外企代表处,外资（欧美）,外资（非欧美）,民营公司,非营利组织
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
上海,0.057,0.0,0.011,0.07,0.016,0.003,0.03,0.054,0.759,0.0
广东,0.026,0.001,0.03,0.026,0.01,0.001,0.012,0.028,0.865,0.001
江苏,0.046,0.0,0.02,0.066,0.025,0.0,0.03,0.03,0.782,0.0
浙江,0.017,0.0,0.029,0.012,0.004,0.0,0.012,0.021,0.905,0.0
湖北,0.06,0.0,0.026,0.009,0.026,0.0,0.017,0.026,0.836,0.0


### 前五位公司聚集地各类型公司发展情况

In [248]:
# Stacked bar chart: x axis from temp's index, one stacked series per column of temp.
d = Bar().add_xaxis(temp.index.tolist())
for company_type in temp.columns:
    d.add_yaxis(company_type, temp[company_type].tolist(), stack="stack1")
# Hide the per-segment value labels and leave the title empty.
d.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
d.set_global_opts(title_opts=opts.TitleOpts(title=""))
d.render_notebook()
