# Preparation

In [1]:
import os
from hanlp_restful import HanLPClient
import pandas as pd
from docx import Document
from tqdm import tqdm
import time
import ast
from stylecloud import gen_stylecloud

import seaborn as sns

In [2]:
# Hard-coded list of the round-3 interview transcript files (.docx) to process.
# The commented-out line below shows the directory listing this list stands in for.
# The leading number runs from 1 to 19 with no entry for 16; some names use '+' or '-'
# instead of a second underscore between the team code and the interviewee alias.
# file_list = os.listdir("data/Interview transcripts-round 3")
file_list = ['1_SZUZJY_来深33年邹先生.docx',
 '1_SZUZJY_深圳本地人林女士.docx',
 '2_JNU LLW_DB.docx',
 '2_JNU LLW_LXS.docx',
 '2_JNU LLW_RRAY.docx',
 '2_JNU LLW_SYJZ.docx',
 '2_JNU LLW_XBB.docx',
 '3_SZTUWWX_李先生.docx',
 '3_SZTUWWX_许女士.docx',
 '3_SZTUWWX_谭女士霍先生.docx',
 '3_SZTUWWX_邓先生王女士.docx',
 '3_SZTUWWX_陈女士.docx',
 '4_UT_LJY_UU.docx',
 '4_UT_LJY_WW.docx',
 '4_UT_LJY_XW.docx',
 '5_CS+刘阿姨.docx',
 '5_CS+尤老师.docx',
 '5_CS+钟老师.docx',
 '5_CS+黄先生.docx',
 '6_szulys_nw.docx',
 '6_szulys_乔麦.docx',
 '7_SZUF_LT.docx',
 '7_SZUF_坚.docx',
 '7_SZUF_大花.docx',
 '8_hkust_wdq_q.docx',
 '8_hkust_wdq_yyq.docx',
 '9_SZUCJH_发哥.docx',
 '9_SZUCJH_小贾.docx',
 '9_SZUCJH_小陈.docx',
 '9_SZUCJH_老陈.docx',
 '10_SZTU-lx_何爷爷.docx',
 '10_SZTU-lx_凹凸曼&李先生.docx',
 '11_SZUH_LEI.docx',
 '11_SZUH_YUNWEI.docx',
 '12_SZUL_Roson.docx',
 '12_SZUL_夏天.docx',
 '12_SZUL_张律师.docx',
 '12_SZUL_美丽.docx',
 '12_SZUL_花生.docx',
 '13_SZTULJK_HSF.docx',
 '13_SZTULJK_HYZ.docx',
 '13_SZTULJK_LDH.docx',
 '13_SZTULJK_LJP.docx',
 '14_SZUlsr-jiachunxia.docx',
 '14_SZUlsr-wuxin.docx',
 '15_PKU+DYE_Suning.docx',
 '15_PKU+DYE_Xin.docx',
 '15_PKU+DYE_Xuan.docx',
 '17_SCNUC_小于先生.docx',
 '17_SCNUC_小于女士.docx',
 '18_SZU_XZA_HE.docx',
 '19_sustech_qk_He.docx',
 '19_sustech_qk_meng.docx',
]

# Hanlp

In [3]:
# HanLP RESTful client used for named-entity recognition on Chinese text.
# The API key is read from the HANLP_AUTH environment variable so that no
# credential is stored in the notebook (apply for a key on the HanLP site and
# export it before starting the kernel).
# NOTE(review): the key that used to be hard-coded here has been committed and
# should be treated as leaked — rotate it.
# NOTE(review): when HANLP_AUTH is unset, auth is None; presumably the service then
# falls back to anonymous access with a lower rate limit — confirm.
HanLP = HanLPClient('https://www.hanlp.com/api', auth=os.environ.get("HANLP_AUTH"), language='zh')

In [4]:
# tokenize(text: Union[str, List[str]], coarse: Optional[bool] = None, language=None) → List[List[str]]
# seg = HanLP.tokenize(text)
# seg

In [5]:
def read_docx(file_path):
    """Return the text of a .docx file as a single string.

    The ``text`` of every entry in ``Document(file_path).paragraphs`` is
    concatenated in order, with no separator inserted between paragraphs.
    """
    document = Document(file_path)
    return ''.join(paragraph.text for paragraph in document.paragraphs)

# def get_location_from_one_hanlp(my_filepath):
#     text = read_docx(my_filepath)
#     # display(text)

#     doc = HanLP(text, tasks='ner/pku', language='zh')
#     my_ner_list = doc['ner/pku']

#     my_filename = my_filepath.split("/")[-1].split(".")[0]
#     return my_ner_list, my_filename

def split_text(text, max_length=15000):
    """Split ``text`` into chunks of at most ``max_length`` characters.

    The text is cut at newline characters and consecutive lines are packed
    greedily into a chunk (re-joined with ``'\\n'``) while the chunk stays within
    ``max_length``. A single line longer than ``max_length`` is cut into
    ``max_length``-sized slices so that no returned chunk exceeds the limit.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of non-empty chunk strings, in original order. An empty ``text``
        yields an empty list.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    parts = []
    current_part = ""

    for sentence in text.split('\n'):
        # +1 accounts for the newline that would join this line to the chunk.
        if len(current_part) + len(sentence) + 1 > max_length:
            # Close the running chunk, but never emit an empty one (this used to
            # happen when the very first line was already too long).
            if current_part:
                parts.append(current_part)
            # A line that is too long on its own is sliced into full-size pieces;
            # the remaining tail starts the next chunk.
            while len(sentence) > max_length:
                parts.append(sentence[:max_length])
                sentence = sentence[max_length:]
            current_part = sentence
        else:
            if current_part:
                current_part += '\n'
            current_part += sentence

    # Flush whatever is left after the last line.
    if current_part:
        parts.append(current_part)

    return parts

def get_location_from_one_hanlp(my_filepath):
    """Run HanLP named-entity recognition (task ``ner/pku``) on one transcript.

    The .docx text is read with ``read_docx``; texts longer than 15000 characters
    are first cut into chunks with ``split_text`` and each chunk is sent to the
    API separately.

    Args:
        my_filepath: Path to the .docx transcript.

    Returns:
        A tuple ``(all_ner_lists, my_filename)`` where ``all_ner_lists`` is the
        concatenation of ``doc['ner/pku']`` over all chunks and ``my_filename`` is
        the file name without directory and without its final extension.
    """
    text = read_docx(my_filepath)

    # Only split when the text exceeds the chunk size used by split_text.
    texts = split_text(text) if len(text) > 15000 else [text]

    all_ner_lists = []
    for part_text in texts:
        # An empty chunk cannot contain entities; skip it rather than spend an API call.
        if not part_text:
            continue
        doc = HanLP(part_text, tasks='ner/pku', language='zh')
        all_ner_lists.extend(doc['ner/pku'])

    # Strip only the last extension so that a stem containing dots is kept whole
    # (consistent with the `[0:-5]` stem used when the results are converted).
    my_filename = os.path.splitext(os.path.basename(my_filepath))[0]
    return all_ner_lists, my_filename

def get_text_length(my_filepath):
    """Return the number of characters in the text that ``read_docx`` extracts from the file."""
    return len(read_docx(my_filepath))

In [6]:
# doc.pretty_print()

In [7]:
# # file_path = "data/Interview transcripts-round 3/1_SZUZJY_来深33年邹先生.docx"
# # range_list = [0, 2, 7, 12, 15, 19, 22, 27, 29, 31, 33, 35, 40, 45, 48, 51]
# # range_list = range(0, 55)
# # range_list = [52]
# range_list = range(len(file_list))
# for i in tqdm(range_list):
#     my_filepath = f"data/Interview transcripts-round 3/{file_list[i]}"
#     # tmp = split_text(read_docx(my_filepath))
#     ner_list, filename = get_location_from_one_hanlp(my_filepath)
#     # display(ner_list)
#     with open(f"data/output_hanlp+worldcloud/txt/{filename}.txt","w") as f:
#         f.write("[\n")
#         for item in ner_list:
#             f.write("%s\n" % item)
#         f.write("]")
#     if i == 25:
#         time.sleep(60)

# Convert list to df

For each location name (entities tagged `ns`) in a transcript, we record where it occurs and count how many times it occurs.

In [8]:
def convert_one_list_todf(my_filename):
    """Aggregate the saved NER output of one transcript into a per-location CSV.

    Reads ``data/output_hanlp+worldcloud/txt/{my_filename}.txt``: a ``[`` line,
    then one Python-literal list of entities per line, then a closing ``]``.
    Every entity whose tag (``item[1]``) is ``'ns'`` is kept. For each distinct
    entity text (``item[0]``) the output has one row with:

    * ``location_name`` – the entity text,
    * ``value`` – list of ``(line index, item[2], item[3])`` tuples, one per occurrence,
    * ``count`` – number of occurrences,
    * ``filename`` – ``my_filename``.

    Rows are sorted by ``location_name`` and written to
    ``data/output_hanlp+worldcloud/csv/{my_filename}.csv``. A file that contains no
    entity lines produces a CSV with only the header row.

    Args:
        my_filename: File stem shared by the input .txt and the output .csv.
    """
    with open(f"data/output_hanlp+worldcloud/txt/{my_filename}.txt", "r") as f:
        content = f.read()

    # Drop the surrounding "[" / "]" wrapper (tolerating whitespace around it) and
    # ignore blank lines, which would otherwise make ast.literal_eval fail.
    body = content.strip()[1:-1]
    tmp_ner_list = [ast.literal_eval(line.strip()) for line in body.split("\n") if line.strip()]

    # location text -> list of (line index, item[2], item[3]) in order of appearance
    occurrences = {}
    for index, sublist in enumerate(tmp_ner_list):
        for item in sublist:
            if item[1] == 'ns':
                occurrences.setdefault(item[0], []).append((index, item[2], item[3]))

    rows = [
        (name, occurrences[name], len(occurrences[name]), my_filename)
        for name in sorted(occurrences)
    ]
    df = pd.DataFrame(rows, columns=['location_name', 'value', 'count', 'filename'])
    df.to_csv(f"data/output_hanlp+worldcloud/csv/{my_filename}.csv", index=False)

In [9]:
# range_list = range(len(file_list))
# for i in tqdm(range_list):
#     my_filename = file_list[i][0:-5]
#     convert_one_list_todf(my_filename)

Combine all the per-transcript CSV files into a single DataFrame.

In [10]:
import pandas as pd
import glob
# Find every per-transcript CSV. glob returns paths in arbitrary order, so sort
# them to make the row order of the combined frame reproducible.
csv_files = sorted(glob.glob("data/output_hanlp+worldcloud/csv/*.csv"))
# Load each CSV into its own DataFrame.
dfs = [pd.read_csv(file) for file in csv_files]
# Stack all frames into one, renumbering the index.
df_combine = pd.concat(dfs, ignore_index=True)
df_combine

Unnamed: 0,location_name,value,count,filename
0,上沙,"[(70, 10, 11)]",1,10_SZTU-lx_何爷爷
1,东莞,"[(262, 50, 51)]",1,10_SZTU-lx_何爷爷
2,东门,"[(63, 6, 7), (211, 47, 48), (212, 3, 4), (227,...",5,10_SZTU-lx_何爷爷
3,东门老街,"[(8, 6, 8), (8, 9, 11)]",2,10_SZTU-lx_何爷爷
4,丹竹头,"[(70, 8, 9)]",1,10_SZTU-lx_何爷爷
...,...,...,...,...
2598,香港,"[(52, 40, 41)]",1,9_SZUCJH_老陈
2599,龙华,"[(53, 4, 5)]",1,9_SZUCJH_老陈
2600,龙华区,"[(16, 36, 37), (21, 18, 19), (53, 5, 6), (102,...",4,9_SZUCJH_老陈
2601,龙岗,"[(21, 49, 50), (22, 2, 3), (55, 36, 37)]",3,9_SZUCJH_老陈


Merge with the interviewee metadata (gender, birth year, 关内/关外) from `round3_info.csv`.

In [None]:
# Build the merge key: the file stem with everything up to and including the
# first underscore removed.
df_combine["filename_noindex"] = (
    df_combine["filename"]
    .str.split("_")
    .map(lambda parts: "_".join(parts[1:]))
)
display(df_combine)

# Load the interview metadata and keep only the columns used downstream.
df_info = pd.read_csv("data/output_hanlp+worldcloud/round3_info.csv")
display(df_info.head(5))
df_info_select = df_info.loc[:, ["访谈编号","性别","出生年份","关内关外"]]

In [None]:
# Left-join the metadata onto the location rows; rows without a matching
# interview id keep NaN in the metadata columns.
df_merge = df_combine.merge(
    df_info_select,
    how='left',
    left_on='filename_noindex',
    right_on='访谈编号',
)
df_merge

# Data preprocess

In [None]:
## Only remove the city-level names for Shenzhen ("深圳", "深圳市"); district names are kept.
is_shenzhen = df_merge["location_name"].isin(["深圳", "深圳市"])
df_merge_remove_sz = df_merge.loc[~is_shenzhen, :].copy(deep=True)
display(df_merge_remove_sz)

# ## remove district as well as shenzhen
# df_merge_remove_sz_district = df_merge_remove_sz.copy(deep=True)
# df_merge_remove_sz_district = df_merge_remove_sz_district.loc[(df_merge_remove_sz_district["location_name"]!= "罗湖区")&(df_merge_remove_sz_district["location_name"]!= "福田区")&(df_merge_remove_sz_district["location_name"]!= "南山区")&(df_merge_remove_sz_district["location_name"]!= "宝安区")&(df_merge_remove_sz_district["location_name"]!= "龙岗区")&(df_merge_remove_sz_district["location_name"]!= "盐田区")&(df_merge_remove_sz_district["location_name"]!= "龙华区")&(df_merge_remove_sz_district["location_name"]!= "坪山区")&(df_merge_remove_sz_district["location_name"]!= "光明区")&
# (df_merge_remove_sz_district["location_name"]!= "罗湖")&(df_merge_remove_sz_district["location_name"]!= "福田")&(df_merge_remove_sz_district["location_name"]!= "南山")&(df_merge_remove_sz_district["location_name"]!= "宝安")&(df_merge_remove_sz_district["location_name"]!= "龙岗")&(df_merge_remove_sz_district["location_name"]!= "盐田")&(df_merge_remove_sz_district["location_name"]!= "龙华")&(df_merge_remove_sz_district["location_name"]!= "坪山")&(df_merge_remove_sz_district["location_name"]!= "光明"),:]
# display(df_merge_remove_sz_district)

# WordCloud

In [None]:
# District names (with and without the "区" suffix) that can be passed to the word
# cloud as stop words.
my_stop_list = [
    "南山区", "南山",
    "坪山", "坪山区",
    "光明", "光明区",
    "盐田", "盐田区",
    "龙华", "龙华区",
    "龙岗", "龙岗区",
    "福田", "福田区",
    "罗湖", "罗湖区",
    "宝安", "宝安区",
]

## For all

In [None]:
# Total number of mentions per location name across all transcripts.
df_all = (
    df_merge_remove_sz
    .groupby("location_name")[["count"]]
    .sum()
    .reset_index()
)
display(df_all.sort_values("count", ascending=False))

In [None]:
# (word, frequency) pairs for the word cloud, also saved as a lookup table.
noun_freq_all = [pair for pair in df_all.itertuples(index=False, name=None)]
pd.DataFrame(
    noun_freq_all,
    columns=["word","freq"],
).to_csv("data/output_hanlp+worldcloud/wordcloud/corresponding_table/all.csv", index=False)
# gen_stylecloud(
# 	text = dict(noun_freq_all),                    # {(word1, freq1),(word2, freq2), …}
# 	size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 	max_words=1000,          # 词云图中的最大词语数量
# 	max_font_size=300,      # 词云图中字号的最大值
# 	font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 	palette='colorbrewer.diverging.Spectral_11',
# 	# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 	output_name='data/output_hanlp+worldcloud/wordcloud/all.png',       # 必要参数，保存词云图的路径
# 	background_color='black',
# 	# icon_name= "fas fa-comment-alt" , # comment图标
# 	# icon_name= "fas fa-circle" ,   # 圆形图标
# 	icon_name = "fas fa-square-full",  # 正方形图标
# 	collocations = False,
# 	invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
#  	# custom_stopwords=my_stop_list,
# 	random_state = 42
# )

# ## remove district
# gen_stylecloud(
# 	text = dict(noun_freq_all),                    # {(word1, freq1),(word2, freq2), …}
# 	size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 	max_words=1000,          # 词云图中的最大词语数量
# 	max_font_size=300,      # 词云图中字号的最大值
# 	font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 	palette='colorbrewer.diverging.Spectral_11',
# 	# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 	output_name='data/output_hanlp+worldcloud/wordcloud/all_rm_district.png',       # 必要参数，保存词云图的路径
# 	background_color='black',
# 	# icon_name= "fas fa-comment-alt" , # comment图标
# 	# icon_name= "fas fa-circle" ,   # 圆形图标
# 	icon_name = "fas fa-square-full",  # 正方形图标
# 	collocations = False,
# 	invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
#  	custom_stopwords=my_stop_list,
# 	random_state = 42
# )


## 性别（男-女）


把性别为 “一男一女” 的访谈同时计入 “男” 和 “女”，即把该访谈的数据复制为两份（男、女各一份）。

In [None]:
# Rows from interviews whose gender is recorded as "一男一女" (one man, one woman).
is_couple = df_merge_remove_sz['性别'] == '一男一女'
df_gender_split = df_merge_remove_sz.loc[is_couple].copy()

# Two copies of those rows: one relabelled male, one relabelled female.
df_male = df_gender_split.assign(**{'性别': '男'})
df_female = df_gender_split.assign(**{'性别': '女'})

# All other rows first, then the male and female copies, with a fresh 0..n-1 index.
df_gender = pd.concat(
    [df_merge_remove_sz.loc[~is_couple], df_male, df_female],
    ignore_index=True,
)

display(df_gender)

In [None]:
# Total mentions of each location name within each gender.
df_gender_preprocess = (
    df_gender
    .groupby(["性别","location_name"])[["count"]]
    .sum()
    .reset_index()
)
display(df_gender_preprocess)

### man

In [None]:
# (word, frequency) pairs for male interviewees, also saved as a lookup table.
is_man = df_gender_preprocess["性别"] == "男"
noun_freq_man = list(
    df_gender_preprocess.loc[is_man, ["location_name","count"]].itertuples(index=False, name=None)
)

pd.DataFrame(
    noun_freq_man,
    columns=["word","freq"],
).to_csv("data/output_hanlp+worldcloud/wordcloud/corresponding_table/gender_man.csv", index=False)


# gen_stylecloud(
# 	text = dict(noun_freq_man),                    # {(word1, freq1),(word2, freq2), …}
# 	size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 	max_words=1000,          # 词云图中的最大词语数量
# 	max_font_size=300,      # 词云图中字号的最大值
# 	font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 	palette='colorbrewer.diverging.Spectral_11',
# 	# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 	output_name='data/output_hanlp+worldcloud/wordcloud/gender_man.png',       # 必要参数，保存词云图的路径
# 	background_color='black',
# 	# icon_name= "fas fa-comment-alt" , # comment图标
# 	icon_name= "fas fa-circle" ,   # 圆形图标
# 	# icon_name = "fas fa-square-full",  # 正方形图标
# 	collocations = False,
# 	invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
#  	# custom_stopwords=my_stop_list,
# 	random_state = 42
# )

# ## remove district
# gen_stylecloud(
# 	text = dict(noun_freq_man),                    # {(word1, freq1),(word2, freq2), …}
# 	size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 	max_words=1000,          # 词云图中的最大词语数量
# 	max_font_size=300,      # 词云图中字号的最大值
# 	font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 	palette='colorbrewer.diverging.Spectral_11',
# 	# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 	output_name='data/output_hanlp+worldcloud/wordcloud/gender_man_rm_district.png',       # 必要参数，保存词云图的路径
# 	background_color='black',
# 	# icon_name= "fas fa-comment-alt" , # comment图标
# 	icon_name= "fas fa-circle" ,   # 圆形图标
# 	# icon_name = "fas fa-square-full",  # 正方形图标
# 	collocations = False,
# 	invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
#  	custom_stopwords=my_stop_list,
# 	random_state = 42
# )


### woman

In [None]:
# (word, frequency) pairs for female interviewees, also saved as a lookup table.
is_woman = df_gender_preprocess["性别"] == "女"
noun_freq_woman = list(
    df_gender_preprocess.loc[is_woman, ["location_name","count"]].itertuples(index=False, name=None)
)

pd.DataFrame(
    noun_freq_woman,
    columns=["word","freq"],
).to_csv("data/output_hanlp+worldcloud/wordcloud/corresponding_table/gender_woman.csv", index=False)


# gen_stylecloud(
# 	text = dict(noun_freq_woman),                    # {(word1, freq1),(word2, freq2), …}
# 	size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 	max_words=1000,          # 词云图中的最大词语数量
# 	max_font_size=300,      # 词云图中字号的最大值
# 	font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 	# palette='colorbrewer.diverging.Spectral_11',
# 	# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 	output_name='data/output_hanlp+worldcloud/wordcloud/gender_woman.png',       # 必要参数，保存词云图的路径
# 	background_color='white',
# 	# icon_name= "fas fa-comment-alt" , # comment图标
# 	icon_name= "fas fa-circle" ,   # 圆形图标
# 	# icon_name = "fas fa-square-full",  # 正方形图标
# 	collocations = False,
# 	invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
#  	# custom_stopwords=my_stop_list,
# 	random_state = 42
# )

# ## remove district
# gen_stylecloud(
# 	text = dict(noun_freq_woman),                    # {(word1, freq1),(word2, freq2), …}
# 	size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 	max_words=1000,          # 词云图中的最大词语数量
# 	max_font_size=300,      # 词云图中字号的最大值
# 	font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 	# palette='colorbrewer.diverging.Spectral_11',
# 	# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 	output_name='data/output_hanlp+worldcloud/wordcloud/gender_woman_rm_district.png',       # 必要参数，保存词云图的路径
# 	background_color='white',
# 	# icon_name= "fas fa-comment-alt" , # comment图标
# 	icon_name= "fas fa-circle" ,   # 圆形图标
# 	# icon_name = "fas fa-square-full",  # 正方形图标
# 	collocations = False,
# 	invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
#  	custom_stopwords=my_stop_list,
# 	random_state = 42
# )



## 年龄（70年代以前、70年代、80年代、90年代、00年代）


处理 一份访谈中出现两个出生年份的 数据

In [None]:
df_age = df_merge_remove_sz.copy(deep=True)

# Length of the birth-year text. `.str.len()` gives NaN for missing values (for
# example rows the left merge could not match), so those rows fail both the "> 4"
# and the "== 4" test below instead of raising TypeError as `len(NaN)` would.
birth_year_len = df_age["出生年份"].str.len()

# Entries longer than four characters hold two birth years for one interview.
df_age_split = df_age.loc[birth_year_len > 4,:].copy()
df_age_split_cl = df_age_split.loc[df_age_split["出生年份"] == "陈女士1978 李先生1976",:].copy(deep = True)
df_age_split_dw = df_age_split.loc[df_age_split["出生年份"] == "邓先生1991，王女士1968",:].copy(deep = True)
display(df_age_split_cl.head(2))
display(df_age_split_dw.head(2))

# Duplicate each two-person interview once per birth year.
df_age_split_cl_1978 = df_age_split_cl.copy(deep=True)
df_age_split_cl_1978['出生年份'] = '1978'
df_age_split_cl_1976 = df_age_split_cl.copy(deep=True)
df_age_split_cl_1976['出生年份'] = '1976'

df_age_split_dw_1991 = df_age_split_dw.copy(deep=True)
df_age_split_dw_1991['出生年份'] = '1991'
df_age_split_dw_1968 = df_age_split_dw.copy(deep=True)
df_age_split_dw_1968['出生年份'] = '1968'

# Append the per-person copies, then keep only rows whose birth year is exactly
# four characters long; this drops the original two-year rows and rows with a
# missing birth year.
df_age = pd.concat([df_age, df_age_split_cl_1978, df_age_split_cl_1976, df_age_split_dw_1991, df_age_split_dw_1968])
df_age = df_age.loc[df_age["出生年份"].str.len() == 4,:].copy()
df_age = df_age.reset_index(drop=True)
display(df_age)

将出生年份离散化

In [None]:
# Bin the birth year into decades. Bins are left-closed (right=False), so for
# example 1970 falls in "70s"; years before 1940 or from 2010 onward get no group.
df_age["age_group"] = pd.cut(df_age["出生年份"].astype(int), bins=[1940, 1970, 1980, 1990, 2000, 2010], labels=["leq70s","70s", "80s", "90s", "00s"],right=False)
# "age_group" is categorical: without observed=True the groupby would emit every
# (age group, location) combination, adding zero-count rows for locations a group
# never mentioned. observed=True keeps only combinations present in the data.
df_age_preprocess = df_age.groupby(["age_group","location_name"], observed=True).agg({"count":"sum"}).reset_index()
df_age_preprocess

In [None]:
def word_cloud_age(my_str):
	"""Save the (word, freq) table of one age group as a CSV.

	Selects the rows of ``df_age_preprocess`` whose ``age_group`` equals ``my_str``
	and writes their ``location_name`` / ``count`` pairs, under the column names
	``word`` / ``freq``, to
	``data/output_hanlp+worldcloud/wordcloud/corresponding_table/age_{my_str}.csv``.
	"""
	in_group = df_age_preprocess["age_group"] == my_str
	selected = df_age_preprocess.loc[in_group, ["location_name", "count"]]
	noun_freq_tmp = list(selected.itertuples(index=False, name=None))
	table = pd.DataFrame(noun_freq_tmp, columns=["word", "freq"])
	table.to_csv(f"data/output_hanlp+worldcloud/wordcloud/corresponding_table/age_{my_str}.csv", index=False)

# 	gen_stylecloud(
# 		text = dict(noun_freq_tmp),                    # {(word1, freq1),(word2, freq2), …}
# 		size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 		max_words=1000,          # 词云图中的最大词语数量
# 		max_font_size=300,      # 词云图中字号的最大值
# 		font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 		palette='colorbrewer.diverging.Spectral_11',
# 		# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 		output_name=f'data/output_hanlp+worldcloud/wordcloud/age_{my_str}.png',       # 必要参数，保存词云图的路径
# 		background_color='black',
# 		# icon_name= "fas fa-comment-alt" , # comment图标
# 		# icon_name= "fas fa-circle" ,   # 圆形图标
# 		icon_name = "fas fa-square-full",  # 正方形图标
# 		collocations = False,
# 		invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
# 		# custom_stopwords=my_stop_list,
# 		random_state = 42
# 	)

# 	## remove district
# 	gen_stylecloud(
# 		text = dict(noun_freq_tmp),                    # {(word1, freq1),(word2, freq2), …}
# 		size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 		max_words=1000,          # 词云图中的最大词语数量
# 		max_font_size=300,      # 词云图中字号的最大值
# 		font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 		# palette='colorbrewer.diverging.Spectral_11',
# 		# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 		output_name=f'data/output_hanlp+worldcloud/wordcloud/age_{my_str}_rm_district.png',       # 必要参数，保存词云图的路径
# 		background_color='white',
# 		# icon_name= "fas fa-comment-alt" , # comment图标
# 		# icon_name= "fas fa-circle" ,   # 圆形图标
# 		icon_name = "fas fa-square-full",  # 正方形图标
# 		collocations = False,
# 		invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
# 		custom_stopwords=my_stop_list,
# 		random_state = 42
# 	)


In [None]:
# Export one frequency table per age group, oldest group first.
for age_label in ("leq70s", "70s", "80s", "90s", "00s"):
    word_cloud_age(age_label)



## 活动区域（全关内、全关外、关内+关外）

关内-0：福田 罗湖 南山 盐田

关外-1：龙岗 龙华 光明 宝安 坪山

关内+关外-2：

In [None]:
# Total mentions of each location name within each activity-area code (关内关外).
df_area = df_merge_remove_sz.copy(deep=True)
df_area_preprocess = (
    df_area
    .groupby(["关内关外","location_name"])[["count"]]
    .sum()
    .reset_index()
)
df_area_preprocess

In [None]:
def word_cloud_area(my_str):
	"""Save the (word, freq) table of one activity-area group as a CSV.

	Selects the rows of ``df_area_preprocess`` whose ``关内关外`` value equals
	``my_str`` and writes their ``location_name`` / ``count`` pairs, under the
	column names ``word`` / ``freq``, to
	``data/output_hanlp+worldcloud/wordcloud/corresponding_table/area_{label}.csv``.
	The label is "关内" for 0, "关外" for 1, "关内关外" for 2, and ``my_str`` itself
	for any other value.
	"""
	# Select with the raw code first; only the file name uses the readable label.
	in_area = df_area_preprocess["关内关外"] == my_str
	noun_freq_tmp = list(
		df_area_preprocess.loc[in_area, ["location_name", "count"]].itertuples(index=False, name=None)
	)

	area_labels = {0: "关内", 1: "关外", 2: "关内关外"}
	my_str = area_labels.get(my_str, my_str)

	table = pd.DataFrame(noun_freq_tmp, columns=["word", "freq"])
	table.to_csv(f"data/output_hanlp+worldcloud/wordcloud/corresponding_table/area_{my_str}.csv", index=False)
	
# 	gen_stylecloud(
# 		text = dict(noun_freq_tmp),                    # {(word1, freq1),(word2, freq2), …}
# 		size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 		max_words=1000,          # 词云图中的最大词语数量
# 		max_font_size=300,      # 词云图中字号的最大值
# 		font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 		palette='colorbrewer.diverging.Spectral_11',
# 		# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 		output_name=f'data/output_hanlp+worldcloud/wordcloud/area_{my_str}.png',       # 必要参数，保存词云图的路径
# 		background_color='black',
# 		icon_name= "fas fa-comment-alt" , # comment图标
# 		# icon_name= "fas fa-circle" ,   # 圆形图标
# 		# icon_name = "fas fa-square-full",  # 正方形图标
# 		collocations = False,
# 		invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
# 		# custom_stopwords=my_stop_list,
# 		random_state = 42
# 	)

# 	## remove district
# 	gen_stylecloud(
# 		text = dict(noun_freq_tmp),                    # {(word1, freq1),(word2, freq2), …}
# 		size=(1500,1500),         # 词云图的长宽，设置更大的数字可以增加成图的分辨率，但代码运行时间会随之增加
# 		max_words=1000,          # 词云图中的最大词语数量
# 		max_font_size=300,      # 词云图中字号的最大值
# 		font_path=r'C:\Windows\Fonts\SimHei.ttf', # 字体是必要的参数，否则中文会显示异常
# 		# palette='colorbrewer.diverging.Spectral_11',
# 		# colors=['#ecf0f1', '#3498db', '#e74c3c'],                # 不想用palette可以自定义颜色
# 		output_name=f'data/output_hanlp+worldcloud/wordcloud/area_{my_str}_rm_district.png',       # 必要参数，保存词云图的路径
# 		background_color='white',
# 		icon_name= "fas fa-comment-alt" , # comment图标
# 		# icon_name= "fas fa-circle" ,   # 圆形图标
# 		# icon_name = "fas fa-square-full",  # 正方形图标
# 		collocations = False,
# 		invert_mask=False,                 # 形状反转（在画布里icon之外的地方绘图）
# 		custom_stopwords=my_stop_list,
# 		random_state = 42
# 	)


In [None]:
# Export one frequency table per area code: 0, 1, then 2.
for area_code in (0, 1, 2):
    word_cloud_area(area_code)
