In [1]:
# -*- coding: utf-8 -*-

import os
import pandas as pd
from datetime import datetime
from utils import news_crawler
from utils import preprocessing
from utils import modeling
from utils import counter
from collections import Counter

In [2]:
# `__file__` is assigned here by hand (the original cell did the same), but it
# is now derived from the current working directory instead of a
# machine-specific absolute path, so the notebook runs from any checkout.
# Jupyter usually starts the kernel in the notebook's own directory; if yours
# does not, change the working directory before running this cell.
__file__ = os.path.join(os.getcwd(), 'main.ipynb')
# Project root: the directory that contains this notebook.
project_path = os.path.dirname(os.path.realpath(__file__))
# Data directories, all located under <project_path>/data.
data_path = os.path.join(project_path, 'data')
news_path = os.path.join(data_path, 'news')
extra_dict_path = os.path.join(data_path, 'extra_dict')
fonts_path = os.path.join(data_path, 'fonts')
results_path = os.path.join(data_path, 'results')

In [3]:
# Optional crawl step, left disabled. Judging by the helper names, the lines
# below would fetch the latest news from each source and save one CSV per
# source under news_path. The notebook loads the bundled sample CSVs in the
# next cell instead, so this cell has no effect while it stays commented out.
# sina_news_df = news_crawler.get_latest_news('sina', top=1000, show_content=True)
# sohu_news_df = news_crawler.get_latest_news('sohu', top=1000, show_content=True)
# xinhuanet_news_df = news_crawler.get_latest_news('xinhuanet', top=100, show_content=True)
# news_crawler.save_news(sina_news_df, os.path.join(news_path, 'sina_latest_news.csv'))
# news_crawler.save_news(sohu_news_df, os.path.join(news_path, 'sohu_latest_news.csv'))
# news_crawler.save_news(xinhuanet_news_df, os.path.join(news_path, 'xinhuanet_latest_news.csv'))
# news_crawler.threaded_crawler()

In [4]:
# Load the sample CSV of each source (sina, sohu, xinhuanet - in that order)
# from news_path, keeping one frame per source.
sample_files = (
    'sample_sina_latest_news.csv',
    'sample_sohu_latest_news.csv',
    'sample_xinhuanet_latest_news.csv',
)
sina_news_df, sohu_news_df, xinhuanet_news_df = [
    news_crawler.load_news(os.path.join(news_path, file_name))
    for file_name in sample_files
]
# Stack the three frames into one table with a fresh 0..n-1 index.
news_df = pd.concat(
    [sina_news_df, sohu_news_df, xinhuanet_news_df],
    ignore_index=True,
)

In [5]:
# The reference time is a fixed string rather than the current time; the
# commented line below shows the live alternative.
# now_time = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')
now_time = '2018-04-06 23:59'
# Filter the raw news first, then let get_data select rows relative to
# now_time. NOTE(review): delta=5 is presumably a window in days - confirm in
# preprocessing.get_data.
df = preprocessing.get_data(
    preprocessing.data_filter(news_df),
    last_time=now_time,
    delta=5,
)

df.shape= (2619, 4)


# 新闻标题聚类

In [6]:
def _segment_titles(titles):
    """Convert a Series of raw titles into a Series of filtered token lists.

    Steps, in the order applied: ``clean_title`` -> ``get_num_en_ch`` ->
    ``pseg_cut`` (with the project user dictionary) -> ``get_words_by_flags``
    -> ``stop_words_cut`` (HIT list, then the project list) ->
    ``disambiguation_cut`` -> ``individual_character_cut``.
    """
    def resource(file_name):
        # Every auxiliary dictionary file lives under extra_dict_path.
        return os.path.join(extra_dict_path, file_name)

    # Patterns handed to get_words_by_flags (presumably part-of-speech tags
    # to keep - confirm in preprocessing).
    keep_flags = ('n.*', '.*n', 'v.*', 's', 'j', 'l', 'i', 'eng')
    userdict = resource('self_userdict.txt')
    stop_word_files = (resource('HIT_stop_words.txt'), resource('self_stop_words.txt'))
    disambiguation_dict = resource('self_disambiguation_dict.json')
    individual_character_dict = resource('self_individual_character_dict.txt')

    normalized = titles.map(preprocessing.clean_title).map(preprocessing.get_num_en_ch)
    tokens = normalized.map(
        lambda text: preprocessing.pseg_cut(text, userdict_path=userdict))
    # A fresh flags list is built per call, as the original cell did.
    tokens = tokens.map(
        lambda pairs: preprocessing.get_words_by_flags(pairs, flags=list(keep_flags)))
    for stop_file in stop_word_files:
        tokens = tokens.map(
            lambda words: preprocessing.stop_words_cut(words, stop_file))
    tokens = tokens.map(
        lambda words: preprocessing.disambiguation_cut(words, disambiguation_dict))
    tokens = tokens.map(
        lambda words: preprocessing.individual_character_cut(words, individual_character_dict))
    return tokens


df_title = df.copy()
title_tokens = _segment_titles(df_title['title'])
# 'title_' holds the space-joined tokens (input for the vectorizer),
# 'title_cut' the token lists themselves; 'title_' is added first so the
# column order matches the step-by-step version.
df_title['title_'] = title_tokens.map(' '.join)
df_title['title_cut'] = title_tokens

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Jacen\AppData\Local\Temp\jieba.cache
Loading model cost 1.123 seconds.
Prefix dict has been built succesfully.


In [7]:
# Both vocabulary helpers work on the same column of title token lists.
title_token_lists = df_title['title_cut']
word_library_list = counter.get_word_library(title_token_lists)
single_frequency_words_list = counter.get_single_frequency_words(title_token_lists)

In [8]:
# Vocabulary cap for the vectorizer: size of the word library minus half
# (floor division) of the single-frequency word count. The division binds
# tighter than the subtraction, which the split below makes explicit.
title_library_size = len(word_library_list)
title_half_single_frequency = len(single_frequency_words_list) // 2
max_features = title_library_size - title_half_single_frequency
max_features

3844

In [9]:
# Vectorize the space-joined title tokens. 'CountVectorizer' is selected by
# name (presumably scikit-learn's - confirm in modeling.feature_extraction)
# and the vocabulary is capped at max_features.
title_vec_args = dict(max_df=1.0, min_df=1, max_features=max_features)
title_matrix = modeling.feature_extraction(
    df_title['title_'],
    vectorizer='CountVectorizer',
    vec_args=title_vec_args,
)
title_matrix.shape

(2619, 3844)

In [10]:
# Cluster the title vectors with DBSCAN using cosine distance.
title_dbscan = modeling.get_cluster(
    title_matrix,
    cluster='DBSCAN',
    cluster_args=dict(eps=0.4, min_samples=4, metric='cosine'),
)

In [11]:
# Take the per-row cluster labels from the DBSCAN result and store them
# next to the titles.
title_labels = modeling.get_labels(title_dbscan)
df_title['title_label'] = title_labels
# Convert labels to ranks. NOTE(review): presumably rank 1 is the largest
# real cluster - confirm in modeling.label2rank.
title_rank = modeling.label2rank(title_labels)
df_title['title_rank'] = title_rank
# Display how many rows carry each label (the recorded output includes a
# large -1 group; presumably DBSCAN's noise label - confirm).
Counter(title_labels)

Counter({-1: 2341,
         0: 150,
         1: 7,
         2: 8,
         3: 4,
         4: 4,
         5: 4,
         6: 8,
         7: 4,
         8: 4,
         9: 8,
         10: 5,
         11: 4,
         12: 4,
         13: 14,
         14: 4,
         15: 5,
         16: 4,
         17: 4,
         18: 5,
         19: 5,
         20: 4,
         21: 5,
         22: 6,
         23: 4,
         24: 4})

In [12]:
# Count the distinct values in title_labels. In the recorded run this is 26,
# matching the 26 keys of the Counter shown above, so the -1 label is
# included in the count.
title_label_num = counter.get_num_of_value_no_repeat(title_labels)
title_label_num

26

In [13]:
# Rows of the title cluster ranked 1, then its ten most common title words.
df_ = df_title.loc[df_title['title_rank'].eq(1)]
title_top_list = counter.get_most_common_words(
    df_['title_cut'],
    top_n=10,
)
print(title_top_list)

['美国', '中国', '关税', '加征', '特朗普', '贸易战', '商品', '对华', '产品', '建议']


# 新闻内容聚类

In [14]:
def _segment_contents(contents):
    """Convert a Series of raw article bodies into a Series of filtered token lists.

    Steps, in the order applied: ``clean_content`` -> ``get_num_en_ch`` ->
    ``pseg_cut`` (with the project user dictionary) -> ``get_words_by_flags``
    -> ``stop_words_cut`` (HIT list, then the project list) ->
    ``disambiguation_cut`` -> ``individual_character_cut``.
    """
    def resource(file_name):
        # Every auxiliary dictionary file lives under extra_dict_path.
        return os.path.join(extra_dict_path, file_name)

    # Patterns handed to get_words_by_flags (presumably part-of-speech tags
    # to keep - confirm in preprocessing).
    keep_flags = ('n.*', '.*n', 'v.*', 's', 'j', 'l', 'i', 'eng')
    userdict = resource('self_userdict.txt')
    stop_word_files = (resource('HIT_stop_words.txt'), resource('self_stop_words.txt'))
    disambiguation_dict = resource('self_disambiguation_dict.json')
    individual_character_dict = resource('self_individual_character_dict.txt')

    normalized = contents.map(preprocessing.clean_content).map(preprocessing.get_num_en_ch)
    tokens = normalized.map(
        lambda text: preprocessing.pseg_cut(text, userdict_path=userdict))
    # A fresh flags list is built per call, as the original cell did.
    tokens = tokens.map(
        lambda pairs: preprocessing.get_words_by_flags(pairs, flags=list(keep_flags)))
    for stop_file in stop_word_files:
        tokens = tokens.map(
            lambda words: preprocessing.stop_words_cut(words, stop_file))
    tokens = tokens.map(
        lambda words: preprocessing.disambiguation_cut(words, disambiguation_dict))
    tokens = tokens.map(
        lambda words: preprocessing.individual_character_cut(words, individual_character_dict))
    return tokens


df_content = df.copy()
content_tokens = _segment_contents(df_content['content'])
# 'content_' holds the space-joined tokens (input for the vectorizer),
# 'content_cut' the token lists themselves; 'content_' is added first so the
# column order matches the step-by-step version.
df_content['content_'] = content_tokens.map(' '.join)
df_content['content_cut'] = content_tokens

In [15]:
# Recompute the vocabulary helpers for the article bodies. This reuses (and
# therefore overwrites) the names filled for the titles earlier.
content_token_lists = df_content['content_cut']
word_library_list = counter.get_word_library(content_token_lists)
single_frequency_words_list = counter.get_single_frequency_words(content_token_lists)

In [16]:
# Vocabulary cap for the content vectorizer: library size minus half (floor
# division) of the single-frequency word count. The three numbers are printed
# in that order: library size, single-frequency count, resulting cap.
content_library_size = len(word_library_list)
content_single_frequency_size = len(single_frequency_words_list)
max_features = content_library_size - (content_single_frequency_size // 2)
print(content_library_size, content_single_frequency_size, max_features)

49163 19390 39468


In [17]:
# Vectorize the space-joined content tokens. Unlike the title run, max_df is
# 0.95 here; the vocabulary is again capped at max_features.
content_vec_args = dict(max_df=0.95, min_df=1, max_features=max_features)
content_matrix = modeling.feature_extraction(
    df_content['content_'],
    vectorizer='CountVectorizer',
    vec_args=content_vec_args,
)
content_matrix.shape

(2619, 39468)

In [18]:
# Cluster the content vectors with DBSCAN using cosine distance (eps is 0.35
# here versus 0.4 for the titles).
content_dbscan = modeling.get_cluster(
    content_matrix,
    cluster='DBSCAN',
    cluster_args=dict(eps=0.35, min_samples=4, metric='cosine'),
)

In [19]:
# Take the per-row cluster labels from the DBSCAN result and store them
# next to the article bodies.
content_labels = modeling.get_labels(content_dbscan)
df_content['content_label'] = content_labels
# Convert labels to ranks. NOTE(review): presumably rank 1 is the largest
# real cluster - confirm in modeling.label2rank.
content_rank = modeling.label2rank(content_labels)
df_content['content_rank'] = content_rank
# Display how many rows carry each label (the recorded output includes a
# large -1 group; presumably DBSCAN's noise label - confirm).
Counter(content_labels)

Counter({-1: 1872,
         0: 385,
         1: 10,
         2: 30,
         3: 11,
         4: 15,
         5: 4,
         6: 13,
         7: 7,
         8: 17,
         9: 43,
         10: 5,
         11: 5,
         12: 4,
         13: 9,
         14: 4,
         15: 4,
         16: 5,
         17: 6,
         18: 12,
         19: 10,
         20: 4,
         21: 6,
         22: 8,
         23: 4,
         24: 4,
         25: 4,
         26: 10,
         27: 4,
         28: 10,
         29: 4,
         30: 4,
         31: 4,
         32: 5,
         33: 4,
         34: 4,
         35: 8,
         36: 9,
         37: 4,
         38: 5,
         39: 7,
         40: 4,
         41: 5,
         42: 4,
         43: 5,
         44: 5,
         45: 4,
         46: 5,
         47: 4})

In [20]:
# Rows of the content cluster ranked 2, then its fifteen most common words
# (min_frequency=1 is passed explicitly here).
df_ = df_content.loc[df_content['content_rank'].eq(2)]
content_top_list = counter.get_most_common_words(
    df_['content_cut'],
    top_n=15,
    min_frequency=1,
)
print(content_top_list)

['企业', '市场', '上市', 'cdr', 'a股', '发行', '独角兽', '试点', '创新', '经济', '估值', '存托', '公司', '凭证', '境内']


# 综合分析

In [21]:
# Optional shortcut, left disabled: load df_title / df_content from CSV files
# under results_path instead of using the frames computed above.
# NOTE(review): the two `.map(eval)` lines would execute whatever text is
# stored in the *_cut cells (presumably the token lists come back from CSV as
# strings). If this cell is re-enabled, prefer ast.literal_eval, which only
# parses Python literals.
# df_title = news_crawler.load_news(os.path.join(results_path, 'df_title_rank.csv'))
# df_content = news_crawler.load_news(os.path.join(results_path, 'df_content_rank.csv'))
# df_title['title_cut'] = df_title['title_cut'].map(eval)
# df_content['content_cut'] = df_content['content_cut'].map(eval)

In [22]:
# Start from a copy of the title frame, attach the content tokens and content
# ranks (aligned on the row index), and keep only the rows that
# get_non_outliers_data retains for the 'title_rank' column.
df_title_content = modeling.get_non_outliers_data(
    df_title.assign(
        content_cut=df_content['content_cut'],
        content_rank=df_content['content_rank'],
    ),
    label_column='title_rank',
)
# Number of distinct title ranks left after removing outliers.
title_rank_num = counter.get_num_of_value_no_repeat(df_title_content['title_rank'])
title_rank_num

25

In [24]:
# Print one report section per title cluster (ranks 1 .. title_rank_num):
# a representative headline followed by the frequent content words of every
# content cluster that the section's articles belong to.
for i in range(1, title_rank_num + 1):
    df_i = df_title_content[df_title_content['title_rank'] == i]
    # Feed all titles of the cluster (one per line) to get_key_sentences and
    # keep a single key sentence as the headline.
    title = modeling.get_key_sentences('\n'.join(df_i['title'].tolist()), num=1)
    print('热点：', title, '\n')
    # Visit the distinct content ranks in ascending order so the printed
    # report is reproducible. The ranks are iterated directly instead of being
    # stored under `content_rank`, which would overwrite the notebook-level
    # list of that name produced by the content-clustering cell.
    for j in sorted(set(df_i['content_rank'])):
        df_j = df_i[df_i['content_rank'] == j]
        most_common_words = counter.get_most_common_words(
            df_j['content_cut'], top_n=20, min_frequency=5)
        # Skip content clusters with no word reaching min_frequency.
        if len(most_common_words) > 0:
            print('相关词汇：', most_common_words)
    print('-' * 100)

热点： 中美贸易战再升级：特朗普考虑对1000亿美元中国商品加征关税 

相关词汇： ['美国', '中国', '关税', '贸易', '产品', '加征', '特朗普', '措施', '中美', '经济', '贸易战', '表示', '商品', '出口', '进口', '制造', '大豆', '国家', '公布', '调查']
相关词汇： ['美国', '用于', '超过', '中国', '产品', '包括', '机械', '合金钢', 'mm', '部件', '设备', '关税', '加工', '机器', '金属', '仪器', '装置', '附件', '发动机', '猪肉']
----------------------------------------------------------------------------------------------------
热点： 钜盛华拟清算持股万科资管计划 稳定股价成首要考虑 

相关词汇： ['万科', '计划', '资管', '钜盛华', '股份', '持有', '宝能', '管理', '持股', '相关', '市场', '减持', '清算', '转让', '交易', '处置', '股东', '资产', '协议', '股价']
----------------------------------------------------------------------------------------------------
热点： 中央财经委首提结构性去杠杆 一行两会准备怎么干 

相关词汇： ['杠杆', '攻坚战', '去杠杆', '打好', '政府', '金融风险', '中央财经委员会', '发展', '地方', '风险', '会议', '防范', '问题', '部门', '结构性', '化解', '工作', '坚持', '经济', '提出']
相关词汇： ['杠杆', '去杠杆', '企业', '结构性', '政府', '债务', '会议', '地方', '提出', '中央财经委员会', '国有企业', '金融', '部门', '上升', '工作', '中国', '经济', '负债', '融资', '表示']
-----------------------------------------