In [1]:
# -*- coding: utf-8 -*-

import os
from collections import Counter
from datetime import datetime

import pandas as pd

from utils import news_crawler
from utils import preprocessing
from utils import modeling

In [2]:
# Notebooks do not define ``__file__``, so emulate it. The path is built from
# the kernel's working directory instead of a machine-specific absolute path,
# which keeps the notebook runnable on any checkout.
# NOTE(review): assumes the kernel is started in the project directory (the
# directory that contains main.ipynb and data/) - confirm for your setup.
__file__ = os.path.join(os.getcwd(), 'main.ipynb')
# Project root: the directory containing this notebook.
project_path = os.path.dirname(os.path.realpath(__file__))
# Data directories, all located under <project>/data.
data_path = os.path.join(project_path, 'data')
news_path = os.path.join(data_path, 'news')
extra_dict_path = os.path.join(data_path, 'extra_dict')
results_path = os.path.join(data_path, 'results')

In [3]:
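# Live crawl, intentionally disabled: these lines fetch the latest news from
# each site and save the results as CSV files under news_path. The next cell
# loads the bundled sample_*.csv files instead.
# sina_news_df = news_crawler.get_latest_news('sina', top=1000, show_content=True)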
# sohu_news_df = news_crawler.get_latest_news('sohu', top=1000, show_content=True)
# xinhuanet_news_df = news_crawler.get_latest_news('xinhuanet', top=100, show_content=True)
# news_crawler.save_news(sina_news_df, os.path.join(news_path, 'sina_latest_news.csv'))
# news_crawler.save_news(sohu_news_df, os.path.join(news_path, 'sohu_latest_news.csv'))
# news_crawler.save_news(xinhuanet_news_df, os.path.join(news_path, 'xinhuanet_latest_news.csv'))

In [4]:
# Resolve the three sample CSV files (named for sina / sohu / xinhuanet).
sina_csv = os.path.join(news_path, 'sample_sina_latest_news.csv')
sohu_csv = os.path.join(news_path, 'sample_sohu_latest_news.csv')
xinhuanet_csv = os.path.join(news_path, 'sample_xinhuanet_latest_news.csv')

# Load each file through the project's loader.
sina_news_df = news_crawler.load_news(sina_csv)
sohu_news_df = news_crawler.load_news(sohu_csv)
xinhuanet_news_df = news_crawler.load_news(xinhuanet_csv)

# Stack the three frames; ignore_index=True discards the per-file indexes.
news_df = pd.concat(
    [sina_news_df, sohu_news_df, xinhuanet_news_df],
    ignore_index=True,
)

In [5]:
# The reference time is pinned to a fixed timestamp rather than the wall clock.
# Live equivalent: datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')
now_time = '2018-04-06 23:59'

# Filter the raw news, then select the rows relative to now_time.
# NOTE(review): delta=5 is presumably a look-back window - confirm its unit
# in preprocessing.get_data.
filtered_news = preprocessing.data_filter(news_df)
df = preprocessing.get_data(filtered_news, last_time=now_time, delta=5)

In [6]:
# Run every title through preprocessing.clean_title and keep the result in a
# separate working column so the original 'title' stays untouched.
df['title1'] = df['title'].map(preprocessing.clean_title)

In [7]:
# Title token pipeline. Steps, in order:
#   get_num_eng_ch -> pseg_cut (custom user dictionary)      => df['title1']
#   get_words_by_flags -> stop_words_cut (HIT list, then the
#   project list) -> disambiguation_cut                      => df['title_cut']
userdict_file = os.path.join(extra_dict_path, 'self_userdict.txt')
hit_stop_words_file = os.path.join(extra_dict_path, 'HIT_stop_words.txt')
self_stop_words_file = os.path.join(extra_dict_path, 'self_stop_words.txt')
disambiguation_file = os.path.join(extra_dict_path, 'self_disambiguation_dict.json')
# Flag patterns handed to get_words_by_flags; a fresh list is built per call.
kept_flags = ('n.*', '.*n', 'v.*', 't', 's', 'j', 'l', 'i', 'eng')

df['title1'] = (
    df['title1']
    .map(preprocessing.get_num_eng_ch)
    .map(lambda text: preprocessing.pseg_cut(text, userdict_path=userdict_file))
)
df['title_cut'] = (
    df['title1']
    .map(lambda tagged: preprocessing.get_words_by_flags(tagged, flags=list(kept_flags)))
    .map(lambda words: preprocessing.stop_words_cut(words, hit_stop_words_file))
    .map(lambda words: preprocessing.stop_words_cut(words, self_stop_words_file))
    .map(lambda words: preprocessing.disambiguation_cut(words, disambiguation_file))
)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.847 seconds.
Prefix dict has been built succesfully.


In [8]:
# Vectorise the cut titles. The vectorizer is selected by name and configured
# through vec_args.
title_vec_args = {'max_df': 1.0, 'min_df': 1}
matrix = modeling.feature_extraction(
    df['title_cut'],
    vectorizer='CountVectorizer',
    vec_args=title_vec_args,
)

In [9]:
# Cluster the title feature matrix with DBSCAN, using cosine distance.
title_cluster_args = {'eps': 0.5, 'min_samples': 5, 'metric': 'cosine'}
dbscan = modeling.get_cluster(
    matrix,
    cluster='DBSCAN',
    cluster_args=title_cluster_args,
)

In [10]:
# Attach the title clustering result to the frame.
# Counter comes from the notebook's top import cell rather than being imported
# mid-notebook.
labels = modeling.get_labels(dbscan)
df['title_label'] = labels
# label2rank derives a rank value from each label; rank 1 is selected later.
title_rank = modeling.label2rank(labels)
df['title_rank'] = title_rank
# Last expression: display how many titles received each label.
Counter(labels)

Counter({-1: 2204,
         0: 269,
         1: 8,
         2: 12,
         3: 5,
         4: 10,
         5: 8,
         6: 13,
         7: 5,
         8: 7,
         9: 12,
         10: 8,
         11: 16,
         12: 8,
         13: 9,
         14: 5,
         15: 8,
         16: 5,
         17: 7})

In [11]:
# Count the labels produced by the title clustering and display the result.
# NOTE(review): the recorded output (19) matches the number of distinct keys in
# the Counter output above, -1 included - confirm whether -1 should be counted.
labelnum = modeling.get_labelnum(labels)
labelnum

19

In [12]:
# Disabled alternative selection, kept for reference:
# df_label = modeling.get_non_outliers_data(df,label_column='title_label')
# df_1 = modeling.get_data_sort_labelnum(df_label,label_column='title_label', top=4)

# Keep the rows whose title cluster has rank 1, then collect the 15 most
# common entries from their cut titles.
is_top_title = df['title_rank'] == 1
df1 = df[is_top_title]
top_list = modeling.get_most_common(df1['title_cut'], n=15)
# print(df1)
# print(top_list)

In [13]:
# Content token pipeline. Steps, in order:
#   clean_content -> get_num_eng_ch -> pseg_cut (custom user
#   dictionary)                                              => df['content1']
#   get_words_by_flags -> stop_words_cut (HIT list, then the
#   project list) -> disambiguation_cut                      => df['content_cut']
userdict_file = os.path.join(extra_dict_path, 'self_userdict.txt')
hit_stop_words_file = os.path.join(extra_dict_path, 'HIT_stop_words.txt')
self_stop_words_file = os.path.join(extra_dict_path, 'self_stop_words.txt')
disambiguation_file = os.path.join(extra_dict_path, 'self_disambiguation_dict.json')
# Flag patterns handed to get_words_by_flags; a fresh list is built per call.
kept_flags = ('n.*', '.*n', 'v.*', 't', 's', 'j', 'l', 'i', 'eng')

df['content1'] = (
    df['content']
    .map(preprocessing.clean_content)
    .map(preprocessing.get_num_eng_ch)
    .map(lambda text: preprocessing.pseg_cut(text, userdict_path=userdict_file))
)
df['content_cut'] = (
    df['content1']
    .map(lambda tagged: preprocessing.get_words_by_flags(tagged, flags=list(kept_flags)))
    .map(lambda words: preprocessing.stop_words_cut(words, hit_stop_words_file))
    .map(lambda words: preprocessing.stop_words_cut(words, self_stop_words_file))
    .map(lambda words: preprocessing.disambiguation_cut(words, disambiguation_file))
)

In [14]:
# Vectorise the cut article bodies, then cluster them with DBSCAN (cosine
# distance). This rebinds `matrix` and `dbscan` from the title stage.
content_vec_args = {'max_df': 0.95, 'min_df': 1, 'max_features': None}
content_cluster_args = {'eps': 0.5, 'min_samples': 5, 'metric': 'cosine'}

matrix = modeling.feature_extraction(
    df['content_cut'],
    vectorizer='CountVectorizer',
    vec_args=content_vec_args,
)
dbscan = modeling.get_cluster(
    matrix,
    cluster='DBSCAN',
    cluster_args=content_cluster_args,
)

In [15]:
# Attach the content clustering result to the frame.
# Counter comes from the notebook's top import cell; the duplicate
# mid-notebook import has been removed.
labels = modeling.get_labels(dbscan)
df['content_label'] = labels
# label2rank derives a rank value from each label; rank 1 is selected later.
content_rank = modeling.label2rank(labels)
df['content_rank'] = content_rank
# Last expression: display how many articles received each label.
Counter(labels)

Counter({-1: 1333,
         0: 673,
         1: 16,
         2: 12,
         3: 34,
         4: 59,
         5: 10,
         6: 118,
         7: 13,
         8: 113,
         9: 5,
         10: 11,
         11: 13,
         12: 24,
         13: 7,
         14: 43,
         15: 7,
         16: 5,
         17: 10,
         18: 6,
         19: 5,
         20: 5,
         21: 27,
         22: 8,
         23: 8,
         24: 5,
         25: 10,
         26: 5,
         27: 6,
         28: 6,
         29: 5,
         30: 5,
         31: 7,
         32: 5})

In [18]:
# Keep the rows whose content cluster has rank 1, then collect the 15 most
# common entries from their cut bodies. This rebinds df1 / top_list from the
# title stage.
is_top_content = df['content_rank'] == 1
df1 = df[is_top_content]
top_list = modeling.get_most_common(df1['content_cut'], n=15)

# Plain-text dumps of the selected rows and of the most common entries.
print(df1)
print(top_list)

                                                  title              time  \
1                                          靠“不可理喻”赢不了未来  2018-04-06 23:54   
3                            【美国3月非农数据点评】非农虽不及预期，难改6月加息  2018-04-06 23:50   
5                                 特朗普警告美国投资者：贸易对峙长远看有好处  2018-04-06 23:44   
6                                涨！涨！涨！征税尚未落地，饲料涨价却已先行！  2018-04-06 23:42   
7                          今晚！两颗重磅核弹引爆避险高潮 美元急坠黄金暴拉14美元  2018-04-06 23:40   
8                                还有两大重磅 英镑/美元飚至1.4100静候  2018-04-06 23:40   
9                              非农远低于预期美元受挫 黄金上涨重返1330一线  2018-04-06 23:39   
13                                对华贸易为什么逆差——美国贸易政策长期扭曲  2018-04-06 23:23   
14                             抛售美债赢得贸易战？少量减持或迫使特朗普改变政策  2018-04-06 23:22   
15                            “再征一千亿关税”惊呆美国人：总统是不是“胡扯”？  2018-04-06 23:21   
16                            意大利评论：特朗普太荒谬，中国很给力，我们等着商机  2018-04-06 23:20   
17                              商务部回应美方新增征税清单：不惜代价 坚决回击  2018-04-06 23:17   