In [1]:
import numpy as np
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

In [2]:
news = pd.read_csv('sqlResult_1558435.csv', encoding='gb18030')

In [4]:
# 查看数据有无缺失值
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89611 entries, 0 to 89610
Data columns (total 7 columns):
id         89611 non-null int64
author     79396 non-null object
source     89609 non-null object
content    87054 non-null object
feature    89611 non-null object
title      89577 non-null object
url        87144 non-null object
dtypes: int64(1), object(6)
memory usage: 4.8+ MB


In [6]:
# 去除缺失值
news.dropna(inplace=True)

In [7]:
news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76925 entries, 4 to 89610
Data columns (total 7 columns):
id         76925 non-null int64
author     76925 non-null object
source     76925 non-null object
content    76925 non-null object
feature    76925 non-null object
title      76925 non-null object
url        76925 non-null object
dtypes: int64(1), object(6)
memory usage: 4.7+ MB


In [10]:
# 替换 label
news.replace({'source': r'新华社'}, {'source': 1}, regex=True, inplace=True)

In [11]:
news.head(300)

Unnamed: 0,id,author,source,content,feature,title,url
4,89613,胡淑丽_MN7479,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""978"",...",44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随,http://news.163.com/17/0618/00/CN617P3Q0001875...
5,89612,张怡,中国证券报?中证网,受到A股被纳入MSCI指数的利好消息刺激，A股市场从周三开始再度上演龙马行情，周四上午金...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",金融股一枝独秀 配置价值犹存,http://www.cs.com.cn/gppd/201706/t20170623_533...
7,89610,申玉彬 整理,中国证券报?中证网,沙漠雄鹰：震荡有利消化套牢筹码\r\n 周四开盘上证50在银行券商大蓝筹带动下一度涨近...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",博友早评：震荡有利消化套牢筹码,http://www.cs.com.cn/gppd/201706/t20170623_533...
8,89609,李杭_BJS4645,荆楚网-楚天都市报,（原标题：武汉警方一下子抓了808人，还都是俊男靓女！原来他们每天偷偷摸摸干这事！）\r\n...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""1600""...",武汉千余警察出动 抓获808名俊男靓女全是诈骗犯,http://news.163.com/17/0614/14/CMT9N8G80001899...
9,89608,吴瞬,中国证券报?中证网,6月21日，A股纳入MSCI指数尘埃落定，但当天被寄予厚望的券商股并未扛起反弹大旗。22...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",纳入MSCI指数 A股长期配置价值提升,http://www.cs.com.cn/gppd/201706/t20170623_533...
13,89604,申玉彬 整理,中国证券报?中证网,曹先生：风格转换前的阵痛\r\n 今日早盘两地低开，之后一度震荡走高，领涨的仍然是上证...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",博友午评：风格转换前的阵痛,http://www.cs.com.cn/gppd/201706/t20170623_533...
15,89602,费天元,中国证券网,中国证券网讯（记者 费天元）沪深两市周五临近午盘出现快速跳水，沪指连续跌穿5日、10日及...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",地产股拉升难阻市场颓势 三大指数早盘集体跳水,http://www.cs.com.cn/gppd/201706/t20170623_533...
18,89599,张怡,中国证券报?中证网,中证网讯 （本报记者 张怡）今日，三板做市指数继续低开下行，盘中最低触及1057.91点...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",新三板午评：三板做市指数刷新阶段低点,http://www.cs.com.cn/gppd/201706/t20170623_533...
19,89598,周佳 整理,中国证券报?中证网,提示声明：\r\n 本文涉及的行业及个股分析来源于券商研究报告，仅为分析人士对该行业及...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",机构解析：下周热点板块及个股探秘（附股）,http://www.cs.com.cn/gppd/hyyj/201706/t2017062...
21,89596,周佳 整理,中国证券报?中证网,提示声明：\r\n 本文涉及的行业及个股分析来源于券商研究报告，仅为分析人士对该行业及...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",机构推荐：下周具备布局潜力金股,http://www.cs.com.cn/gppd/tzpj/201706/t2017062...


In [13]:
news.loc[news['source'] != 1, 'source'] = 0

In [14]:
news.head(300)

Unnamed: 0,id,author,source,content,feature,title,url
4,89613,胡淑丽_MN7479,0,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""978"",...",44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随,http://news.163.com/17/0618/00/CN617P3Q0001875...
5,89612,张怡,0,受到A股被纳入MSCI指数的利好消息刺激，A股市场从周三开始再度上演龙马行情，周四上午金...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",金融股一枝独秀 配置价值犹存,http://www.cs.com.cn/gppd/201706/t20170623_533...
7,89610,申玉彬 整理,0,沙漠雄鹰：震荡有利消化套牢筹码\r\n 周四开盘上证50在银行券商大蓝筹带动下一度涨近...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",博友早评：震荡有利消化套牢筹码,http://www.cs.com.cn/gppd/201706/t20170623_533...
8,89609,李杭_BJS4645,0,（原标题：武汉警方一下子抓了808人，还都是俊男靓女！原来他们每天偷偷摸摸干这事！）\r\n...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""1600""...",武汉千余警察出动 抓获808名俊男靓女全是诈骗犯,http://news.163.com/17/0614/14/CMT9N8G80001899...
9,89608,吴瞬,0,6月21日，A股纳入MSCI指数尘埃落定，但当天被寄予厚望的券商股并未扛起反弹大旗。22...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",纳入MSCI指数 A股长期配置价值提升,http://www.cs.com.cn/gppd/201706/t20170623_533...
13,89604,申玉彬 整理,0,曹先生：风格转换前的阵痛\r\n 今日早盘两地低开，之后一度震荡走高，领涨的仍然是上证...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",博友午评：风格转换前的阵痛,http://www.cs.com.cn/gppd/201706/t20170623_533...
15,89602,费天元,0,中国证券网讯（记者 费天元）沪深两市周五临近午盘出现快速跳水，沪指连续跌穿5日、10日及...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",地产股拉升难阻市场颓势 三大指数早盘集体跳水,http://www.cs.com.cn/gppd/201706/t20170623_533...
18,89599,张怡,0,中证网讯 （本报记者 张怡）今日，三板做市指数继续低开下行，盘中最低触及1057.91点...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",新三板午评：三板做市指数刷新阶段低点,http://www.cs.com.cn/gppd/201706/t20170623_533...
19,89598,周佳 整理,0,提示声明：\r\n 本文涉及的行业及个股分析来源于券商研究报告，仅为分析人士对该行业及...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",机构解析：下周热点板块及个股探秘（附股）,http://www.cs.com.cn/gppd/hyyj/201706/t2017062...
21,89596,周佳 整理,0,提示声明：\r\n 本文涉及的行业及个股分析来源于券商研究报告，仅为分析人士对该行业及...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",机构推荐：下周具备布局潜力金股,http://www.cs.com.cn/gppd/tzpj/201706/t2017062...


In [15]:
with open('./stop_words.utf8') as f: 
    stop_words =  [l.strip() for l in f]

In [16]:
def cut(string): return [word for word in list(jieba.cut(string)) if word not in stop_words]

In [17]:
all_contents = news['content']

In [21]:
all_contents.iloc[1]

'\u3000\u3000受到A股被纳入MSCI指数的利好消息刺激，A股市场从周三开始再度上演龙马行情，周四上午金融股和白马股表现喜人，但是尾盘跳水之后，仅金融板块仍维系红盘状态。分析人士认为，金融股受益于MSCI纳入A股和低估值而重获资金青睐，但是存量资金博弈格局下，风格交替的震荡格局料延续。流动性改善、经济悲观预期修正等有助于支撑板块继而大盘指数逐步向好。\r\n\u3000\u3000“一九”再现\r\n\u3000\u3000周四，A股市场未能延续周三的上行态势，两市成交小幅放量。29个中信一级行业中，收盘仅银行和非银行金融两个行业指数收红，分别上涨1.80%和0.20%。\r\n\u3000\u3000从二级行业来看，股份制与城商行的涨幅最高，达到2.22%，国有银行上涨0.82%，信托及其他上涨0.64%，保险板块上涨0.34%，证券板块上涨0.06%。\r\n\u3000\u3000银行板块25只成分股中，共有21只收红。其中，招商银行涨幅最大，上涨6.66%，贵阳银行上涨3.65%，上海银行、华夏银行、浦发银行和兴业银行的涨幅均超过1.50%。非银行金融板块44只成分股中，共17只个股上涨。其中，安信信托、中国太保涨幅居前两名，分别上涨4.57%和3.04%，西水股份、华安证券、中国人寿和新华保险的涨幅也均超过2%。相对而言，券商股多小幅下跌。\r\n\u3000\u3000近期，对A股市场消息面影响最大的就是MSCI宣布从2018年6月开始将A股纳入MSCI新兴市场指数。而其中，金融股是占比最大的一个群体。国金证券李立峰团队指出，最新方案中包含的222只成分股中，剔除了中等市值、非互联互通可交易的股票以及有停牌限制的标的，由于纳入了很多大市值AH股，A股在MSCI EM中的权重由0.5%上升到了0.73%。其中，金融板块占比最高，达到40.11%，泛消费次之，占比为24.26%，两个板块涵盖了大部分权重股。动态来看，由于加入了很多是指占比高的金融公司，金融板块的权重增加了近一半，其他大部分行业权重都受到了稀释。\r\n\u3000\u3000尽管A股被纳入MSCI这一利好事件对短期市场情绪有所提振、对中长期海外增量资金预期升温，但短期内，市场量能尚不能有效放大，金融股独乐乐情景也就难以持续。存量博弈格局下，风格交替、指数震荡格局难改变。\r\n\u

In [22]:
all_contents = [' '.join(cut(s)) for s in all_contents]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/_8/f14fxxnn7w13cd0l5x9hl4bm0000gn/T/jieba.cache
Loading model cost 0.839 seconds.
Prefix dict has been built succesfully.


In [23]:
all_contents[1]

'\u3000 \u3000 A股 纳入 MSCI 指数 利好 消息 刺激 A股 市场 周三 再度 上演 龙马 行情 周四 上午 金融股 白马股 表现 喜人 尾盘 跳水 之后 仅 金融 板块 维系 红盘 状态 分析 人士 认为 金融股 受益 MSCI 纳入 A股 低 估值 重获 资金 青睐 存量 资金 博弈 格局 风格 交替 震荡 格局 料 延续 流动性 改善 经济 悲观 预期 修正 有助于 支撑 板块 大盘 指数 \r\n \u3000 \u3000 九 再现 \r\n \u3000 \u3000 周四 A股 市场 未能 延续 周三 上行 态势 两市 成交 小幅 放量 29 中信 一级 行业 中 收盘 仅 银行 非银行 金融 两个 行业 指数 收红 上涨 1.80% 0.20% \r\n \u3000 \u3000 二级 行业 来看 股份制 城商行 涨幅 最高 达到 2.22% 国有银行 上涨 0.82% 信托 上涨 0.64% 保险 板块 上涨 0.34% 证券 板块 上涨 0.06% \r\n \u3000 \u3000 银行 板块 25 成分股 中 共有 21 收红 招商银行 涨幅 最大 上涨 6.66% 贵阳 银行 上涨 3.65% 上海银行 华夏银行 浦发银行 兴业银行 涨幅 均 超过 1.50% 非银行 金融 板块 44 成分股 中 共 17 个股 上涨 安信 信托 中国 太保 涨幅 居前 两名 上涨 4.57% 3.04% 西水股份 华安 证券 中国 人寿 新华 保险 涨幅 均 超过 2% 券商 股多 小幅 下跌 \r\n \u3000 \u3000 近期 A股 市场 消息面 影响 最大 MSCI 宣布 2018 年 月 A股 纳入 MSCI 新兴 市场 指数 金融股 占 最大 一个 群体 国金 证券 李立峰 团队 指出 最新 方案 中 包含 222 成分股 中 剔除 中等 市值 非 互联互通 交易 股票 停牌 限制 标的 纳入 很多 市值 A H股 A股 MSCI   EM 中 权重 0.5% 上升 0.73% 金融 板块 占 最高 达到 40.11% 泛 消费 次之 占 24.26% 两个 板块 涵盖 大部分 权重股 动态 来看 加入 很多 指 占 比高 金融公司 金融 板块 权重 增加 近一半 大部分 行业 权重 稀释 \r\n \u3000 

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
vectorizer = TfidfVectorizer(max_features=10000)

In [30]:
X = vectorizer.fit_transform(all_contents)

In [31]:
X.shape

(76925, 10000)

In [32]:
y = pd.to_numeric(news['source']).values

In [33]:
y

array([0, 0, 0, ..., 1, 1, 1])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [35]:
clf = RandomForestClassifier(min_samples_split=100, min_samples_leaf=20, max_depth=8, max_features='sqrt', 
                             random_state=0)

In [36]:
# 调参决策树数量
param_grid = [{'n_estimators':range(10,71,10)}]

In [37]:
search1 = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)

In [38]:
search1.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   27.6s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'n_estimators': range(10, 71, 10)}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [42]:
search1.best_params_

{'n_estimators': 60}

In [43]:
search1.best_score_

0.994253858026569

In [44]:
# 调参树最大深度和结点最小样本量可分割
param_grid = [{'max_depth':range(3,14,2), 'min_samples_split':range(50,201,20)}]

In [45]:
clf = RandomForestClassifier(n_estimators=60, min_samples_leaf=20, max_features='sqrt', 
                             random_state=0)

In [46]:
search2 = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)

In [47]:
search2.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  4.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'max_depth': range(3, 14, 2), 'min_samples_split': range(50, 201, 20)}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [48]:
search2.best_params_

{'max_depth': 13, 'min_samples_split': 90}

In [49]:
search2.best_score_

0.9960169786102593

In [51]:
# 调参结点最小样本量可分割和叶子结点最小样本量
param_grid = [{'min_samples_split':range(80,150,20), 'min_samples_leaf':range(10,60,10)}]

In [52]:
clf = RandomForestClassifier(n_estimators=60,max_depth=13, max_features='sqrt', 
                             random_state=0)

In [54]:
search3 = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)

In [55]:
search3.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'min_samples_split': range(80, 150, 20), 'min_samples_leaf': range(10, 60, 10)}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [56]:
search3.best_params_

{'min_samples_leaf': 10, 'min_samples_split': 100}

In [57]:
search3.best_score_

0.9961851630728471

In [58]:
# 调参分割时考虑特征总个数
param_grid = [{'max_features': range(3,11,2)}]

In [59]:
clf = RandomForestClassifier(n_estimators=60,max_depth=13, min_samples_split=100, min_samples_leaf=10, 
                             random_state=0)

In [61]:
search4 = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)

In [62]:
search4.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   14.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   14.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'max_features': range(3, 11, 2)}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [63]:
search4.best_params_

{'max_features': 7}

In [64]:
search4.best_score_

0.961386818662112

In [70]:
clf = RandomForestClassifier(n_estimators=60,max_depth=13, min_samples_split=100, min_samples_leaf=10, max_features=7, 
                             random_state=0)

In [71]:
scores = cross_val_score(clf, X_train, y_train,cv=5,scoring='roc_auc')
print (scores.mean(), scores)

0.9613869673901343 [0.95738269 0.95847665 0.9656222  0.95925216 0.96620114]


In [74]:
scores = cross_val_score(clf, X_test, y_test,cv=5,scoring='accuracy')
print (scores.mean(), scores)

0.9693214425869977 [0.96923744 0.96923744 0.96923744 0.96944745 0.96944745]
