In [144]:
import pandas as pd
import sklearn
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [20]:
news = pd.read_csv('../train/sqlResult_1558435.csv', encoding='gb18030')

In [21]:
len(news)

89611

In [22]:
news = news.dropna(subset=['content'])

In [23]:
len(news)

87054

In [24]:
def tokenize(string):
    return ''.join(re.findall('[\w|\d]+', string))

In [25]:
def cutword(string):
    return ' '.join(list(jieba.cut(tokenize(string))))

In [26]:
cutword(tokenize(news.iloc[3].content))

'这是 6 月 18 日 在 葡萄牙 中部 大 佩德罗 冈 地区 拍摄 的 被 森林 大火 烧毁 的 汽车 新华社 记者 张立 云摄'

In [34]:
news_corpus = list(map(cutword, news.content))

## tfidf

In [40]:
vectorizer = TfidfVectorizer(min_df=0.015)

In [41]:
tfidf = vectorizer.fit_transform(news_corpus)

In [42]:
tfidf.shape

(87054, 945)

## split dataset

In [80]:
label = list(map(lambda n: 1 if '新华' in str(n) else 0, news.source))

In [89]:
label_xinhua = np.where(np.array(label) == 1)

In [91]:
len(label_xinhua[0])

78855

In [96]:
X_train, X_test, y_train, y_test = train_test_split(
    tfidf, label, train_size=0.7, random_state=42)



In [None]:
# tfidf data indices

In [211]:
def get_model_score(y, y_pred):
    return accuracy_score(y, y_pred), precision_score(y, y_pred), recall_score(y, y_pred), f1_score(y, y_pred)

### Naive Bayes

In [133]:
from sklearn.naive_bayes import MultinomialNB

In [134]:
nb_clf = MultinomialNB()

In [135]:
nb_clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [139]:
print(nb_clf.predict(X_test[1:100]))

[1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1
 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1]


In [145]:
y_predit = nb_clf.predict(X_test)

In [212]:
print(get_model_score(y_test,y_predit))

(0.89240724432362062, 0.96505771716361366, 0.91443895434773426, 0.93906670136178327)


In [146]:
accuracy_score(y_test,y_predit)

0.89240724432362062

In [150]:
precision_score(y_test,y_predit)

0.96505771716361366

In [151]:
recall_score(y_test,y_predit)

0.91443895434773426

In [152]:
f1_score(y_test,y_predit)

0.93906670136178327

### Linear Regression

In [157]:
from sklearn.linear_model import LinearRegression

In [159]:
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_train, y_train)

0.5161449775439868

In [161]:
y_reg_predit = reg.predict(X_test)

In [164]:
y_reg_predit

array([ 0.98486381,  0.44317965,  1.18662123, ...,  1.03028364,
        0.992662  ,  1.22976252])

### Logistic Regression

In [165]:
from sklearn.linear_model import LogisticRegression

In [166]:
lr_clf = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial').fit(X_train, y_train)

In [170]:
lr_clf.predict(X_train[:2])

array([1, 1])

In [172]:
y_train[:2]

[1, 1]

In [169]:
lr_clf.score(X_train, y_train)

0.98715066380031835

In [176]:
y_lr_predict = lr_clf.predict(X_test)

In [179]:
accuracy_score(y_test,y_lr_predict)

0.98261668644943911

In [180]:
precision_score(y_test,y_lr_predict)

0.98671360912024808

In [181]:
recall_score(y_test,y_lr_predict)

0.99421428269774903

In [182]:
f1_score(y_test,y_lr_predict)

0.99044974546678444

In [214]:
get_model_score(y_test,y_lr_predict)

(0.98261668644943911,
 0.98671360912024808,
 0.99421428269774903,
 0.99044974546678444)

### Decision Tree

In [215]:
from sklearn import tree

In [216]:
tree_clf=tree.DecisionTreeClassifier()

In [217]:
tree_clf=tree_clf.fit(X_train,y_train)

In [218]:
y_tree_predit=tree_clf.predict(X_test)

In [219]:
get_model_score(y_test,y_tree_predit)

(0.99127005398782397,
 0.99543668399036633,
 0.99493221842138602,
 0.99518438727664427)

### XGBoost

In [224]:
import xgboost

In [225]:
xgb_model = xgboost.XGBClassifier(objective="binary:logistic",random_state=42)

In [226]:
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [228]:
y_xgb_predit=xgb_model.predict(X_test)

In [229]:
get_model_score(y_test,y_xgb_predit)

(0.99268675575295784,
 0.99577845322526171,
 0.99615693230288438,
 0.995967656807482)

#### 从Score上看 XGBoost>Decision Tress>Logistic Regression>Naive Bayes

## 查找相似性文本

In [230]:
all_predict=xgb_model.predict(tfidf)

In [231]:
all_predict

array([0, 0, 0, ..., 1, 1, 1])

In [232]:
suspect_xinhua=[]

In [233]:
for i in range(len(all_predict)):
    if all_predict[i]==1 and label[i]==0:
        suspect_xinhua.append(i)

In [235]:
len(suspect_xinhua)

290

In [237]:
suspect_xinhua

[293,
 423,
 471,
 514,
 521,
 530,
 598,
 634,
 680,
 687,
 784,
 818,
 939,
 1030,
 1071,
 1084,
 1101,
 1117,
 1176,
 1183,
 1208,
 1217,
 1349,
 1527,
 1553,
 1607,
 1702,
 1813,
 1856,
 1862,
 1890,
 1954,
 1977,
 2373,
 2626,
 2901,
 2918,
 2921,
 2969,
 2974,
 2976,
 2979,
 3025,
 3029,
 3130,
 3146,
 3222,
 3531,
 3641,
 3710,
 3886,
 4040,
 4099,
 4180,
 4185,
 4203,
 4379,
 4386,
 4569,
 4604,
 4690,
 4691,
 4697,
 4716,
 4815,
 4817,
 4985,
 4990,
 5078,
 5112,
 5164,
 5172,
 5176,
 5179,
 5241,
 5245,
 5249,
 5298,
 5337,
 5354,
 5369,
 5374,
 5406,
 5411,
 5414,
 5423,
 5436,
 5621,
 5633,
 5651,
 5663,
 5665,
 5670,
 5676,
 5685,
 5686,
 5709,
 5727,
 5732,
 5794,
 5809,
 5829,
 5837,
 5859,
 5868,
 5869,
 5876,
 5885,
 5892,
 5901,
 5951,
 5998,
 6041,
 6057,
 6071,
 6132,
 6236,
 6257,
 6264,
 6272,
 6277,
 6279,
 6280,
 6284,
 6285,
 6286,
 6292,
 6293,
 6295,
 6298,
 6301,
 6305,
 6308,
 6312,
 6313,
 6317,
 6320,
 6325,
 6326,
 6331,
 6333,
 6339,
 6388,
 6396,
 6400

In [239]:
news.iloc[293]

id                                                     89322
author                                                新华社机器人
source                                             中国证券报?中证网
content    　　看盘：6月23日上证指数午间收报下跌0.74%\r\n　　中证网讯 6月23日, 上证指...
feature    {"type":"数据资金","site":"中证网","commentNum":"0","...
title                                  机器人半天收盘播报（2017年6月23日）
url        http://www.cs.com.cn/gppd/sjjj/201706/t2017062...
Name: 295, dtype: object

## Advanced : find the copied part from original text