In [144]:
import pandas as pd
import sklearn
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [20]:
news = pd.read_csv('../train/sqlResult_1558435.csv', encoding='gb18030')

In [21]:
len(news)

89611

In [22]:
news = news.dropna(subset=['content'])

In [23]:
len(news)

87054

In [24]:
def tokenize(string):
    return ''.join(re.findall('[\w|\d]+', string))

In [25]:
def cutword(string):
    return ' '.join(list(jieba.cut(tokenize(string))))

In [26]:
cutword(tokenize(news.iloc[3].content))

'这是 6 月 18 日 在 葡萄牙 中部 大 佩德罗 冈 地区 拍摄 的 被 森林 大火 烧毁 的 汽车 新华社 记者 张立 云摄'

In [34]:
news_corpus = list(map(cutword, news.content))

## tfidf

In [40]:
vectorizer = TfidfVectorizer(min_df=0.015)

In [41]:
tfidf = vectorizer.fit_transform(news_corpus)

In [42]:
tfidf.shape

(87054, 945)

## split dataset

In [80]:
label = list(map(lambda n: 1 if '新华' in str(n) else 0, news.source))

In [89]:
label_xinhua = np.where(np.array(label) == 1)

In [91]:
len(label_xinhua[0])

78855

In [96]:
X_train, X_test, y_train, y_test = train_test_split(
    tfidf, label, train_size=0.7, random_state=42)



In [None]:
# tfidf data indices

### Naive Bayes

In [133]:
from sklearn.naive_bayes import MultinomialNB

In [134]:
nb_clf = MultinomialNB()

In [135]:
nb_clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [139]:
print(nb_clf.predict(X_test[1:100]))

[1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1
 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1]


In [142]:
score=cross_validate(nb_clf,X=X_train,y=y_train,scoring=('accuracy','precision','recall','f1'),cv=3,return_train_score=True)

In [143]:
score

{'fit_time': array([ 0.05307698,  0.05718112,  0.05344677]),
 'score_time': array([ 0.04988289,  0.04628587,  0.03872514]),
 'test_accuracy': array([ 0.8860828 ,  0.89104963,  0.88514179]),
 'train_accuracy': array([ 0.89021268,  0.88812308,  0.88849231]),
 'test_precision': array([ 0.96387767,  0.96472683,  0.96362377]),
 'train_precision': array([ 0.96506676,  0.96444521,  0.96566759]),
 'test_recall': array([ 0.90822097,  0.91306003,  0.90740539]),
 'train_recall': array([ 0.91175511,  0.90998804,  0.90917247]),
 'test_f1': array([ 0.93522199,  0.93818263,  0.93466999]),
 'train_f1': array([ 0.93765377,  0.93642556,  0.93656884])}

In [145]:
y_predit = nb_clf.predict(X_test)

In [146]:
accuracy_score(y_test,y_predit)

0.89240724432362062

In [150]:
precision_score(y_test,y_predit)

0.96505771716361366

In [151]:
recall_score(y_test,y_predit)

0.91443895434773426

In [152]:
f1_score(y_test,y_predit)

0.93906670136178327

### Linear Regression

In [157]:
from sklearn.linear_model import LinearRegression

In [159]:
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_train, y_train)

0.5161449775439868

In [161]:
y_reg_predit = reg.predict(X_test)

In [164]:
y_reg_predit

array([ 0.98486381,  0.44317965,  1.18662123, ...,  1.03028364,
        0.992662  ,  1.22976252])

### Logistic Regression

In [165]:
from sklearn.linear_model import LogisticRegression

In [166]:
lr_clf = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial').fit(X_train, y_train)

In [170]:
lr_clf.predict(X_train[:2])

array([1, 1])

In [172]:
y_train[:2]

[1, 1]

In [169]:
lr_clf.score(X_train, y_train)

0.98715066380031835

In [176]:
y_lr_predict = lr_clf.predict(X_test)

In [179]:
accuracy_score(y_test,y_lr_predict)

0.98261668644943911

In [180]:
precision_score(y_test,y_lr_predict)

0.98671360912024808

In [181]:
recall_score(y_test,y_lr_predict)

0.99421428269774903

In [182]:
f1_score(y_test,y_lr_predict)

0.99044974546678444

#### 从Score上看 Logistic Regression>Naive Bayes

## 查找相似性文本

In [206]:
all_predict=lr_clf.predict(tfidf)

In [207]:
all_predict

array([0, 0, 0, ..., 1, 1, 1])

In [208]:
suspect_xinhua=[]

In [209]:
for i in range(len(all_predict)):
    if all_predict[i]==1 and label[i]==0:
        suspect_xinhua.append(i)

In [210]:
suspect_xinhua

[16,
 37,
 56,
 61,
 63,
 68,
 70,
 71,
 94,
 96,
 121,
 125,
 135,
 146,
 153,
 160,
 221,
 228,
 235,
 246,
 277,
 280,
 294,
 319,
 340,
 384,
 392,
 404,
 405,
 422,
 423,
 424,
 449,
 458,
 471,
 489,
 524,
 548,
 561,
 593,
 620,
 627,
 631,
 634,
 660,
 665,
 670,
 675,
 680,
 685,
 687,
 718,
 719,
 728,
 741,
 755,
 765,
 773,
 776,
 777,
 780,
 786,
 789,
 795,
 810,
 818,
 821,
 865,
 869,
 914,
 934,
 939,
 953,
 955,
 974,
 978,
 985,
 1018,
 1025,
 1030,
 1036,
 1055,
 1105,
 1116,
 1127,
 1142,
 1155,
 1176,
 1183,
 1184,
 1226,
 1227,
 1228,
 1275,
 1307,
 1325,
 1327,
 1328,
 1346,
 1347,
 1367,
 1376,
 1377,
 1381,
 1389,
 1390,
 1402,
 1407,
 1414,
 1417,
 1439,
 1441,
 1447,
 1457,
 1472,
 1473,
 1476,
 1477,
 1479,
 1482,
 1484,
 1487,
 1499,
 1518,
 1551,
 1552,
 1553,
 1566,
 1570,
 1579,
 1618,
 1630,
 1657,
 1667,
 1684,
 1687,
 1689,
 1711,
 1720,
 1739,
 1747,
 1768,
 1777,
 1794,
 1797,
 1798,
 1799,
 1807,
 1813,
 1815,
 1818,
 1828,
 1862,
 1867,
 1885,
 1

In [198]:
news.iloc[16]

id                                                     89601
author                                                   NaN
source                                              solidot@
content    九成以上的源包用逐位对应的方式构建，未来版本的 Debian 还将提供验证包的工具和元数据；...
feature    {"type":"软件","site":"cnbeta","commentNum":"12"...
title                                    Debian 9 Stretch 发布
url           http://www.cnbeta.com/articles/soft/623605.htm
Name: 16, dtype: object

## Advanced : find the copied part from original text