In [18]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm
import re
import jieba

from sklearn.feature_extraction.text import TfidfVectorizer

# 数据预处理

##### loading data

In [2]:
csv_path = os.path.abspath('../data/sqlResult_1558435.csv')
print(csv_path)
news = pd.read_csv(csv_path,encoding='gb18030')

/Users/zhengtianyu/Documents/Cris-Mac/EntryTest/Lecuter/data/sqlResult_1558435.csv


In [3]:
print(len(news))
news.head(2)

89611


Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm


##### drop NaN

In [4]:
news_dropna = news.dropna(subset=['source','content'])
print('删除了{}行,还剩{}行samples'.format(len(news)-len(news_dropna),
                                   len(news_dropna)))

删除了2559行,还剩87052行samples


##### features selecting

In [5]:
data = news_dropna.loc[:,['content','source']]
data.loc[7:11,:]

Unnamed: 0,content,source
7,沙漠雄鹰：震荡有利消化套牢筹码\r\n 周四开盘上证50在银行券商大蓝筹带动下一度涨近...,中国证券报?中证网
8,（原标题：武汉警方一下子抓了808人，还都是俊男靓女！原来他们每天偷偷摸摸干这事！）\r\n...,荆楚网-楚天都市报
9,6月21日，A股纳入MSCI指数尘埃落定，但当天被寄予厚望的券商股并未扛起反弹大旗。22...,中国证券报?中证网
10,证券时报网（www.stcn.com）06月23日讯\r\n 据上证报道，6月初以来，...,证券时报网
11,?\r\n 巨丰早评：市场将再次探底\r\n 【巨丰观点】\r\n 周四大盘冲高回落，...,证券时报网


##### label encoding

In [6]:
# explore data firstly----(n_positive:新华社新闻samples数，反之)

n_positive = data[data['source']=='新华社']['source'].value_counts().values[0]

series_neg = data[data['source']!='新华社']['source'].value_counts()
n_negative = sum(series_neg.values)

print('新华社文章占比{}%'.format(n_positive/len(data)*100))
print('非新华社文章占比{}%'.format(n_negative/len(data)*100))

新华社文章占比90.36093369480311%
非新华社文章占比9.639066305196895%


In [7]:
# add a column named 'ori_source' to pandasdataframe in a bit to detect specific real source
data.loc[:,'ori_source'] = data['source'].values.tolist()
data[:5]

Unnamed: 0,content,source,ori_source
0,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,快科技@http://www.kkj.cn/,快科技@http://www.kkj.cn/
1,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,快科技@http://www.kkj.cn/,快科技@http://www.kkj.cn/
2,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...,快科技@http://www.kkj.cn/,快科技@http://www.kkj.cn/
3,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n,新华社,新华社
4,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,深圳大件事,深圳大件事


In [8]:
# let the article which source is '新华社' be the value 1.

pos_index = data[data.loc[:,'source']=='新华社'].index.tolist()
neg_index = data[data.loc[:,'source']!='新华社'].index.tolist()
data.loc[pos_index,'source'] = 1
data.loc[neg_index,'source'] = 0

# A value is trying to be set on a copy of a slice from a DataFrame.
# Try using .loc[row_indexer,col_indexer] = value instead
# (.loc[]会覆盖原始数据)


In [9]:
data[2:5]

Unnamed: 0,content,source,ori_source
2,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...,0,快科技@http://www.kkj.cn/
3,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n,1,新华社
4,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,0,深圳大件事


##### feature selecting v.2

In [10]:
corpus = data['content'].tolist()
label =  data['source'].values.tolist()

In [11]:
print(len(corpus))
print(len(label))

87052
87052


##### token

In [14]:
corpus_final = []
for article in tqdm(corpus, position=0):
    article_re = ''.join(re.findall(r'\w+',article)) # re matching
    corpus_final.append(' '.join(jieba.cut(article_re))) # token

  0%|          | 0/87052 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/d6/lxfh6jmn7_1f52m5fmwm18hw0000gn/T/jieba.cache
Loading model cost 0.652 seconds.
Prefix dict has been built succesfully.
100%|██████████| 87052/87052 [01:47<00:00, 810.47it/s] 


In [17]:
corpus_final[0]

'此外 自 本周 6 月 12 日起 除 小米 手机 6 等 15 款 机型 外 其余 机型 已 暂停 更新 发布 含 开发 版 体验版 内测 稳定版 暂不受 影响 以 确保 工程师 可以 集中 全部 精力 进行 系统优化 工作 有人 猜测 这 也 是 将 精力 主要 用到 MIUI9 的 研发 之中 MIUI8 去年 5 月 发布 距今已有 一年 有余 也 是 时候 更新换代 了 当然 关于 MIUI9 的 确切 信息 我们 还是 等待 官方消息'

# Text Vectorization

In [19]:
vectorizer = TfidfVectorizer(max_features=300) # 备选参数ngram_range=(1,2or3)

In [20]:
X = vectorizer.fit_transform(corpus_final)

In [23]:
print(type(X))
print(X.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(87052, 300)


In [35]:
y = label

In [27]:
len(y)

87052

# Build Model

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

##### Train set adn Test set Splitting

In [29]:
X_train, x_test, Y_train, y_test = train_test_split(X, y,
                                                   test_size=0.15,
                                                   random_state=41)

In [31]:
print(X_train.shape)
print(x_test.shape)

(73994, 300)
(13058, 300)


In [37]:
print(len(Y_train))
print(len(y_test))

73994
13058


##### SVM

从结果的角度，LinearSVC和使用SVC且kernel传入linear，结果是一致的。但是由于LinearSVC只能计算线性核，而SVC可以计算任意核，所以，他们的底层计算方式不一样，这使得同样使用线性核的SVC，用LinearSVC的计算速度，要比用SVC且kernel传入linear参数，快很多。



所以，整体而言，如果你决定使用线性SVM，就使用LinearSVC，但如果你要是用其他核的SVM，就只能使用SVC：）



P.S. LineaSVR和SVR同理：）

In [39]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train,
                                                     test_size=0.15,
                                                     random_state=42)

In [43]:
print(x_train.shape)
print(x_valid.shape)

(62894, 300)
(11100, 300)


In [98]:
lsvc = LinearSVC(dual=False, penalty='l1')

In [99]:
lsvc.fit(x_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)

In [100]:
lsvc.score(x_valid, y_valid)

0.984954954954955

In [101]:
y_pred = lsvc.predict(x_valid)
y_pred_proba = lsvc.decision_function(x_valid) # Predict confidence scores for samples.

In [102]:
precision_score(y_valid, y_pred)

0.9916192756659683

In [103]:
recall_score(y_valid,y_pred)

0.9917182199161844

In [104]:
f1_score(y_valid, y_pred)

0.9916687453230232

In [105]:
roc_auc_score(y_valid, y_pred_proba)

0.995496780922416

##### Optimize the SVM Parameters via the grid-search

In [167]:
from sklearn.model_selection import GridSearchCV

In [168]:
param_grid = [
    {'dual':[False], 'C':[1,3,5,7], 'penalty':['l1','l2']},
    {'dual':[False], 'verbose':[0,2,4], 'max_iter':[1000,10000]}
]
lsvm = LinearSVC()
grid_search_lsvm = GridSearchCV(lsvm, param_grid, cv=5)
grid_search_lsvm.fit(x_train, y_train)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'dual': [False], 'C': [1, 3, 5, 7], 'penalty': ['l1', 'l2']}, {'dual': [False], 'verbose': [0, 2, 4], 'max_iter': [1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [169]:
grid_search_lsvm.best_params_

{'C': 1, 'dual': False, 'penalty': 'l1'}

In [170]:
grid_search_lsvm.best_estimator_

LinearSVC(C=1, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)

In [116]:
cvres = grid_search_lsvm.cv_results_

for mean_score, params in zip(cvres['mean_train_score'], cvres['params']):
    print(mean_score, params)

0.9861592544914413 {'C': 1, 'dual': False, 'penalty': 'l1'}
0.9841161351266653 {'C': 1, 'dual': False, 'penalty': 'l2'}
0.9861910542325697 {'C': 3, 'dual': False, 'penalty': 'l1'}
0.9851416696483195 {'C': 3, 'dual': False, 'penalty': 'l2'}
0.9861473300131424 {'C': 5, 'dual': False, 'penalty': 'l1'}
0.9854676150791393 {'C': 5, 'dual': False, 'penalty': 'l2'}
0.9861513049709085 {'C': 7, 'dual': False, 'penalty': 'l1'}
0.9856345626733154 {'C': 7, 'dual': False, 'penalty': 'l2'}
0.9841161351266653 {'dual': False, 'max_iter': 1000, 'verbose': 0}
0.9841161351266653 {'dual': False, 'max_iter': 1000, 'verbose': 2}
0.9841161351266653 {'dual': False, 'max_iter': 1000, 'verbose': 4}
0.9841161351266653 {'dual': False, 'max_iter': 10000, 'verbose': 0}
0.9841161351266653 {'dual': False, 'max_iter': 10000, 'verbose': 2}
0.9841161351266653 {'dual': False, 'max_iter': 10000, 'verbose': 4}


##### Random Forest

In [119]:
rfc = RandomForestClassifier()

In [120]:
rfc.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [151]:
rfc.score(x_valid, y_valid)

0.9891891891891892

In [152]:
y_pred = rfc.predict(x_valid)
y_pred_prob = rfc.predict_proba(x_valid)

In [153]:
precision_score(y_valid, y_pred)

0.993029277036447

In [128]:
recall_score(y_valid, y_pred)

0.9950109758531231

In [129]:
f1_score(y_valid, y_pred)

0.994019138755981

In [132]:
roc_auc_score(y_valid, y_pred_prob[:,1])

0.9926390604862252

##### Optimize the RandomForest parameters via GridSearch

In [133]:
param_grid = [
    {'n_estimators':[3,10,30], 'max_features':[2,4,6,8]},
]
forest_clf = RandomForestClassifier()
grid_search_rfc = GridSearchCV(forest_clf, param_grid, cv=5)
grid_search_rfc.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [134]:
grid_search_rfc.best_params_

{'max_features': 8, 'n_estimators': 30}

In [136]:
forest_clf = RandomForestClassifier(max_features=8, n_estimators=30)

In [137]:
forest_clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [166]:
forest_clf.score(x_valid, y_valid)

0.9902702702702703

In [155]:
y_pred = forest_clf.predict(x_valid)
y_pred_prob = forest_clf.predict_proba(x_valid)

In [157]:
precision_score(y_valid, y_pred)

0.9923520063567739

In [158]:
recall_score(y_valid, y_pred)

0.9969068050289364

In [159]:
f1_score(y_valid, y_pred)

0.9946241911398707

In [160]:
roc_auc_score(y_valid, y_pred_prob[:,1])

0.9964916238079564

##### K-Means

In [161]:
from sklearn.cluster import KMeans

In [172]:
k_means = KMeans()

In [180]:
k_means.fit(X_train) # 本机训练耗时约6分钟

KeyboardInterrupt: 

In [179]:
# Test Demo K-means from official documentation
from sklearn.cluster import KMeans
import numpy as np

X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])

kmeans = KMeans(n_clusters=3, random_state=43).fit(X)
print(kmeans.labels_) # 蔟

print(kmeans.predict([[0, 0], [12, 3]]))

kmeans.cluster_centers_ # 蔟心


[1 1 1 0 0 2]
[1 0]


array([[10.,  3.],
       [ 1.,  2.],
       [10.,  0.]])

In [183]:
k_means.score(x_test) # k_maens 评判标准？

-10045.84627499915

### 各模型的优缺点

1. Linear Regression
1. 优点：速度快，模型易于解释
2. 缺点：对异常值敏感，无法拟合复杂的非线性关系

1. Logistic Regression
1. 优点：结果是介于0-1之间的数值可以看作是概率；可以适用于连续数值和标称类别的变量；模型易于解释；计算代价不高，易于实现
2. 缺点：容易前拟合；分类精度可能不高

1. Decision Tree
1. 优点：计算复杂度低；模型极易于解释；对中间值的缺失不敏感；可以处理不相关的特征；适用与数值型和标称类别型数据
2. 缺点：可能产生过度匹配和过拟合的问题

1. K-means聚类
1. 优点：容易实现
2. 缺点：大规模数据集上收敛较慢，可能收敛到局部最少值，只适用数值型数据

1. Random Forest
1. 优点：性能相比其他模型较好；抗过拟合能力强
2. 缺点：黑盒难以解释

1. Naive Bayes

1. SVM

1. KNN