In [42]:
import  pandas as pd
import numpy as np
import tensorflow as tf
import re
import keras as kr
from keras.models import Model
from collections import Counter
from tensorflow.contrib import learn
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D,Input, concatenate
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape
import nltk

In [43]:
#读取数据
dir_1 = r'all_data_new_1.csv'

df = pd.read_csv(dir_1)
print(df.shape)
df.head()

(16251, 3)


Unnamed: 0,text,title,type
0,"They stood in line at Trump Tower, sometimes u...","At Donald Trump<U+2019>s Properties, a Showcas...",1
1,Donald J. Trump <U+2019>s foundation informed ...,Trump Foundation Tells New York It Has Stopped...,1
2,President-elect Donald J. Trump won the White ...,"Donald Trump Prepares for White House Move, bu...",1
3,An investment pitch for a new Texas hotel is t...,Luring Chinese Investors With Trump<U+2019>s N...,1
4,President-elect Donald J. Trump <U+2019>s wife...,Melania and Barron Trump Won<U+2019>t Immediat...,1


In [44]:
df['type'].unique()

array([1, 0])

In [45]:
#转换数据类型
df['type'] = df['type'].astype(int)
print(df['type'][:5])

0    1
1    1
2    1
3    1
4    1
Name: type, dtype: int64


In [63]:
df['text'][1]

'Donald J. Trump <U+2019>s foundation informed Attorney General Eric T. Schneiderman of New York that it has ceased soliciting donations in the state, after a recent order to halt such fund-raising efforts, a spokesman for the attorney general<U+2019>s office said on Monday. The charity, the Donald J. Trump Foundation, also requested and was granted an extension for filing its financial paperwork, including audits, said Eric Soufer, the spokesman. The developments come about two weeks after the attorney general<U+2019>s office ordered the foundation to stop soliciting charitable contributions in the state of New York, saying that the nonprofit was not properly registered to do so under state law. A <U+201C>notice of violation<U+201D> letter sent to the foundation on Sept. 30 required it to submit financial documents within 15 days, or <U+201C>be deemed to be a continuing fraud upon the people of the state of New York.<U+201D> The foundation has become a source of controversy during Mr.

**data preprocessing**

In [47]:
#去除标点等无用的符号以及停用词
def cleanlines(line,remove_stopwords=False):   
    p1=re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2=re.compile(r'[(][: @ . , ？！\s][)]')
    p3=re.compile(r'[「『]')
    p4=re.compile(r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”！，。？、~@#￥%……&*（）0-9 , : ; \-\ \[\ \]\ ]')
    p5=re.compile(r'[^a-zA-Z]')
    line=p1.sub(r' ',line)
    line=p2.sub(r' ',line)
    line=p3.sub(r' ',line)
    line=p4.sub(r' ',line)
    line=p5.sub(r' ',line)
    words = line.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words

In [48]:
#添加停用词表
eng_stopwords = [line.rstrip() for line in open('stop_words.txt')]
eng_stopwords[:5]

["'d", "'ll", "'m", "'re", "'s"]

In [49]:
#处理内容文本
df['text'] = df['text'].astype(str)
sents = df['text']
text_words=[cleanlines(sent,remove_stopwords=True) for sent in sents]

In [50]:
len(text_words)
print(text_words[:5])

[['stood', 'trump', 'tower', 'half', 'hour', 'handing', 'cash', 'mementos', 'president', 'elect', 'mini', 'gold', 'wrapped', 'chocolate', 'bricks', 'stamped', 'trump', 'trump', 'monogrammed', 'sweaters', 'towels', 'glassware', 'trump', 'cologne', 'bought', 'sons', 'shanon', 'loggins', 'lufkin', 'tex', 'golden', 'shopping', 'bag', 'embossed', 'trump', 'crest', 'carried', 'bottles', 'success', 'trump', 'fragrance', 'successful', 'explained', 'business', 'donald', 'trump', 'people', 'flocking', 'midtown', 'manhattan', 'skyscraper', 'dining', 'restaurants', 'buying', 'wares', 'reporters', 'fastidiously', 'chronicling', 'comings', 'goings', 'transition', 'team', 'branded', 'properties', 'providing', 'backdrop', 'television', 'live', 'shots', 'trump', 'staid', 'task', 'preparing', 'assume', 'presidency', 'exercise', 'conspicuous', 'promotion', 'carefully', 'choreographed', 'branding', 'president', 'elect', 'vast', 'real', 'estate', 'holdings', 'raising', 'questions', 'exploiting', 'profile',

In [51]:
#处理标题文本
df['title'] = df['title'].astype(str)
title_sents = df['title']
title_words=[cleanlines(sent,remove_stopwords=True) for sent in title_sents]

In [52]:
#讲内容和标题连接在一起
def concat_text(text_words,title_words):
    concat_text = []
    for i in range(len(text_words)):
        temp = text_words[i]+title_words[i]
        concat_text.append(temp)
    return concat_text

concat_text = concat_text(text_words,title_words)
        

In [53]:
len(concat_text)

16251

In [None]:
dict = {'a': 1, 'b': 2, 'b': '3'}

*build dict*

In [14]:
# set parameters:
#字典大小
vocab_size = 5000 #dict size
batch_size = 200 #batch size for each training
sequence_length = 300 #sequence length -same for every sample
embedding_dims=128 #embed dim

In [54]:
#build dic
def build_vocab(content_list,vocab_size):
    vocab_size = vocab_size
    all_vocabs = []
    for content in content_list:
        all_vocabs.extend(list(content))
    #{word:counts}
    counter = Counter(all_vocabs)
    #take the most common top 4999
    count_pairs = counter.most_common(vocab_size - 1)  #return pairs
    print(count_pairs)
    words, _ = list(zip(*count_pairs))    #ignore the counts
    words = ['<PAD>'] + list(words)  #防止样本数太小 填充为0 
    word_to_id = dict(zip(words, range(len(words))))  #change pos to 编号 number
    return words, word_to_id

In [55]:
words, word_to_id = build_vocab(concat_text,vocab_size)
#print(word_to_id)



In [17]:
print(len(word_to_id))

5000


In [18]:
#padding the samples sequence length = 300
def get_pad_text(text_words, word_to_id, max_length=100):   
    data_id=[]
    for x in text_words:
        single_text=[]
        for i in x:
            if i in words:
                a=word_to_id[i]
                single_text.append(a)
        data_id.append(single_text)
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    return np.array(x_pad)

X = get_pad_text(concat_text,word_to_id,max_length=sequence_length)

In [19]:
print(X.shape)
print(X)

(16251, 300)
[[  65  823 3439 ... 1635    5  136]
 [   0    0    0 ... 1452   39 1505]
 [  56    1 2746 ...   23  177 1547]
 ...
 [   0    0    0 ...    0 1376 1667]
 [   0    0    0 ...  533  787 1667]
 [   0    0    0 ...    0    0 1667]]


**划分训练集和线下验证集**

In [20]:
def train_dev_split(X,y):
    #每次的随机都保证一样
    np.random.seed(10)
    shuffle_indices = np.random.permutation(len(y))
    x_shuffled = X[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Split train/test set
    # 后10%作为验证集
    dev_sample_index = -1 * int(0.1 * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, word_to_id, x_dev, y_dev

In [21]:
y = np.array(df['type'])
#x_train, y_train, word_to_id, x_dev, y_dev = train_dev_split(X,y)
from sklearn.model_selection import train_test_split

x_train,x_val,y_train,y_val = train_test_split(X,y,test_size=0.2,random_state=10,stratify=y)
print(y_val[:100])

[1 0 0 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0
 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 0 0 1 0 1 0]


**TextCNN 模型**

In [22]:
#Inputs:Input函数只需要填写列数
comment_seq = Input(shape=[sequence_length],name='x_seq')  #shape is (number of samples eg:10000,300) 
print(comment_seq.shape)
#embedding layers
embedding_layer = Embedding(len(word_to_id), embedding_dims)(comment_seq)   #填充层，Embedding(len(word_to_id), embedding_dims) 内置 5000(len word_to_id) 0....4999*128 random vector
# comment_seq 放入变成 10000*300*128
print(embedding_layer.shape)

(?, 300)
(?, 300, 128)


In [23]:
#在原来的基础上添加一个维度，相当于 类似图片数据(number sequence,r,g,b) 二维精度高 考虑了前后
'''Reshape Layer'''
reshape = Reshape(target_shape=(sequence_length, embedding_dims, 1), name='reshape')(embedding_layer) # chanels last
print(reshape.shape)
print("Builded Reshape Layer...")

(?, 300, 128, 1)
Builded Reshape Layer...


In [24]:
#使用三种filter进行过滤，将最后的结果flatten  text 只能上下移动 filter
nb_filters = 100   
filter_lengths = [3, 4, 5]   # 3*128 , 4*128, 5*128
flatten_ = []

for i in filter_lengths:
    model_internal = Sequential()
    model_internal.add(Convolution2D(nb_filters, (i, embedding_dims), activation="relu", name='conv2d_' + str(i), input_shape=(sequence_length, embedding_dims, 1)))
# chanels last,默认了strides=(1,1), padding='valid'
    model_internal.add(MaxPooling2D(pool_size=(sequence_length - i + 1, 1), name='maxpool2d_' + str(i)))
    model_internal.add(Flatten())   #flatten 去除刚刚加的最后的一纬
    flatten = model_internal(reshape)  
    #print(flatten.shape)
    flatten_.append(flatten)   #list of 三个矩阵

In [25]:
#Fully Connect Layer & Dropout Layer
dropout_rate = 0.5   
hidden_nodes = 256   
merge = concatenate(flatten_, axis=-1)    

fully_connect = Dense(hidden_nodes, activation='relu', name='fully_connect')(merge) 
dropout = Dropout(dropout_rate, name='dropout')(fully_connect)

print("Builded Fully Connect Layer & Dropout Layer...")

'''Projection Layer & Output Layer'''

output = Dense(1, activation='sigmoid', name='output')(dropout) # output layer

print("Builded  Output Layer...")

Builded Fully Connect Layer & Dropout Layer...
Builded  Output Layer...


In [26]:
model = Model([comment_seq],output)

In [27]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy']) #binary classification

In [28]:
from keras.callbacks import EarlyStopping,ModelCheckpoint

es = EarlyStopping(monitor='val_acc',patience=5)  #val patience times

filepath="weights.bests.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True,mode='max')
callbacks_list = [es,checkpoint]

In [29]:
model.fit(x_train, y_train,
          validation_split=0.1,
          batch_size=batch_size,   
          callbacks=[es],
          epochs=5,
          shuffle=True)

Train on 11700 samples, validate on 1300 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1c35116e48>

In [30]:
pred_y = model.predict(x_val)

In [35]:
predict = []
for i in range(0,len(pred_y)):
    predict.append(pred_y[i][0])
result = pd.DataFrame(columns=['predict', 'label'])
result['predict'] = predict
result['label'] = y_val
result
result.to_csv(r'result_new.csv' ,index=False)

In [31]:
def Chang_(pred):
    results = []
    for i in range(len(pred)):
        if pred[i]>0.5:
            temp = 1
            results.append(temp)
        else:
            temp = 0
            results.append(temp)
    return results
        
results = Chang_(pred_y)

In [32]:
len(results)

3251

In [33]:
len(y_val)

3251

In [34]:

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

In [35]:
# Compute confusion matrix
con_matrix = confusion_matrix(results, y_val,labels=[0,1])  # 生成混淆矩阵
print(con_matrix)

[[2266   21]
 [   4  960]]


In [36]:
print('precision:',precision_score(y_val, results))
print('recall:',recall_score(y_val, results))
print('f1:',f1_score(y_val, results))

precision: 0.995850622406639
recall: 0.9785932721712538
f1: 0.9871465295629821


In [49]:
y_scores = pd.DataFrame(pred_y)[0].values
print(y_scores[:2])

[9.9992692e-01 5.0349813e-06]


In [41]:
y_scores = pd.DataFrame(results)[0].values
print(y_scores)
auc_value = roc_auc_score(y_val, y_scores)
print(auc_value)

[1 0 0 ... 0 1 0]
0.9884155788169044


In [51]:
print(classification_report(y_val, results))
auc_value = roc_auc_score(y_val, y_scores)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2270
           1       1.00      0.97      0.99       981

   micro avg       0.99      0.99      0.99      3251
   macro avg       0.99      0.99      0.99      3251
weighted avg       0.99      0.99      0.99      3251



In [52]:
import matplotlib.pyplot as plt
%matplotlib inline

In [53]:
fpr, tpr, thresholds = roc_curve(y_dev, y_scores, pos_label=1.0)
plt.figure(figsize=(6,6))
lw = 2
plt.plot(fpr, tpr, color='darkorange', linewidth=lw, label='ROC curve (area = %0.4f)' % auc_value)
plt.plot([0, 1], [0, 1], color='navy', linewidth=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('CNN ROC')
plt.legend(loc="lower right")

NameError: name 'y_dev' is not defined

In [None]:
text_new = 'WASHINGTON — The Trump administration pushed a $1.5 trillion tax cut through Congress in 2017 on the promise that it would spark sustained economic growth. While the tax cuts have goosed the economy in the short term, officials now concede they will not be enough to deliver the 3 percent annual growth the president promised over the long term.To produce that average growth rate for the next decade, White House forecasters say, the American economy would need additional rollbacks in labor regulations, a $1 trillion infrastructure plan and another round of tax cuts.Getting all those policies implemented would be highly unlikely, given a divided Congress and a ballooning federal deficit, which could limit lawmakers’ appetite to spend money on a new tax cut or infrastructure plan.But without those additional steps, the president’s economic team predicts in a report released on Tuesday that growth would slow to about 2 percent a year in 2026. That is the year when many of the individual tax cuts included in the 2017 law are set to expire, essentially producing a tax increase for millions of Americans.'

In [None]:
text_words_new=[cleanlines(text_new,remove_stopwords=True)]


In [None]:
print(text_words_new[0])

In [None]:
X_new = get_pad_text(text_words_new,word_to_id,max_length=sequence_length)
print(X_new.shape)

In [None]:
X_new

In [None]:
model.predict(X_new)

In [None]:
dir_2 = r'New_Data.csv'
test_set = pd.read_csv(dir_2)
test_set = test_set[['title','content','label']]
test_set.head()

In [None]:
test_set['label'] = test_set['label'].astype(int)


In [None]:
test_set['content'] = test_set['content'].astype(str)
sents = test_set['content']
text_words=[cleanlines(sent,remove_stopwords=True) for sent in sents]
print(text_words[:5])

In [None]:
test_set['title'] = test_set['title'].astype(str)
sents = test_set['title']
title_words=[cleanlines(sent,remove_stopwords=True) for sent in sents]
print(title_words[:5])

In [None]:
def concat_text(text_words,title_words):
    concat_text = []
    for i in range(len(text_words)):
        temp = text_words[i]+title_words[i]
        concat_text.append(temp)
    return concat_text
concat_text = concat_text(text_words,title_words)

In [None]:
X = get_pad_text(concat_text,word_to_id,max_length=sequence_length)

In [None]:
y = np.array(test_set['label'])


In [None]:
pred_y = model.predict(X)
test_set['predict'] = pred_y
print(test_set.head())
test_set.to_csv(r'1.csv',header=True, index=False)


In [None]:
def Chang_(pred):
    results = []
    for i in range(len(pred)):
        if pred[i]>0.5:
            temp = 1
            results.append(temp)
        else:
            temp = 0
            results.append(temp)
    return results
        
results = Chang_(pred_y)

In [None]:
con_matrix = confusion_matrix(results, y,labels=[0,1])  # 生成混淆矩阵
print(con_matrix)

In [None]:
print('precision:',precision_score(y, results))
print('recall:',recall_score(y, results))
print('f1:',f1_score(y, results))