In [50]:
import  pandas as pd
import numpy as np
import tensorflow as tf
import re
import keras as kr
from keras.models import Model
from collections import Counter
from tensorflow.contrib import learn
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D,Input, concatenate
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape
import nltk

In [51]:
#读取数据
dir_1 = r'New_Data.csv'

dir_2 = r'all_data_new_1.csv'

df = pd.read_csv(dir_1)
df2 = pd.read_csv(dir_2)
df['text'] = df['content']
df['type'] = df['label']
df = df[['text','title','type']]
print(df.shape)
df.head()

(439, 3)


Unnamed: 0,text,title,type
0,"Inspiring women take center stage in ''Women, ...","Inspiring women take center stage in ''Women, ...",1
1,"Imagine Europe as a target on a radar screen, ...",Europe: Prey or Power?,1
2,A rocket launched from Gaza struck a house in ...,Netanyahu Cuts Short U.S. Trip After Gaza Rock...,1
3,"Before ''Southern Promises'' begins, the actor...",Scratching the Surface of Slavery,1
4,Four University of Georgia students were expel...,Pick My Cotton': Video of Mock Whipping Prompt...,1


In [52]:
df['type'].unique()

array([1, 0])

In [53]:
#转换数据类型
df['type'] = df['type'].astype(int)
df2['type'] = df2['type'].astype(int)
print(df['type'][:5])

0    1
1    1
2    1
3    1
4    1
Name: type, dtype: int64


In [54]:
df['text'][1]



**data preprocessing**

In [55]:
#去除标点等无用的符号以及停用词
def cleanlines(line,remove_stopwords=False):   
    p1=re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2=re.compile(r'[(][: @ . , ？！\s][)]')
    p3=re.compile(r'[「『]')
    p4=re.compile(r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”！，。？、~@#￥%……&*（）0-9 , : ; \-\ \[\ \]\ ]')
    p5=re.compile(r'[^a-zA-Z]')
    line=p1.sub(r' ',line)
    line=p2.sub(r' ',line)
    line=p3.sub(r' ',line)
    line=p4.sub(r' ',line)
    line=p5.sub(r' ',line)
    words = line.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words

In [56]:
#添加停用词表
eng_stopwords = [line.rstrip() for line in open('stop_words.txt')]
eng_stopwords[:5]

["'d", "'ll", "'m", "'re", "'s"]

In [57]:
#处理内容文本
df['text'] = df['text'].astype(str)
sents = df['text']
text_words=[cleanlines(sent,remove_stopwords=True) for sent in sents]

In [58]:
len(text_words)
print(text_words[:5])



In [59]:
#处理标题文本
df['title'] = df['title'].astype(str)
title_sents = df['title']
title_words=[cleanlines(sent,remove_stopwords=True) for sent in title_sents]

In [60]:
#讲内容和标题连接在一起
def concat_text(text_words,title_words):
    concat_text = []
    for i in range(len(text_words)):
        temp = text_words[i]+title_words[i]
        concat_text.append(temp)
    return concat_text

concat_text = concat_text(text_words,title_words)
        

In [61]:

len(concat_text)

439

*build dict*

In [62]:
# set parameters:
#字典大小
vocab_size = 5000 #dict size
batch_size = 200 #batch size for each training
sequence_length = 300 #sequence length -same for every sample
embedding_dims=128 #embed dim

In [63]:
#build dic
def build_vocab(content_list,vocab_size):
    vocab_size = vocab_size
    all_vocabs = []
    for content in content_list:
        all_vocabs.extend(list(content))
    #{word:counts}
    counter = Counter(all_vocabs)
    #take the most common top 4999
    count_pairs = counter.most_common(vocab_size - 1)  #return pairs
    words, _ = list(zip(*count_pairs))    #ignore the counts
    words = ['<PAD>'] + list(words)  #防止样本数太小 填充为0 
    word_to_id = dict(zip(words, range(len(words))))  #change pos to 编号 number
    return words, word_to_id

In [64]:
words, word_to_id = build_vocab(concat_text,vocab_size)
print(word_to_id)



In [65]:
print(len(word_to_id))

5000


In [66]:
#padding the samples sequence length = 300
def get_pad_text(text_words, word_to_id, max_length=100):   
    data_id=[]
    for x in text_words:
        single_text=[]
        for i in x:
            if i in words:
                a=word_to_id[i]
                single_text.append(a)
        data_id.append(single_text)
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    return np.array(x_pad)

X = get_pad_text(concat_text,word_to_id,max_length=sequence_length)

In [67]:
X.shape
print(X)

[[   0    0    0 ... 1486  475   41]
 [3586  788  402 ...  191  402  170]
 [   0    0    0 ... 3144 4192 4193]
 ...
 [  18   21  691 ... 1894  252   56]
 [   0    0    0 ...    5 2925   79]
 [ 264   71  390 ...  210  252   56]]


**划分训练集和线下验证集**

In [68]:
def train_dev_split(X,y):
    #每次的随机都保证一样
    np.random.seed(10)
    shuffle_indices = np.random.permutation(len(y))
    x_shuffled = X[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Split train/test set
    # 后10%作为验证集
    dev_sample_index = -1 * int(0.1 * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, word_to_id, x_dev, y_dev

In [72]:
y = np.array(df['type'])
from sklearn.model_selection import train_test_split

x_train,x_val,y_train,y_val = train_test_split(X,y,test_size=0.2,random_state=10,stratify=y)
print(y_val[:20])

[0 0 0 1 0 0 1 1 1 1 1 1 1 0 0 1 0 1 0 0]


**TextCNN 模型**

In [73]:
#Inputs:Input函数只需要填写列数
comment_seq = Input(shape=[sequence_length],name='x_seq')  #shape is (number of samples eg:10000,300) 
print(comment_seq.shape)
#embedding layers
embedding_layer = Embedding(len(word_to_id), embedding_dims)(comment_seq)   #填充层，Embedding(len(word_to_id), embedding_dims) 内置 5000(len word_to_id) 0....4999*128 random vector
# comment_seq 放入变成 10000*300*128
print(embedding_layer.shape)

(?, 300)
(?, 300, 128)


In [74]:
#在原来的基础上添加一个维度，相当于 类似图片数据(number sequence,r,g,b) 二维精度高 考虑了前后
'''Reshape Layer'''
reshape = Reshape(target_shape=(sequence_length, embedding_dims, 1), name='reshape')(embedding_layer) # chanels last
print(reshape.shape)
print("Builded Reshape Layer...")

(?, 300, 128, 1)
Builded Reshape Layer...


In [75]:
#使用三种filter进行过滤，将最后的结果flatten  text 只能上下移动 filter
nb_filters = 100   
filter_lengths = [3, 4, 5]   # 3*128 , 4*128, 5*128
flatten_ = []

for i in filter_lengths:
    model_internal = Sequential()
    model_internal.add(Convolution2D(nb_filters, (i, embedding_dims), activation="relu", name='conv2d_' + str(i), input_shape=(sequence_length, embedding_dims, 1)))
# chanels last,默认了strides=(1,1), padding='valid'
    model_internal.add(MaxPooling2D(pool_size=(sequence_length - i + 1, 1), name='maxpool2d_' + str(i)))
    model_internal.add(Flatten())   #flatten 去除刚刚加的最后的一纬
    flatten = model_internal(reshape)  
    #print(flatten.shape)
    flatten_.append(flatten)   #list of 三个矩阵

In [76]:
#Fully Connect Layer & Dropout Layer
dropout_rate = 0.5   
hidden_nodes = 256   
merge = concatenate(flatten_, axis=-1)    

fully_connect = Dense(hidden_nodes, activation='relu', name='fully_connect')(merge) 
dropout = Dropout(dropout_rate, name='dropout')(fully_connect)

print("Builded Fully Connect Layer & Dropout Layer...")

'''Projection Layer & Output Layer'''

output = Dense(1, activation='sigmoid', name='output')(dropout) # output layer

print("Builded  Output Layer...")

Builded Fully Connect Layer & Dropout Layer...
Builded  Output Layer...


In [77]:
model = Model([comment_seq],output)

In [78]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy']) #binary classification

In [79]:
from keras.callbacks import EarlyStopping,ModelCheckpoint

es = EarlyStopping(monitor='val_acc',patience=5)  #val patience times

filepath="weights.bests.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True,mode='max')
callbacks_list = [es,checkpoint]

In [80]:
model.fit(x_train, y_train,
          validation_split=0.1,
          batch_size=batch_size,   
          callbacks=[es],
          epochs=5,
          shuffle=True)

Train on 315 samples, validate on 36 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x10e16d6a0>

In [87]:
pred_y = model.predict(x_val)
print(len(pred_y))
predict = []
for i in range(0,len(pred_y)):
    predict.append(pred_y[i][0])
result = pd.DataFrame(columns=['predict', 'label'])
result['predict'] = predict
result['label'] = y_val
result
result.to_csv(r'result_new_2.csv' ,index=False)

88


In [88]:
pred_y = model.predict(x_val)
def Chang_(pred):
    results = []
    for i in range(len(pred)):
        if pred[i]>0.5:
            temp = 1
            results.append(temp)
        else:
            temp = 0
            results.append(temp)
    return results
        
results = Chang_(pred_y)

In [89]:

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

In [91]:
# Compute confusion matrix
con_matrix = confusion_matrix(results, y_val,labels=[0,1])  # 生成混淆矩阵
print(con_matrix)

[[ 0  0]
 [41 47]]


In [93]:
print('precision:',precision_score(y_val, results))
print('recall:',recall_score(y_val, results))
print('f1:',f1_score(y_val, results))

precision: 0.5340909090909091
recall: 1.0
f1: 0.6962962962962962


In [94]:
y_scores = pd.DataFrame(pred_y)[0].values
print(y_scores[:2])

[0.562688  0.5724812]


In [95]:
y_scores = pd.DataFrame(results)[0].values
print(y_scores)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [98]:
auc_value = roc_auc_score(y_val, y_scores)
print(auc_value)

0.5


In [47]:
pred_y = model.predict(x_dev)
predict = []
for i in range(0,len(pred_y)):
    predict.append(pred_y[i][0])
result = pd.DataFrame(columns=['predict', 'label'])
result['predict'] = predict
result['label'] = y_dev
result
result.to_csv(r'result_new.csv' ,index=False)

In [56]:
df2['text'] = df2['text'].astype(str)
sents = df2['text']
text_words=[cleanlines(sent,remove_stopwords=True) for sent in sents]

In [57]:
df2['title'] = df2['title'].astype(str)
title_sents = df2['title']
title_words=[cleanlines(sent,remove_stopwords=True) for sent in title_sents]

In [58]:
#讲内容和标题连接在一起
def concat_text(text_words,title_words):
    concat_text = []
    for i in range(len(text_words)):
        temp = text_words[i]+title_words[i]
        concat_text.append(temp)
    return concat_text

concat_text = concat_text(text_words,title_words)


In [60]:
len(concat_text)

16251

In [61]:
vocab_size = 5000 #dict size
batch_size = 200 #batch size for each training
sequence_length = 300 #sequence length -same for every sample
embedding_dims=128 #embed dim

In [62]:
#build dic
def build_vocab(content_list,vocab_size):
    vocab_size = vocab_size
    all_vocabs = []
    for content in content_list:
        all_vocabs.extend(list(content))
    #{word:counts}
    counter = Counter(all_vocabs)
    #take the most common top 4999
    count_pairs = counter.most_common(vocab_size - 1)  #return pairs
    words, _ = list(zip(*count_pairs))    #ignore the counts
    words = ['<PAD>'] + list(words)  #防止样本数太小 填充为0 
    word_to_id = dict(zip(words, range(len(words))))  #change pos to 编号 number
    return words, word_to_id

In [63]:
words, word_to_id = build_vocab(concat_text,vocab_size)
print(word_to_id)



In [64]:
#padding the samples sequence length = 300
def get_pad_text(text_words, word_to_id, max_length=100):   
    data_id=[]
    for x in text_words:
        single_text=[]
        for i in x:
            if i in words:
                a=word_to_id[i]
                single_text.append(a)
        data_id.append(single_text)
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    return np.array(x_pad)

X = get_pad_text(concat_text,word_to_id,max_length=sequence_length)

In [65]:
def train_dev_split(X,y):
    #每次的随机都保证一样
    np.random.seed(10)
    shuffle_indices = np.random.permutation(len(y))
    x_shuffled = X[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Split train/test set
    # 后10%作为验证集
    dev_sample_index = -1 * int(0.1 * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, word_to_id, x_dev, y_dev

In [66]:
y = np.array(df2['type'])
x_train, y_train, word_to_id, x_dev, y_dev = train_dev_split(X,y)

Train/Dev split: 14626/1625


In [72]:
pred_y = model.predict(x_dev)
pred_y = model.predict(x_dev)
predict = []
for i in range(0,len(pred_y)):
    predict.append(pred_y[i][0])
result = pd.DataFrame(columns=['predict', 'label'])
result['predict'] = predict
result['label'] = y_dev
result
result.to_csv(r'result_new2.csv' ,index=False)

In [68]:
def Chang_(pred):
    results = []
    for i in range(len(pred)):
        if pred[i]>0.5:
            temp = 1
            results.append(temp)
        else:
            temp = 0
            results.append(temp)
    return results
        
results = Chang_(pred_y)


In [69]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

In [70]:
# Compute confusion matrix
con_matrix = confusion_matrix(results, y_dev,labels=[0,1])  # 生成混淆矩阵
print(con_matrix)

[[   0    0]
 [1154  471]]


In [71]:
print('precision:',precision_score(y_dev, results))
print('recall:',recall_score(y_dev, results))
print('f1:',f1_score(y_dev, results))

precision: 0.28984615384615386
recall: 1.0
f1: 0.4494274809160306


In [30]:
def Chang_(pred):
    results = []
    for i in range(len(pred)):
        if pred[i]>0.5:
            temp = 1
            results.append(temp)
        else:
            temp = 0
            results.append(temp)
    return results
        
results = Chang_(pred_y)

In [137]:
len(results)

43

In [138]:
len(y_dev)

43

In [139]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

In [140]:
# Compute confusion matrix
con_matrix = confusion_matrix(results, y_dev,labels=[0,1])  # 生成混淆矩阵
print(con_matrix)

[[ 0  0]
 [16 27]]


In [141]:
print('precision:',precision_score(y_dev, results))
print('recall:',recall_score(y_dev, results))
print('f1:',f1_score(y_dev, results))

precision: 0.627906976744186
recall: 1.0
f1: 0.7714285714285715


In [47]:
text_new = 'WASHINGTON — The Trump administration pushed a $1.5 trillion tax cut through Congress in 2017 on the promise that it would spark sustained economic growth. While the tax cuts have goosed the economy in the short term, officials now concede they will not be enough to deliver the 3 percent annual growth the president promised over the long term.To produce that average growth rate for the next decade, White House forecasters say, the American economy would need additional rollbacks in labor regulations, a $1 trillion infrastructure plan and another round of tax cuts.Getting all those policies implemented would be highly unlikely, given a divided Congress and a ballooning federal deficit, which could limit lawmakers’ appetite to spend money on a new tax cut or infrastructure plan.But without those additional steps, the president’s economic team predicts in a report released on Tuesday that growth would slow to about 2 percent a year in 2026. That is the year when many of the individual tax cuts included in the 2017 law are set to expire, essentially producing a tax increase for millions of Americans.'

In [48]:
text_words_new=[cleanlines(text_new,remove_stopwords=True)]


In [62]:
print(text_words_new[0])

['washington', 'trump', 'administration', 'pushed', 'trillion', 'tax', 'cut', 'congress', 'promise', 'spark', 'sustained', 'economic', 'growth', 'tax', 'cuts', 'goosed', 'economy', 'short', 'term', 'officials', 'concede', 'deliver', 'percent', 'annual', 'growth', 'president', 'promised', 'term', 'produce', 'average', 'growth', 'rate', 'decade', 'white', 'house', 'forecasters', 'american', 'economy', 'additional', 'rollbacks', 'labor', 'regulations', 'trillion', 'infrastructure', 'plan', 'round', 'tax', 'cuts', 'policies', 'implemented', 'highly', 'divided', 'congress', 'ballooning', 'federal', 'deficit', 'limit', 'lawmakers', 'appetite', 'spend', 'money', 'tax', 'cut', 'infrastructure', 'plan', 'additional', 'steps', 'president', 'economic', 'team', 'predicts', 'report', 'released', 'tuesday', 'growth', 'slow', 'percent', 'individual', 'tax', 'cuts', 'included', 'law', 'set', 'expire', 'essentially', 'producing', 'tax', 'increase', 'millions', 'americans']


In [50]:
X_new = get_pad_text(text_words_new,word_to_id,max_length=sequence_length)
print(X_new.shape)

(1, 300)


In [51]:
X_new

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [52]:
model.predict(X_new)

array([[0.02069229]], dtype=float32)

In [74]:
dir_2 = r'New_Data.csv'
test_set = pd.read_csv(dir_2)
test_set = test_set[['title','content','label']]
test_set.head()

Unnamed: 0,title,content,label
0,"Inspiring women take center stage in ''Women, ...","Inspiring women take center stage in ''Women, ...",1
1,Europe: Prey or Power?,"Imagine Europe as a target on a radar screen, ...",1
2,Netanyahu Cuts Short U.S. Trip After Gaza Rock...,A rocket launched from Gaza struck a house in ...,1
3,Scratching the Surface of Slavery,"Before ''Southern Promises'' begins, the actor...",1
4,Pick My Cotton': Video of Mock Whipping Prompt...,Four University of Georgia students were expel...,1


In [76]:
test_set['label'] = test_set['label'].astype(int)


In [80]:
test_set['content'] = test_set['content'].astype(str)
sents = test_set['content']
text_words=[cleanlines(sent,remove_stopwords=True) for sent in sents]
print(text_words[:5])



In [81]:
test_set['title'] = test_set['title'].astype(str)
sents = test_set['title']
title_words=[cleanlines(sent,remove_stopwords=True) for sent in sents]
print(title_words[:5])

[['inspiring', 'women', 'center', 'stage', 'women', 'war', 'peace', 'ii', 'intruder', 'upends', 'couple', 'life', 'salesman'], ['europe', 'prey', 'power'], ['netanyahu', 'cuts', 'short', 'trip', 'gaza', 'rocket', 'strikes', 'tel', 'aviv'], ['scratching', 'surface', 'slavery'], ['pick', 'cotton', 'video', 'mock', 'whipping', 'prompts', 'fraternity', 'expel', 'students']]


In [86]:
def concat_text(text_words,title_words):
    concat_text = []
    for i in range(len(text_words)):
        temp = text_words[i]+title_words[i]
        concat_text.append(temp)
    return concat_text
concat_text = concat_text(text_words,title_words)

In [87]:
X = get_pad_text(concat_text,word_to_id,max_length=sequence_length)

In [88]:
y = np.array(test_set['label'])


In [89]:
pred_y = model.predict(X)

In [92]:
def Chang_(pred):
    results = []
    for i in range(len(pred)):
        if pred[i]>0.5:
            temp = 1
            results.append(temp)
        else:
            temp = 0
            results.append(temp)
    return results
        
results = Chang_(pred_y)

In [93]:
con_matrix = confusion_matrix(results, y,labels=[0,1])  # 生成混淆矩阵
print(con_matrix)

[[203 225]
 [  0  11]]


In [96]:
print('precision:',precision_score(y, results))
print('recall:',recall_score(y, results))
print('f1:',f1_score(y, results))

precision: 1.0
recall: 0.046610169491525424
f1: 0.08906882591093117
