### Imports

In [1]:
# see https://ipython.readthedocs.io/en/stable/interactive/magics.html
%pylab inline

# sets backend to render higher res images
%config InlineBackend.figure_formats = ['retina']


# uses custom metis style sheet for notebooks
from IPython.core.display import HTML
HTML("""<link rel="stylesheet" href="https://soph.info/metis/nb.css" type="text/css"/>""")

#######################
#       imports       #
#######################
import pandas as pd
import seaborn as sns

# import sklearn
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle

Populating the interactive namespace from numpy and matplotlib


In [126]:
# gensim
import gensim

# keras
from keras.models import Sequential
from keras.layers import (Dense, Embedding, Reshape, Activation, 
                          SimpleRNN, LSTM, Convolution1D, 
                          MaxPooling1D, Dropout, Bidirectional)
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.datasets import imdb, reuters
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop


# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Using TensorFlow backend.


In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Cleaning df1

In [64]:
df1 = pd.read_csv('NewsArticles.csv',encoding='latin1')

In [65]:
df1 = df1[['article_id', 'publish_date', 'article_source_link', 'title',
       'subtitle', 'text']]

In [36]:
for row in df1['article_source_link']:
    df1['publication'] = row.split('/')[2].split('.')[0]

In [37]:
df1.head()

Unnamed: 0,article_id,publish_date,article_source_link,title,subtitle,text,publication
0,1,2017/2/7,http://abcnews.go.com/Politics/pence-break-tie...,"Betsy DeVos Confirmed as Education Secretary, ...",,Michigan billionaire education activist Betsy ...,www
1,2,2017/2/7,http://abcnews.go.com/Politics/wireStory/melan...,Melania Trump Says White House Could Mean Mill...,,First lady Melania Trump has said little about...,www
2,3,2017/2/7,http://abcnews.go.com/Politics/wireStory/trump...,"As Trump Fears Fraud, GOP Eliminates Election ...",,A House committee voted on Tuesday to eliminat...,www
3,4,2017/2/7,http://abcnews.go.com/Politics/appeals-court-d...,Appeals Court to Decide on Challenge to Trump'...,,"This afternoon, three federal judges from the ...",www
4,5,2017/2/7,http://abcnews.go.com/US/23-states-winter-weat...,At Least 4 Tornadoes Reported in Southeast Lou...,,At least four tornadoes touched down in Louisi...,www


In [38]:
df1['article_source_link'][1].split('/')[2].split('.')[0]

'abcnews'

In [66]:
df1['pub'] = df1['article_source_link'].str.split('//').str[1]

In [67]:
df1['pub2'] = df1['pub'].str.split('.').str[0]

In [69]:
df1.drop(['pub'],axis=1,inplace=True)

In [70]:
df1.rename({'pub2':'publication','publish_date':'date','article_source_link':'url','text':'content'},axis=1,inplace=True)

In [71]:
df1.drop(['article_id'],axis=1,inplace=True)

In [72]:
df1['title1'] = df1[['title','subtitle']].astype(str).apply(lambda x:' '.join(x),axis=1)

In [74]:
df1.drop(['title','subtitle'],axis=1,inplace=True)

In [75]:
df1.rename({'title1':'title'},axis=1,inplace=True)

In [76]:
df1.head()

Unnamed: 0,date,url,content,publication,title
0,2017/2/7,http://abcnews.go.com/Politics/pence-break-tie...,Michigan billionaire education activist Betsy ...,abcnews,"Betsy DeVos Confirmed as Education Secretary, ..."
1,2017/2/7,http://abcnews.go.com/Politics/wireStory/melan...,First lady Melania Trump has said little about...,abcnews,Melania Trump Says White House Could Mean Mill...
2,2017/2/7,http://abcnews.go.com/Politics/wireStory/trump...,A House committee voted on Tuesday to eliminat...,abcnews,"As Trump Fears Fraud, GOP Eliminates Election ..."
3,2017/2/7,http://abcnews.go.com/Politics/appeals-court-d...,"This afternoon, three federal judges from the ...",abcnews,Appeals Court to Decide on Challenge to Trump'...
4,2017/2/7,http://abcnews.go.com/US/23-states-winter-weat...,At least four tornadoes touched down in Louisi...,abcnews,At Least 4 Tornadoes Reported in Southeast Lou...


### Cleaning the rest if dfs

In [11]:
df2 = pd.read_csv('articles1.csv')

In [77]:
df3 = pd.read_csv('articles2.csv')

In [78]:
df4 = pd.read_csv('articles3.csv')

In [79]:
df_pre = pd.concat([df2,df3,df4],axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [80]:
df_pre.head()

Unnamed: 0.1,Unnamed: 0,author,content,date,id,month,publication,title,url,year
0,,,WASHINGTON — Congressional Republicans have...,2016-12-31,,,New York Times,House Republicans Fret About Winning Their Hea...,,
1,,,"After the bullet shells get counted, the blood...",2017-06-19,,,New York Times,Rift Between Officers and Residents as Killing...,,
2,,,"When Walt Disney’s “Bambi” opened in 1942, cri...",2017-01-06,,,New York Times,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",,
3,,,"Death may be the great equalizer, but it isn’t...",2017-04-10,,,New York Times,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",,
4,,,"SEOUL, South Korea — North Korea’s leader, ...",2017-01-02,,,New York Times,Kim Jong-un Says North Korea Is Preparing to T...,,


In [81]:
df_pre.drop(['Unnamed: 0', 'id','author','year', 'month'],axis=1,inplace=True)

In [82]:
df_pre.head()

Unnamed: 0,content,date,publication,title,url
0,WASHINGTON — Congressional Republicans have...,2016-12-31,New York Times,House Republicans Fret About Winning Their Hea...,
1,"After the bullet shells get counted, the blood...",2017-06-19,New York Times,Rift Between Officers and Residents as Killing...,
2,"When Walt Disney’s “Bambi” opened in 1942, cri...",2017-01-06,New York Times,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",
3,"Death may be the great equalizer, but it isn’t...",2017-04-10,New York Times,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",
4,"SEOUL, South Korea — North Korea’s leader, ...",2017-01-02,New York Times,Kim Jong-un Says North Korea Is Preparing to T...,


### Full df

In [83]:
df = pd.concat([df1,df_pre],axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146394 entries, 0 to 42570
Data columns (total 5 columns):
content        146361 non-null object
date           143753 non-null object
publication    146394 non-null object
title          146392 non-null object
url            89383 non-null object
dtypes: object(5)
memory usage: 6.7+ MB


In [96]:
df['title+content'] = df[['title','content']].astype(str).apply(lambda x:' '.join(x),axis=1)

In [90]:
df['publication'].value_counts()

Breitbart              23781
New York Post          17493
CNN                    13993
NPR                    11992
Washington Post        11114
Reuters                10710
Guardian                8681
New York Times          7803
Atlantic                7179
Business Insider        6757
National Review         6203
Talking Points Memo     5214
Vox                     4947
Buzzfeed News           4854
Fox News                4354
tass                     485
abcnews                  474
europe                   360
Name: publication, dtype: int64

In [89]:
df['publication'] = df['publication'].replace({'www':'CNN'})

In [100]:
df[['content','publication','title+content']] = df[['content','publication','title+content']].astype('str')

In [115]:
df['content'] = df['content'].astype('str')

In [121]:
df['title'] = df['title'].astype('str')

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146394 entries, 0 to 42570
Data columns (total 6 columns):
content          146394 non-null object
date             143753 non-null object
publication      146394 non-null object
title            146392 non-null object
url              146394 non-null object
title+content    146394 non-null object
dtypes: object(6)
memory usage: 7.8+ MB


In [105]:
df['url'] = df['url'].fillna('No url')

### Processing text data 

In [122]:
import re 
import string 

In [123]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

In [118]:
df['content'] = df['content'].map(alphanumeric).map(punc_lower)

In [124]:
df['title'] = df['title'].map(alphanumeric).map(punc_lower)

In [125]:
df['title+content'] = df['title+content'].map(alphanumeric).map(punc_lower)

### LSTM Modeling 

In [133]:
from keras.preprocessing.sequence import pad_sequences

In [136]:
from sklearn.model_selection import train_test_split

In [147]:
from keras.layers import SpatialDropout1D
from keras.callbacks import EarlyStopping


In [188]:
# The maximum number of words to be used. (most frequent)
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
MAX_NB_WORDS = 20000


In [189]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS, filters="""!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',""")
tokenizer.fit_on_texts(df['title+content'].values)
word_index = tokenizer.word_index

In [190]:
# Convert tokenized sentences to sequence format
X = tokenizer.texts_to_sequences(df['title+content'])
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

In [191]:
Y = pd.get_dummies(df['publication']).values

In [192]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(117115, 250) (117115, 18)
(29279, 250) (29279, 18)


In [196]:
model = Sequential()
model.add(Embedding(input_dim=MAX_NB_WORDS, output_dim=12, embeddings_initializer='glorot_uniform', input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))

model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(18, activation='softmax'))
model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 250, 12)           240000    
_________________________________________________________________
spatial_dropout1d_13 (Spatia (None, 250, 12)           0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 100)               45200     
_________________________________________________________________
dense_13 (Dense)             (None, 18)                1818      
Total params: 287,018
Trainable params: 287,018
Non-trainable params: 0
_________________________________________________________________


In [197]:
epochs = 1
batch_size = 64


model.fit(X_train, Y_train, batch_size=256, epochs=epochs, 
              validation_data=(X_test, Y_test),callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Train on 117115 samples, validate on 29279 samples
Epoch 1/1


<keras.callbacks.History at 0x1a60900450>