In [1]:
import os
import pandas as pd
import numpy as np
from string import digits
import requests
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Attention
import spacy
import gensim
import pprint

In [2]:
classes = os.listdir('./BBC News Summary/News Articles')
Articles_dir = './BBC News Summary/News Articles/'
Summaries_dir = './BBC News Summary/Summaries/'

articles = []
summaries = []
file_arr = []
for cls in classes:
    files = os.listdir(Articles_dir + cls)
    for file in files:
        article_file_path = Articles_dir + cls + '/' + file
        summary_file_path = Summaries_dir + cls + '/' + file
        try:
            with open(article_file_path, 'r') as f:
                articles.append('.'.join(
                    [line.rstrip() for line in f.readlines()]))
            with open(summary_file_path, 'r') as f:
                summaries.append('.'.join(
                    [line.rstrip() for line in f.readlines()]))
            file_arr.append(cls + '/' + file)
        except:
            pass

data = pd.DataFrame({
    'File_path': file_arr,
    'Articles': articles,
    'Summaries': summaries
})
data.head()

Unnamed: 0,File_path,Articles,Summaries
0,business/001.txt,Ad sales boost Time Warner profit..Quarterly p...,TimeWarner said fourth quarter sales rose 2% t...
1,business/002.txt,Dollar gains on Greenspan speech..The dollar h...,The dollar has hit its highest level against t...
2,business/003.txt,Yukos unit buyer faces loan claim..The owners ...,Yukos' owner Menatep Group says it will ask Ro...
3,business/004.txt,High fuel prices hit BA's profits..British Air...,"Rod Eddington, BA's chief executive, said the ..."
4,business/005.txt,Pernod takeover talk lifts Domecq..Shares in U...,Pernod has reduced the debt it took on to fund...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   File_path  2225 non-null   object
 1   Articles   2225 non-null   object
 2   Summaries  2225 non-null   object
dtypes: object(3)
memory usage: 52.3+ KB


In [4]:
data.describe()

Unnamed: 0,File_path,Articles,Summaries
count,2225,2225,2225
unique,2225,2127,2081
top,business/001.txt,Ray DVD beats box office takings..Oscar-nomina...,Although the two partially-paralysed people pe...
freq,1,2,2


In [5]:
# data.to_csv('news.csv')

In [6]:
data.shape

(2225, 3)

In [7]:
contractions_dictionary = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}


In [8]:
def Filter(text):
    text = text.lower()
    text = ' '.join([
        contractions_dictionary[i]
        if i in contractions_dictionary.keys() else i for i in text.split()
    ])
    text = re.sub(r'\(.*\)', "", text)
    text = re.sub("'s", "", text)
    text = re.sub('"', '', text)
    text = ' '.join([i for i in text.split() if i.isalpha()])
    text = re.sub('[^a-zA-Z]', " ", text)

    return text


data['File_path'] = data['File_path'].apply(Filter)
data['Articles'] = data['Articles'].apply(Filter)
data['Summaries'] = data['Summaries'].apply(Filter)

In [9]:
# data = data.drop(['File_path'], axis=1)
data

Unnamed: 0,File_path,Articles,Summaries
0,,ad sales boost time warner profits at us media...,timewarner said fourth quarter sales rose to f...
1,,dollar gains on greenspan dollar has hit its h...,the dollar has hit its highest level against t...
2,,yukos unit buyer faces loan owners of embattle...,owner menatep group says it will ask rosneft t...
3,,high fuel prices hit ba airways has blamed hig...,rod ba chief said the results were respectable...
4,,pernod takeover talk lifts in uk drinks and fo...,pernod has reduced the debt it took on to fund...
...,...,...,...
2220,,bt program to beat dialler is introducing two ...,bt is introducing two initiatives to help beat...
2221,,spam tempt net users across the world continue...,a third of them read unsolicited junk and buy ...
2222,,be careful how you new european directive coul...,this goes to the heart of the european and eve...
2223,,us cyber security chief man making sure us com...,amit yoran was director of the national cyber ...


In [10]:
# Remove HTML tags from the data frame if they are present


def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


data['Summary'] = data['Summaries'].apply(strip_html)
data['Article'] = data['Articles'].apply(strip_html)

data = data.drop(['Summaries', 'Articles'], axis=1)


In [11]:
data

Unnamed: 0,File_path,Summary,Article
0,,timewarner said fourth quarter sales rose to f...,ad sales boost time warner profits at us media...
1,,the dollar has hit its highest level against t...,dollar gains on greenspan dollar has hit its h...
2,,owner menatep group says it will ask rosneft t...,yukos unit buyer faces loan owners of embattle...
3,,rod ba chief said the results were respectable...,high fuel prices hit ba airways has blamed hig...
4,,pernod has reduced the debt it took on to fund...,pernod takeover talk lifts in uk drinks and fo...
...,...,...,...
2220,,bt is introducing two initiatives to help beat...,bt program to beat dialler is introducing two ...
2221,,a third of them read unsolicited junk and buy ...,spam tempt net users across the world continue...
2222,,this goes to the heart of the european and eve...,be careful how you new european directive coul...
2223,,amit yoran was director of the national cyber ...,us cyber security chief man making sure us com...


In [12]:
# Remove Stop Words from the Summary


def remove_stopword(text):
    stopword = nltk.corpus.stopwords.words('english')
    stopword.remove('not')
    a = [w for w in nltk.word_tokenize(text) if w not in stopword]
    return ' '.join(a)


data['Summary'] = data['Summary'].apply(remove_stopword)
data['Article'] = data['Article'].apply(remove_stopword)

In [13]:
def punc_clean(text):
    import string as st
    a = [w for w in text if w not in st.punctuation]
    return ''.join(a)


data['Summary'] = data['Summary'].apply(punc_clean)
data['Article'] = data['Article'].apply(punc_clean)

In [14]:
data = data.Article.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [15]:
def sent_to_words(sentences, deacc=True): 
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
data_words = list(sent_to_words(data))     

print(data_words[:1])

[['ad', 'sales', 'boost', 'time', 'warner', 'profits', 'us', 'media', 'giant', 'timewarner', 'jumped', 'close', 'warner', 'fourth', 'quarter', 'profits', 'slightly', 'better', 'film', 'division', 'saw', 'profits', 'slump', 'helped', 'flops', 'alexander', 'sharp', 'contrast', 'third', 'final', 'film', 'lord', 'rings', 'trilogy', 'boosted', 'timewarner', 'posted', 'profit', 'revenues', 'grew', 'financial', 'performance', 'meeting', 'exceeding', 'objectives', 'greatly', 'enhancing', 'chairman', 'chief', 'executive', 'richard', 'parsons', 'timewarner', 'projecting', 'operating', 'earnings', 'growth', 'around', 'also', 'expects', 'higher', 'revenue', 'wider', 'profit', 'restate', 'accounts', 'part', 'efforts', 'resolve', 'inquiry', 'aol', 'us', 'market', 'already', 'offered', 'pay', 'settle', 'deal', 'review', 'company', 'said', 'unable', 'estimate', 'amount', 'needed', 'set', 'aside', 'legal', 'previously', 'set', 'intends', 'adjust', 'way', 'accounts', 'deal', 'german', 'music', 'publishe

In [16]:
#lemmatize
def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    texts_out=[]
    for sent in texts:
        doc=nlp(' '.join(sent))
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

['ad sale boost time profit medium giant timewarner jump close warner fourth quarter profit slightly well film division see profit slump help flop alexander sharp contrast third final film trilogy boost timewarner post profit revenue grow financial performance meeting exceed objective greatly enhance chairman chief executive timewarner project operate earning growth around also expect high revenue wide profit restate account part effort resolve inquiry market already offer pay settle deal review company say unable estimate amount need set aside legal previously set intend adjust way account deal german music publisher bertelsmann purchase stake report advertising book sale stake loss value', 'dollar gain dollar hit high level euro almost month say trade deficit set highlight government willingness curb spend rise household saving factor help reduce late trading new dollar reach market concern deficit hit greenback recent ahead meeting finance minister send dollar higher early tumble ba

In [17]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum read occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [18]:
lda_model = LatentDirichletAllocation(n_components=5,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every=-1,       # compute perplexity every n iters, default: Don't
                                      n_jobs=-1,               # Use all available CPUs
                                      learning_decay=0.9
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model) 

LatentDirichletAllocation(learning_decay=0.9, learning_method='online',
                          n_components=5, n_jobs=-1, random_state=100)


In [19]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

Log Likelihood:  -1580944.9147908678
Perplexity:  1158.9869047231173


In [20]:
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.57,0.29,0.09,0.0,0.05,0
Doc1,0.0,0.0,0.0,0.99,0.0,3
Doc2,0.0,0.0,0.29,0.08,0.63,4
Doc3,0.5,0.02,0.08,0.4,0.0,0
Doc4,0.0,0.51,0.13,0.35,0.0,1
Doc5,0.81,0.04,0.0,0.15,0.0,0
Doc6,0.66,0.0,0.0,0.33,0.0,0
Doc7,0.0,0.0,0.59,0.4,0.0,2
Doc8,0.56,0.09,0.35,0.0,0.0,0
Doc9,0.09,0.0,0.75,0.1,0.06,2


In [22]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

NameError: name 'pyLDAvis' is not defined