In [None]:
import math
import os
import random
import re
import time
from itertools import product, count

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from gensim import corpora, models
from googleapiclient.discovery import build

%matplotlib inline

In [None]:
def get_urls():
    """Collect result links from a Google Custom Search Engine query.

    Prompts (in this order) for the API key, the number of API calls to
    make, the search query and the CSE id, then requests one result page
    per call and gathers every ``link`` field it finds.

    Returns:
        set: The distinct URLs found across all fetched result pages.

    Raises:
        ValueError: If the number of calls entered is not an integer.
    """
    api_key = input('API Key:')
    calls = int(input('Number of calls:'))
    search_query = input('Search query:')
    cse_id = input('Google CSE ID:')

    page_size = 10   # each call advances the start index by one page of 10
    max_start = 91   # the API rejects requests reaching past the 100th result

    service = build('customsearch', 'v1', developerKey=api_key).cse()

    urls = set()
    for call_no in range(calls):
        start_ = 1 + call_no * page_size
        if start_ > max_start:
            break

        # Pause between calls only; there is nothing to wait for after the last.
        if call_no:
            time.sleep(random.uniform(1, 2))

        result = service.list(q=search_query, cx=cse_id, start=start_).execute()

        # A page without results carries no 'items' key at all.
        items = result.get('items', [])
        if not items:
            break
        urls.update(item['link'] for item in items)

    return urls
        

In [None]:
def get_article_text(links):
    """Download each link and save its article text to a text file.

    Prompts for the HTML tag/class pair that wraps the article body, the
    tag/class pair that wraps the publication date, and the output folder.
    Every page where both elements are found is written to
    ``./<folder>/<date>.txt``. Pages that cannot be fetched, or that lack
    either element, are reported and skipped.

    Args:
        links: Iterable of URLs to download.

    Raises:
        ValueError: If a tag/class prompt is not answered with exactly two
            whitespace-separated words.
    """
    article_tag, article_class = input('Enter article html tag and class separated by a space:').split()
    date_tag, date_class = input('Enter date html tag and class separated by a space:').split()

    directory_name = input('Enter folder name. Folder must be in the same directory as py script:')

    output_dir = os.path.join('.', directory_name)
    os.makedirs(output_dir, exist_ok=True)

    # Counts how often each file name was used in this run so that several
    # articles sharing a date do not overwrite one another.
    names_used = {}

    for line in links:
        try:
            response = requests.get(line, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f'Skipping {line}: {exc}')
            continue

        soup = BeautifulSoup(response.text, 'lxml')
        article_node = soup.find(article_tag, class_=article_class)
        date_node = soup.find(date_tag, class_=date_class)

        if article_node is None or date_node is None:
            print(f'Skipping {line}: article or date element not found')
        else:
            # The date text comes straight from the page; reduce it to
            # characters that are safe inside a file name.
            base_name = re.sub(r'[^\w.-]+', '_', date_node.text.strip()).strip('_') or 'undated'
            names_used[base_name] = names_used.get(base_name, 0) + 1
            if names_used[base_name] > 1:
                base_name = f'{base_name}_{names_used[base_name]}'

            with open(os.path.join(output_dir, f'{base_name}.txt'), 'w', encoding='utf-8') as f:
                f.write(article_node.text)

        time.sleep(random.uniform(1, 2))

In [None]:
def preprocessing(keyword):
    """Load, filter, tokenize, de-stopword and lemmatize the saved articles.

    Prompts for a folder, reads every regular file in it and keeps only the
    documents that contain ``keyword`` (case-insensitive). Each kept
    document is split into ``\\w+`` tokens, English stopwords are removed
    (compared in lower case) and the remaining tokens are lemmatized with
    the WordNet lemmatizer.

    Args:
        keyword: Text that a document must contain to be kept.

    Returns:
        list[list[str]]: One list of lemmatized tokens per kept document,
        in sorted file-name order.
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    stopwords = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    path = input('Enter path to text files:')
    needle = keyword.lower()

    texts = []
    # Sorted so the document order does not depend on the file system.
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        if not os.path.isfile(file_path):
            continue  # sub-directories cannot be opened as text
        # errors='replace' keeps files written in another encoding readable.
        with open(file_path, encoding='utf-8', errors='replace') as f:
            content = f.read()
        if needle in content.lower():
            texts.append(content)

    tokens = [tokenizer.tokenize(document) for document in texts]

    stopwords_removed = [[word for word in token if word.lower() not in stopwords] for token in tokens]

    lemmatized_text = [[lemmatizer.lemmatize(word) for word in words] for words in stopwords_removed]

    return lemmatized_text

In [None]:
def compute_cv(lemmatized_text, start=2, limit=11, step=1, random_state=None):
    """Train one LDA model per candidate topic count and score its coherence.

    A dictionary and a TF-IDF weighted bag-of-words corpus are built from
    ``lemmatized_text``. For every topic count in
    ``range(start, limit, step)`` an LDA model is trained on that corpus and
    its ``c_v`` coherence is computed. The coherence values are then plotted
    against the topic counts.

    Args:
        lemmatized_text: Tokenized documents, one list of tokens each.
        start: First topic count to try.
        limit: Exclusive upper bound for the topic count.
        step: Increment between candidate topic counts.
        random_state: Passed to ``LdaModel``; set it to make runs repeatable.

    Returns:
        tuple: ``(model_list, coherence_values)``, two lists aligned with
        ``range(start, limit, step)``.

    Raises:
        ValueError: If the topic range is empty or contains a value below 1,
            or if ``lemmatized_text`` is empty.
    """
    topics = list(range(start, limit, step))
    if not topics or min(topics) < 1:
        raise ValueError(
            f'range({start}, {limit}, {step}) yields no valid topic counts')

    # Materialize once: the documents are iterated several times below.
    lemmatized_text = list(lemmatized_text)
    if not lemmatized_text:
        raise ValueError('lemmatized_text is empty; there is nothing to model')

    coherence_values = []
    model_list = []

    dictionary = corpora.Dictionary(lemmatized_text)
    corpus = [dictionary.doc2bow(text) for text in lemmatized_text]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    for topic in topics:
        lda_model = models.LdaModel(corpus_tfidf,
                                    id2word=dictionary,
                                    num_topics=topic,
                                    random_state=random_state)

        coherence_model = models.CoherenceModel(model=lda_model,
                                                texts=lemmatized_text,
                                                dictionary=dictionary,
                                                coherence='c_v')

        model_list.append(lda_model)
        coherence_values.append(coherence_model.get_coherence())

    # Plot once, after the loop, so x and y have the same length.
    fig, ax = plt.subplots()
    ax.plot(topics, coherence_values, marker='o')
    ax.set(xlabel='Number of topics',
           ylabel='Coherence (c_v)',
           title='Coherence by number of topics')
    plt.show()

    return model_list, coherence_values

In [None]:
def get_topics(model_list=None):
    """Write out and return the topic words of the chosen LDA model.

    Prompts for the preferred topic count, picks the model in ``model_list``
    whose ``num_topics`` equals it, extracts every run of ASCII letters from
    that model's printed topics, joins them with ``', '``, writes the result
    to ``./topics.txt`` and prints it.

    Args:
        model_list: Trained models to choose from. When omitted, the
            module-level ``model_list`` variable is used if it exists.

    Returns:
        str: The comma-separated topic words.

    Raises:
        ValueError: If no models are available, if the answer is not an
            integer, or if no model was trained with the requested count.
    """
    if model_list is None:
        # Keeps the argument-free call working after the driver cell has run.
        model_list = globals().get('model_list')
    if not model_list:
        raise ValueError('No trained models available; run compute_cv first.')

    optimal_topic_no = int(input('Enter your identified optimal topic number:'))
    optimal_model = next(
        (model for model in model_list if model.num_topics == optimal_topic_no),
        None,
    )
    if optimal_model is None:
        available = sorted(model.num_topics for model in model_list)
        raise ValueError(
            f'No model was trained with {optimal_topic_no} topics; available: {available}')

    # num_topics=-1 asks for all topics instead of the default cap of 20.
    topics = [topic for index, topic in optimal_model.print_topics(num_topics=-1)]

    pattern = re.compile(r'[a-zA-Z]+')
    topics_text = ', '.join(pattern.findall(' '.join(topics)))

    with open('./topics.txt', 'w') as f:
        f.write(topics_text)

    print(topics_text)

    return topics_text

In [None]:
# Interactive driver: search for URLs, optionally scrape them, optionally
# fit the topic models, and optionally export the topics of the chosen model.
if __name__ == '__main__':
    links = get_urls()
    proceed = input("Enter 'y' to proceed with webscraping:")
    if proceed.lower() == 'y':
        get_article_text(links)

    # Reset so the last step never picks up models from an earlier run.
    model_list = None

    proceed = input('Proceed with topic modelling? (y/n):')
    if proceed.lower() == 'y':
        # A keyword you expect all relevant files to contain
        keyword = input('Enter a keyword all relevant files must contain:')
        lemmatized_text = preprocessing(keyword)
        model_list, coherence_values = compute_cv(lemmatized_text)

    if model_list:
        proceed = input('Proceed and get topics? (y/n):')
        if proceed.lower() == 'y':
            derived_topics = get_topics()
    else:
        print('No topic models were trained; skipping topic extraction.')