In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pickle
import nltk
nltk.download('punkt')
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import numpy as np
import re
from sklearn import feature_extraction
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import words

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/austinkrause/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
import os

In [8]:
# List the entries of the current working directory.
contents = os.listdir('.')
print(contents)

['new_user_functions.ipynb', 'scratch_notebook.ipynb', '.DS_Store', 'summarizer_cosine_similarity.ipynb', 'Images', 'Text_Cleaning_and_KMeans.ipynb', 'df_with_gensim_summaries.csv', 'kmeans_df.csv', 'df_with_stems_final.csv', 'full_df_with_gensim_summaries.csv', 'Models', 'README.md', 'df_english_articles.csv', '.gitignore', 'DB_Cleaning_EDA.ipynb', 'xgboost_model.sav', '.ipynb_checkpoints', 'classification_models.ipynb', 'all-the-news.db', '.git', 'Data', 'Notebooks', 'df_with_lemmings.csv']


In [4]:
#load in our xgb classification model
# NOTE(review): pickle.load can run arbitrary code while unpickling, so only
# load model files that come from a trusted source.
# The with-block closes the file handle as soon as the model has been read.
with open("xgboost_model.sav", "rb") as model_file:
    xgb = pickle.load(model_file)

In [9]:
def create_soup(url, timeout=10):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address passed to ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection blocks the call indefinitely.

    Returns
    -------
    BeautifulSoup
        The response content parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failures, timeouts).
    """
    # The HTTP status is deliberately not checked: an error page is parsed
    # like any other response, exactly as before.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

In [124]:
def get_article_text(url):
    """Return the text of every ``<p>`` element found at ``url`` as one string.

    The page is fetched with ``create_soup``. Paragraph texts are joined with
    single spaces and every non-breaking space character (U+00A0) is removed
    from the joined result.
    """
    paragraphs = [tag.get_text() for tag in create_soup(url).find_all('p')]
    joined = ' '.join(paragraphs)
    return joined.replace('\xa0', '')

In [12]:
#stop-word list: the unique NLTK English stop words followed by each punctuation character
stops = [*set(stopwords.words('english')), *punctuation]

In [13]:
# Shared Porter stemmer instance; stem_list calls its stem() method.
stemmer = PorterStemmer()

In [14]:
#helper function to remove stop words
def remove_stops(text):
    """Lower-case each token and drop stop words and one-character tokens.

    ``text`` is an iterable of string tokens. The result is a new list holding
    the lower-cased tokens that are not in the module-level ``stops`` list and
    whose length is not exactly one, in their original order.
    """
    lowered_tokens = (token.lower() for token in text)
    return [
        token
        for token in lowered_tokens
        if token not in stops and len(token) != 1
    ]

In [15]:
#helper function to lemmatize text
def lemmatize_text(text):
    """Return a list with the lemma of every word in ``text``.

    NOTE(review): this relies on a module-level ``lemmatizer`` object exposing
    ``lemmatize(word)``. None of the visible cells creates one, so it has to be
    defined before the first call with a non-empty input.
    """
    return [lemmatizer.lemmatize(word) for word in text]

In [16]:
#create helper function to stem the leading words of a list and concat them
def stem_list(lst, max_words=100):
    """Stem the leading words of ``lst`` and join them with single spaces.

    Parameters
    ----------
    lst : iterable of str
        Tokens to stem, in order.
    max_words : int or None, optional
        Maximum number of leading tokens to keep (non-negative). ``None``
        keeps every token. Defaults to 100.

    Returns
    -------
    str
        The stems of the kept tokens separated by single spaces; an empty
        string when there are no tokens.
    """
    stemmed_words = []
    for word in lst:
        # Stop before stemming words that would be discarded anyway: the
        # result only ever contains the first ``max_words`` stems.
        if max_words is not None and len(stemmed_words) >= max_words:
            break
        stemmed_words.append(stemmer.stem(word))
    return ' '.join(stemmed_words)

In [17]:
#function to tokenize, remove stop words, stem, and package text for the model
def xgb_text_prep(sample_text):
    """Turn raw input into the single-document list handed to the classifier.

    ``sample_text`` is converted with ``str()``, split into word tokens,
    filtered with ``remove_stops`` and stemmed/joined with ``stem_list``
    (which keeps only the leading words, the first 100).

    Returns
    -------
    list of str
        A one-element list holding the space-joined stems.
    """
    # NLTK names its Punkt models by full language name ('english');
    # it ships no model called 'en'.
    tokens = word_tokenize(str(sample_text), language='english')
    return [stem_list(remove_stops(tokens))]

In [96]:
#function for xgboost prediction
def get_xgb_prediction(text):
    """Return the first prediction of the loaded ``xgb`` model for ``text``.

    The text is wrapped in a one-element list before preprocessing, and
    ``xgb_text_prep`` applies ``str()`` to whatever it receives, so the
    tokenizer sees the string form of that list.
    NOTE(review): confirm this matches how the training text was prepared.
    """
    model_input = xgb_text_prep([text])
    predictions = xgb.predict(model_input)
    return predictions[0]

In [19]:
#find cosine similarities between sentences
def find_similarities(text):
    """Pick the sentences of ``text`` that are most similar to all the others.

    The text is split into sentences, each sentence becomes a TF-IDF vector
    (ignoring the module-level ``stops``), and every sentence is scored by its
    mean cosine similarity to all sentences, itself included. For ``n``
    sentences the ``ceil(sqrt(n))`` best-scoring ones are selected.

    Parameters
    ----------
    text : str
        The article text.

    Returns
    -------
    numpy.ndarray
        Positions of the selected sentences in the sentence list, ordered
        from lowest to highest score. Empty when ``text`` has no sentences.
    """
    # 'english' is the name of NLTK's Punkt model and also the default that
    # build_summary relies on when it maps these indices back to sentences.
    sentences = sent_tokenize(text, language='english')
    if not sentences:
        # Nothing to rank; the vectorizer would raise on an empty corpus.
        return np.array([], dtype=int)

    #vectorize sentences
    vectorizer = TfidfVectorizer(stop_words=stops)
    tfidf = vectorizer.fit_transform(sentences)

    #declare how many sentences to use in summary
    num_summary_sentences = int(np.ceil(len(sentences) ** .5))

    #average cosine similarity of each sentence against all sentences
    similarities = cosine_similarity(tfidf, tfidf)
    avg_similarity = [row.mean() for row in similarities]

    # argsort is ascending, so the tail holds the highest averages.
    return np.argsort(avg_similarity)[-num_summary_sentences:]

In [77]:
def build_summary(text):
    """Assemble an extractive summary of ``text``.

    The sentence positions chosen by ``find_similarities`` are put into
    ascending order, the matching sentences are taken from
    ``sent_tokenize(text)``, stripped of newline characters and joined with
    single spaces.

    Returns
    -------
    tuple
        ``(selected_positions, total_sentence_count, summary_string)``.
    """
    chosen = sorted(find_similarities(text))
    all_sentences = sent_tokenize(text)
    summary = ' '.join(
        all_sentences[position].replace('\n', '') for position in chosen
    )
    return chosen, len(all_sentences), summary

In [125]:
def xgb_and_summary(url):
    """Fetch the article at ``url``, summarize it and predict its cluster.

    Parameters
    ----------
    url : str
        Address of the article page.

    Returns
    -------
    tuple
        ``(url, selected_sentence_positions, total_sentence_count,
        cluster_prediction, summary)``.
    """
    text = get_article_text(url)

    # get_xgb_prediction runs the text preprocessing itself, so it is not
    # repeated here.
    sents, num_sents, summary = build_summary(text)
    xgb_predict = get_xgb_prediction(text)

    return url, sents, num_sents, xgb_predict, summary

TEST ARTICLE 1 --- Supreme Court --- returns cluster 7
https://news.yahoo.com/u-supreme-court-spurns-virginia-143413283.html;_ylt=AwrC0COK7wddIj8AGCPQtDMD;_ylu=X3oDMTEydGhybm9tBGNvbG8DYmYxBHBvcwMxBHZ0aWQDQjc2MDlfMQRzZWMDc3I-

TEST ARTICLE 2 --- Donald Trump --- returns cluster 1
https://news.yahoo.com/trump-says-d-foreigners-offer-233911079.html;_ylt=AwrC1C7Y8AddADIAoS_QtDMD;_ylu=X3oDMTEyaTE5czNyBGNvbG8DYmYxBHBvcwM0BHZ0aWQDQjc2MDlfMQRzZWMDc3I-

TEST ARTICLE 3 --- Syria --- returns cluster 11
https://news.yahoo.com/turkish-outpost-syria-shelled-syrian-080926968.html;_ylt=AwrC0wwt8gdd2WgAkwHQtDMD;_ylu=X3oDMTEydGhybm9tBGNvbG8DYmYxBHBvcwMxBHZ0aWQDQjc2MDlfMQRzZWMDc3I-

TEST ARTICLE 4 --- Birth Control Health Insurance --- returns cluster 3
https://news.yahoo.com/democrat-warren-wants-7-billion-134454649.html;_ylt=AwrC1DGZ8gddwlsAJwLQtDMD;_ylu=X3oDMTEydDBqZzZmBGNvbG8DYmYxBHBvcwMyBHZ0aWQDQjc2MDlfMQRzZWMDc3I-

TEST ARTICLE 5 --- Bernie Sanders --- returns cluster 3
https://news.yahoo.com/sanders-outline-democratic-socialism-means-045302693.html;_ylt=AwrC0CMD4whdTSoALAHQtDMD;_ylu=X3oDMTEydDBqZzZmBGNvbG8DYmYxBHBvcwMyBHZ0aWQDQjc2MDlfMQRzZWMDc3I-

In [126]:
# Run the full pipeline (fetch, summarize, predict cluster) on a single article URL.
xgb_and_summary('https://www.nytimes.com/2019/06/17/books/review/kate-atkinson-jackson-brodie-big-sky.html')

('https://www.nytimes.com/2019/06/17/books/review/kate-atkinson-jackson-brodie-big-sky.html',
 [0, 2, 4, 14, 23, 32, 34],
 43,
 3,
 'Advertisement Supported by By Janet Maslin There’s a spoiled little girl in “Big Sky,” the long-overdue fifth book in Kate Atkinson’s irresistible Jackson Brodie private eye series, who has a closet full of princess costumes. Atkinson sneaks this into “Big Sky” so casually that it doesn’t resonate until exactly when the author wants it to. in all the Brodie books, but it’s never worked better than it does in “Big Sky.” That’s because Brodie’s past is by now very complicated. Atkinson has said that she loves Netflix, attributes the same feeling to Brodie, and has had a previous Brodie book, “Case Histories,” adapted for PBS. There is no stray anything in “Big Sky.” That’s one big reason Atkinson’s devotees love her. Brodie has been Atkinson’s most popular character for good reason. Atkinson opens “Big Sky” with one perfect page.')

In [144]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999.
// NOTE(review): presumably this keeps long printed output from collapsing
// into a scrolling box; confirm against the notebook front end in use.
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [32]:
def get_ny_articles(url):
    """Collect the site-relative links found on the page at ``url``.

    An ``href`` is kept when it starts with a single ``/``, is longer than two
    characters and does not end with ``Container``. Kept links are resolved
    against ``url`` and de-duplicated.

    Parameters
    ----------
    url : str
        Address of the page to scan; also the base for resolving links.

    Returns
    -------
    list of str
        Absolute URLs, unique, in the order they first appear on the page.
    """
    from urllib.parse import urljoin

    page = create_soup(url)
    hrefs = [anchor['href'] for anchor in page.find_all('a', href=True)]

    article_urls = []
    for href in hrefs:
        # startswith() copes with an empty href (which href=True still
        # matches); links starting with '//' name another host and are skipped.
        is_site_relative = href.startswith('/') and not href.startswith('//')
        if is_site_relative and len(href) > 2 and not href.endswith('Container'):
            # urljoin resolves against the site root, so a trailing slash or
            # a path on ``url`` no longer corrupts the result.
            article_urls.append(urljoin(url, href))

    # dict.fromkeys drops duplicates while keeping page order, which makes
    # the result the same from run to run.
    return list(dict.fromkeys(article_urls))

In [142]:
def get_all_summaries(url):
    """Print a summary report for every article link found at ``url``.

    Every link returned by ``get_ny_articles`` is first processed with
    ``xgb_and_summary``; only after all of them have been processed is the
    report printed, one labelled line per field followed by a dashed
    separator. Nothing is returned.
    """
    results = [xgb_and_summary(link) for link in get_ny_articles(url)]

    labels = (
        'url:',
        'sentences selected:',
        'total sentences:',
        'cluster prediction:',
        'summary:',
    )
    for result in results:
        for position, label in enumerate(labels):
            print(label, result[position])
        print('------------------------------------\n')

In [157]:
# Print the summary report for every article link found on the nytimes.com root page.
get_all_summaries('https://www.nytimes.com')

url: https://www.nytimes.com/interactive/2019/us/politics/2020-candidate-interviews.html
sentences selected: [0, 1, 2, 4]
total sentences: 14
cluster prediction: 3
summary: JUNE 19, 2019 We tracked down the 2020 Democrats and asked them the same set of questions. We tracked down the 2020 Democrats and asked them the same set of questions. We tracked down the 2020 Democrats and asked them the same set of questions. Former housing secretary   Mayor of New York City   Former congressman from Maryland   Congresswoman from Hawaii   Senator from New York   Senator from California   Former governor of Colorado   Governor of Washington State   Senator from Minnesota   Congressman from Massachusetts   Former congressman from Texas   Congressman from Ohio   Senator from Vermont   Congressman from California   Senator from Massachusetts   Self-help author   Former tech executive  The New York Times reached out to 22 Democratic presidential candidates to ask them the same set of questions on video