In [3]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

In [5]:
# Section front page to scrape
url = "https://www.nytimes.com/section/world"

# Request the page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() reports an HTTP error here instead of
# letting it surface later as an IndexError on an empty result list.
r1 = requests.get(url, timeout=30)
r1.raise_for_status()

# We'll save in coverpage the cover page content (raw bytes)
coverpage = r1.content

# Soup creation, using the 'lxml' parser
soup1 = BeautifulSoup(coverpage, 'lxml')

# News identification: every <div> carrying this exact class string.
# NOTE(review): these look like generated CSS class names — they will
# presumably change when the site is redeployed; confirm before relying on them.
coverpage_news = soup1.find_all('div', class_="css-4svvz1 ekkqrpp0")

In [6]:
# Ordered lists (<ol>) found inside the first matching news container
first_news_container = coverpage_news[0]
ol = first_news_container.find_all('ol', class_="css-11jjg ekkqrpp2")

In [7]:
# Empty lists for image URLs, titles and article contents
img_lst = []
title_lst = []
content_lst = []

for n in [0, 2, 3]:
    # Getting img url: one entry per <img> found in this list. 'Null' is
    # stored when the list does not exist or an <img> has no src attribute.
    try:
        for i in ol[n].find_all('img'):
            img_lst.append(i['src'])
    except (IndexError, KeyError):
        img_lst.append('Null')

    # Extracting h2 tags (the first list uses a different headline class)
    if n == 0:
        tag = ol[n].find_all('h2', class_="css-l2vidh e4e4i5l1")
    else:
        try:
            tag = ol[n].find_all('h2', class_="css-y3otqb e134j7ei0")
        except IndexError:
            # This list is missing from the page: process nothing rather than
            # re-processing the headlines left over from the previous iteration.
            tag = []

    for i in tag:
        anchor = i.find('a')
        if anchor is None or not anchor.get('href'):
            # Headline without a link: there is nothing to fetch. Skipping it
            # entirely keeps title_lst and content_lst the same length.
            continue

        # Getting title
        title_lst.append(anchor.get_text())

        # Getting the link of the article. The href is presumably
        # site-relative ("/2020/..."); stripping its leading slash avoids
        # producing "https://www.nytimes.com//...".
        link = "https://www.nytimes.com/" + anchor['href'].lstrip('/')

        # Reading the content (it is divided in paragraphs)
        try:
            article = requests.get(link, timeout=30)
        except requests.exceptions.RequestException:
            # Network failure: record a placeholder so the content list stays
            # aligned with the title list, then move on to the next headline.
            content_lst.append("Null")
            continue

        soup_article = BeautifulSoup(article.content, 'lxml')
        body = soup_article.find_all('section', class_='meteredContent css-1r7ky0e')
        if not body:
            # Article body section not found on the page
            content_lst.append("Null")
            continue

        # Unifying the paragraphs (an article with no <p> yields an empty string)
        final_article = " ".join(p.get_text() for p in body[0].find_all('p'))

        # Removing special characters (non-breaking spaces)
        final_article = re.sub("\\xa0", "", final_article)
        content_lst.append(final_article)

In [8]:
# Cleaning repeated photos.
# Every deletion shifts the remaining items one place to the left, so deleting
# at positions 1, 2, 3 and 4 in turn removes what were originally items
# 1, 3, 5 and 7. Re-running this cell removes further items.
for position in range(1, 5):
    del img_lst[position]

In [15]:
import nltk
import json
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [16]:
# Loop-invariant resources: build them once instead of once per title.
stop_words = set(stopwords.words('english'))
with open("countries.json") as f:
    countries = json.load(f)

# POS tags of the tokens kept as candidate location words
candidate_pos_tags = {'NNP', 'JJ', 'NNPS', 'VBP', 'NN'}

# One list of country codes (plus the "Null" sentinel) per title
lst2 = []
for title in title_lst:
    # Tokenize the title and delete the stop words
    words = word_tokenize(title)
    filtered_sentence = [w for w in words if w not in stop_words]

    pos_tag = nltk.pos_tag(filtered_sentence)

    # Candidate words selected by POS tag; 'COVID-19' is never a candidate
    # (every occurrence is dropped, not just the first one).
    code = [
        word for word, word_tag in pos_tag
        if word_tag in candidate_pos_tags and word != 'COVID-19'
    ]

    # Identify named entities tagged as geo-political entities or locations
    name = [
        u' '.join(leaf[0] for leaf in ne.leaves())
        for ne in nltk.ne_chunk(pos_tag)
        if type(ne) is nltk.tree.Tree and ne.label() in ['GPE', 'LOCATION']
    ]

    lst = list(set(name + code))

    # Match every candidate against every country.
    # NOTE(review): the country name and code are used as regular-expression
    # patterns, so regex metacharacters in countries.json are interpreted and
    # a code matches anywhere inside a token. If they are meant as literal
    # text, wrap them in re.escape() — confirm against the data file.
    lst1 = []
    for i in lst:
        for country in countries:
            if re.search(country['name'], i) or re.search(country['code'], i):
                lst1.append(country['code'])

    # "Null" sentinel. The previous for/else had no break, so it appended
    # "Null" after every candidate whether or not a country matched; the
    # location-resolution cell depends on the sentinel being present. Adding
    # it once per title keeps that contract and also covers titles that
    # produced no candidates at all.
    lst1.append("Null")
    lst2.append(lst1)
                

In [17]:
# Resolve exactly one location per title so that location_lst always has the
# same length as title_lst (required to build the DataFrame). Previously a
# title matching several countries contributed several entries and an empty
# list contributed none, which shifted every following row.
location_lst = []
for lst in lst2:
    # Distinct country codes found for this title, ignoring the "Null" sentinel
    codes = sorted(set(lst) - {"Null"})
    # No country found -> 'World'. With several matches the alphabetically
    # first code is kept so the result is deterministic.
    # NOTE(review): confirm which code should win when a title names several countries.
    location_lst.append(codes[0] if codes else 'World')
        

In [18]:
# Display the resolved locations
location_lst            

['World',
 'World',
 'World',
 'World',
 'NI',
 'World',
 'World',
 'UK',
 'CN',
 'NI',
 'World',
 'World',
 'UK',
 'CN']

In [20]:
import pickle
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [21]:
# SVM classifier restored from a pickle file (presumably a scikit-learn estimator).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
with open('best_svc.pickle', 'rb') as model_file:
    svc_model = pickle.load(model_file)



In [22]:
# TF-IDF object restored from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
with open('tfidf.pickle', 'rb') as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)



In [23]:
# Category mapping dictionary: category name -> numeric code
category_codes = dict(
    business=0,
    entertainment=1,
    politics=2,
    sport=3,
    tech=4,
    other=5,
)

In [24]:
# df_show_info: one row per scraped article. The newspaper name is a scalar
# and is repeated on every row; the other columns come from the scraped lists.
show_info_columns = {
    'Newspaper': 'The New York Times',
    'Title': title_lst,
    'Img': img_lst,
    'Content': content_lst,
    'Location': location_lst,
}
df_show_info = pd.DataFrame(show_info_columns)

In [25]:
# Display the assembled DataFrame
df_show_info

Unnamed: 0,Newspaper,Title,Img,Content,Location
0,The New York Times,‘Our Role Is to Reduce Their Grief’,https://static01.nyt.com/images/2020/07/19/wor...,THE DESERT WEST OF NAJAF — There are no signs ...,World
1,The New York Times,Coronavirus Live Updates: Trump Administration...,https://static01.nyt.com/images/2020/07/19/wor...,The Trump administration has balked at providi...,World
2,The New York Times,"As Seasonal Rains Fall, Dispute Over Nile Dam ...",https://static01.nyt.com/images/2020/07/16/us/...,"CAIRO — Every day now, seasonal rain pounds th...",World
3,The New York Times,London Police Urged to Apologize After Officer...,https://static01.nyt.com/images/2020/07/16/us/...,LONDON — The lawyer for a Black man who repeat...,World
4,The New York Times,Nicaragua’s Ruling Sandinistas Fall Victim to ...,https://static01.nyt.com/images/2020/07/19/wor...,A string of recent deaths across Nicaragua — i...,NI
5,The New York Times,"Defying Kremlin, Protesters Stage Biggest Rall...",https://static01.nyt.com/images/2020/07/19/wor...,MOSCOW — Ignoring pleas from the Kremlin for c...,World
6,The New York Times,Fire Hits Cathedral in French City of Nantes,https://static01.nyt.com/images/2020/07/18/wor...,A fire broke out inside the cathedral of the w...,World
7,The New York Times,Long Waits for U.K. Hospital Treatment as N.H....,https://static01.nyt.com/images/2020/07/18/wor...,LONDON — After nine months of waiting for surg...,UK
8,The New York Times,China’s Swimwear Capital Can’t Wait for You to...,https://static01.nyt.com/images/2020/07/19/wor...,There may be no place on earth that had been l...,CN
9,The New York Times,Nicaragua’s Ruling Sandinistas Fall Victim to ...,https://static01.nyt.com/images/2020/07/19/wor...,A string of recent deaths across Nicaragua — i...,NI


In [26]:
# Feature engineering helpers

# Punctuation marks stripped from the article text
punctuation_signs = ["?", ":", "!", ".", ",", ";"]
# English stop words from the NLTK corpus, as a list
stop_words = [*stopwords.words('english')]

def create_features_from_df(df):
    """Clean the 'Content' column of ``df`` and turn it into TF-IDF features.

    The cleaning steps are stored on ``df`` itself as new columns
    ('Content_Parsed_1' ... 'Content_Parsed_6'), so the frame passed in is
    modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named 'Content'.

    Returns
    -------
    numpy.ndarray
        ``tfidf.transform(...)`` of the fully cleaned text, converted to a
        dense array and reshaped to a single column.
    """
    # 1. Whitespace and quote clean-up. These are literal substitutions, so
    #    regex=False is passed explicitly instead of relying on the pandas
    #    default, which differs between pandas versions.
    parsed = df['Content']
    for old, new in (("\r", " "), ("\n", " "), ("    ", " "), ('"', '')):
        parsed = parsed.str.replace(old, new, regex=False)
    df['Content_Parsed_1'] = parsed

    # 2. Lower-casing
    df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()

    # 3. Punctuation removal (literal: "?" and "." must not act as regex operators)
    parsed = df['Content_Parsed_2']
    for punct_sign in punctuation_signs:
        parsed = parsed.str.replace(punct_sign, '', regex=False)
    df['Content_Parsed_3'] = parsed

    # 4. Possessive removal
    df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "", regex=False)

    # 5. Lemmatization, treating every word as a verb. Iterating over the
    #    column values (rather than df.loc[0..n-1]) also works when the frame
    #    does not have the default 0..n-1 index.
    wordnet_lemmatizer = WordNetLemmatizer()
    df['Content_Parsed_5'] = [
        " ".join(wordnet_lemmatizer.lemmatize(word, pos="v") for word in text.split(" "))
        for text in df['Content_Parsed_4']
    ]

    # 6. Stop-word removal. The pattern uses \b word boundaries, so it must be
    #    applied as a regular expression; without regex=True newer pandas
    #    versions treat it as literal text and remove nothing.
    parsed = df['Content_Parsed_5']
    for stop_word in stop_words:
        regex_stopword = r"\b" + stop_word + r"\b"
        parsed = parsed.str.replace(regex_stopword, '', regex=True)
    df['Content_Parsed_6'] = parsed

    # TF-IDF on the fully cleaned text
    features = tfidf.transform(df['Content_Parsed_6']).toarray()
    # NOTE(review): toarray() is already 2-D (one row per document); this
    # reshape flattens it into a single column, which presumably does not match
    # the feature count the classifier was trained on. Kept because it defines
    # the current return shape — confirm and remove if unintended.
    features = features.reshape(-1, 1)

    return features

In [27]:
def get_category_name(category_id):
    """Return the category name whose code equals ``category_id``.

    The lookup scans ``category_codes`` in insertion order and returns the
    first matching name, or None when no category has that code.
    """
    matching_names = (
        name for name, code in category_codes.items() if code == category_id
    )
    return next(matching_names, None)

In [44]:
# Prediction Functions

def predict_from_features(features, threshold=.65, fallback_category=5):
    """Predict a category name for every row of ``features``.

    Both the predicted class and its probability come from the fitted
    ``svc_model``. A prediction is kept only when its highest class
    probability exceeds ``threshold``; otherwise ``fallback_category`` is
    used instead.

    Parameters
    ----------
    features : array-like
        Feature matrix accepted by ``svc_model.predict``.
    threshold : float, optional
        Minimum probability (exclusive) for a prediction to be kept.
    fallback_category : int, optional
        Category code used for low-confidence rows (5 is 'other' in
        ``category_codes``).

    Returns
    -------
    list
        One category name per row, as returned by ``get_category_name``.
    """
    # Highest class probability per row, from the same model that makes the
    # prediction. (Fitting a fresh SVC here cannot work: there are no labels,
    # and train_test_split on a single array returns two feature splits.)
    # NOTE(review): predict_proba requires the pickled model to have been
    # trained with probability=True — confirm.
    predictions_proba = svc_model.predict_proba(features).max(axis=1)

    # Predict using the input model
    predictions_pre = svc_model.predict(features)

    # Replace low-confidence predictions with the fallback category
    predictions = [
        cat if prob > threshold else fallback_category
        for prob, cat in zip(predictions_proba, predictions_pre)
    ]

    # Map numeric codes back to category names
    return [get_category_name(x) for x in predictions]

In [45]:
def complete_df(df, categories):
    """Store ``categories`` under the 'Prediction' key of ``df`` and return ``df``.

    The assignment is made on the object passed in (no copy is taken), so the
    caller's ``df`` is modified as well.
    """
    df['Prediction'] = categories
    return df

In [48]:
# Create features (this also adds the Content_Parsed_* columns to df_show_info)
features = create_features_from_df(df_show_info)

# Predict
predictions = predict_from_features(features)

# Put into dataset: pass the whole frame. df_show_info has no 'Category'
# column, so indexing it with ['Category'] raised a KeyError.
df = complete_df(df_show_info, predictions)

In [None]:
# Write the final dataset to disk as JSON
output_path = 'nytimes19.json'
df.to_json(output_path)