In [2]:

import pandas as pd
import re
import json
import time
import numpy as np
import spacy
import requests
from bs4 import BeautifulSoup

In [3]:
article_df = pd.read_csv('data/complete_article_data.csv')
article_df.head()

Unnamed: 0,url,title,outlet,bias,date,content
0,https://www.bbc.com/arabic/articles/c4g3rqvpnq...,Trump signs order to impose tariffs on goods i...,BBC News,center,2025-02-01,صدر الصورة، Getty Images أعلنت كندا فرض رسوم ج...
1,https://www.thehindu.com/news/international/tr...,Trump signs executive order to impose tariffs ...,The Hindu,leanLeft,2025-02-01,"February 2, 2025e-Paper \n\t\t\t\t\t\t\t\t\tTh..."
2,https://www.news18.com/world/us-imposes-tariff...,"Canada Imposes 25% Tariffs Against US; Mexico,...",News18 India,leanRight,2025-02-01,Canada imposed 25 per cent tariffs on $155 bil...
3,https://apnews.com/article/trump-tariffs-mexic...,What do Trump's executive orders say on tariff...,Associated Press News,leanLeft,2025-02-01,Copyright 2025 The Associated Press. All Right...
4,https://www.ndtv.com/world-news/donald-trump-i...,Explained: What Are Tariffs Imposed By Donald ...,NDTV,leanRight,2025-02-01,US President Donald Trump signed an order impo...


In [4]:
article_df['content'].values[1].strip()
article_df['content'] = article_df['content'].apply(lambda x: x.strip())

article_df.head()

Unnamed: 0,url,title,outlet,bias,date,content
0,https://www.bbc.com/arabic/articles/c4g3rqvpnq...,Trump signs order to impose tariffs on goods i...,BBC News,center,2025-02-01,صدر الصورة، Getty Images أعلنت كندا فرض رسوم ج...
1,https://www.thehindu.com/news/international/tr...,Trump signs executive order to impose tariffs ...,The Hindu,leanLeft,2025-02-01,"February 2, 2025e-Paper \n\t\t\t\t\t\t\t\t\tTh..."
2,https://www.news18.com/world/us-imposes-tariff...,"Canada Imposes 25% Tariffs Against US; Mexico,...",News18 India,leanRight,2025-02-01,Canada imposed 25 per cent tariffs on $155 bil...
3,https://apnews.com/article/trump-tariffs-mexic...,What do Trump's executive orders say on tariff...,Associated Press News,leanLeft,2025-02-01,Copyright 2025 The Associated Press. All Right...
4,https://www.ndtv.com/world-news/donald-trump-i...,Explained: What Are Tariffs Imposed By Donald ...,NDTV,leanRight,2025-02-01,US President Donald Trump signed an order impo...


In [29]:
samp_txt = article_df['content'][1]
nlp = spacy.load("en_core_web_sm")

In [32]:

def filter_text(title: str, text: str):
    nlp1 = spacy.load("en_core_web_sm")
    nlp2 = spacy.load("en_core_web_sm")
    text = re.sub(r'\n\s*\n', '\n', text)  # Remove excess blank lines
    text = re.sub(r'\t\s*\t', '\t', text)
    text = text.encode("utf-8", errors='ignore').decode("utf-8") #Replacing unicode characters
    text = text.strip()
    doc: spacy.tokens.doc.Doc = nlp1(text)
    title = nlp2(title)
    first_subject = False
    relevant_lines = []
    article_flags = []

    #Making a list of relevant subjects/nouns in the title
    for word in title:
        if word.pos_ == 'PROPN':
            article_flags.append(word)
        elif word.dep_ in ['nsubj', 'ROOT']:
            article_flags.append(word)
        elif word.ent_id_ in ['PERSON', 'ORG', 'GPE']:
            article_flags.append(word)
    article_flags = [str(x).lower() for x in article_flags]

    #Keeping all sentences after first appearance of relevant word
    doc_sents = [x for x in doc.sents]
    #print(doc_sents)
    doc_reverse = list(reversed(doc_sents))
    for sentence in doc_sents:
        if not first_subject:
            for word in sentence:
                word = str(word).lower()
                if word in article_flags:
                    first_subject = True
                    relevant_lines.append(str(sentence))
        else:
            relevant_lines.append(str(sentence))
    
    #Removing all ending text after the last mention of a relevant entity
    last_subject=False
    for ind, sentence in enumerate(doc_reverse):
        if not last_subject:
            for word in sentence:
                word = str(word).lower()
                if word in article_flags:
                    last_subject = True
                    last_subj_ind = ind

    relevant_lines = relevant_lines[:-last_subj_ind]
    seen_lines = []
    for sentence in relevant_lines:
        if sentence in seen_lines:
            continue
        seen_lines.append(sentence.strip())
    #print(seen_lines, relevant_lines)

    return ' '.join(seen_lines)


def filter_text_2(text: str, nlp):
    text = re.sub(r'\n\s*\n', '\n', text)  # Remove excess blank lines
    text = re.sub(r'\t\s*\t', '\t', text)
    text = text.encode("utf-8", errors='ignore').decode("utf-8") #Replacing unicode characters
    text = text.strip()
    doc: spacy.tokens.doc.Doc = nlp(text)
    first_subject = False
    relevant_lines = []
    article_flags = {}
    doc_sents = [x for x in doc.sents]

    #Making a list of relevant subjects/nouns in the title
    for sent in doc_sents:
        for word in sent:
            w_str = str(word).lower()
            if word.pos_ == 'PROPN':
                if w_str not in article_flags:
                    article_flags[w_str] = 0
                article_flags[w_str] += 1
            elif word.dep_ in ['nsubj', 'ROOT']:
                if w_str not in article_flags:
                    article_flags[w_str] = 0
                article_flags[w_str] += 1
            elif word.ent_id_ in ['PERSON', 'ORG', 'GPE']:
                if w_str not in article_flags:
                    article_flags[w_str] = 0
                article_flags[w_str] += 1
    common_flag_vals = sorted(list(article_flags.values()), reverse=True)[:3]
    common_flags = [x for x,y in article_flags.items() if y in common_flag_vals]
    print(common_flags)
    #Keeping all sentences after first appearance of relevant word
    #print(doc_sents)
    doc_reverse = list(reversed(doc_sents))
    for sentence in doc_sents:
        if not first_subject:
            for word in sentence:
                word = str(word).lower()
                if word in common_flags:
                    first_subject = True
                    relevant_lines.append(str(sentence))
        else:
            relevant_lines.append(str(sentence))
    
    #Removing all ending text after the last mention of a relevant entity
    last_subject=False
    for ind, sentence in enumerate(doc_reverse):
        if not last_subject:
            for word in sentence:
                word = str(word).lower()
                if word in common_flags:
                    last_subject = True
                    last_subj_ind = ind

    relevant_lines = relevant_lines[:-last_subj_ind]
    seen_lines = []
    for sentence in relevant_lines:
        if sentence in seen_lines:
            continue
        seen_lines.append(sentence.strip())
    full_text = ' '.join(seen_lines)
    return full_text

    #Further processing 
    pattern = r"\n"
    full_text = re.sub(pattern, ".", full_text)
    pattern2 = r'\s{2,}'
    full_text = re.sub(pattern2, "", full_text)

    #removing sentences that have between 2 and 4 words
    sents = full_text.split(".")
    sents_cleaned = [x for x in sents if not 4 >= len(x.split()) >1]    

    return 

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/105.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}

proxy_list = []
with open("proxies.txt", "r") as f:
    proxies = f.read().split('\n')
    for prox in proxies:
        proxy_list.append(prox)



In [33]:
def scrape_url(url, headers, proxy):
    try:
        response = requests.get(url, timeout=2.5, headers=headers)
        print(response.status_code)
        if response.status_code != 200:
            url = f"https://web.archive.org/web/{url}"
            response = requests.get(url, timeout=2.5, headers=headers, proxies=proxies)
            if response.status_code != 200:
                print('error')
                return 'Error'
    except:
        return 'Error'
    content = BeautifulSoup(response.text, "html.parser")
    all_p = content.find_all('p')
    clean_text = [x.text for x in all_p]
    full_text = ' '.join(clean_text)
    return full_text

url = 'https://www.theepochtimes.com/us/white-house-official-says-more-than-15-countries-have-made-trade-deal-offers-5839767'
pattern = r"\s"
url = re.sub(pattern, "", url)
url_text = scrape_url(url, headers, proxy=proxy_list[25])
filter_text_2(url_text, nlp)


200
['said', 'trump', 'president', 'u.s.', 'we']


''

In [10]:

def get_proxies():
    proxy_list = []
    working_proxies = []
    with open("proxies_unfiltered.txt", "r") as f:
        proxies = f.read().split('\n')
        for prox in proxies:
            proxy_list.append(prox)

    for proxy in proxy_list:
        proxy_dict = {"http": proxy, "https": proxy}
        try:
            resp = requests.get(url, headers=headers, proxies=proxy_dict, timeout=2.5).status_code
        except:
            continue
        if resp == 200:
            print(proxy)
            working_proxies.append(proxy)
    return working_proxies


In [9]:
requests.get("https://www.bloomberg.com/news/articles/2025-04-10/hassett-says-us-well-advanced-in-trade-talks-with-some-nations")

<Response [403]>

In [None]:
try:
    previous_article_df = pd.read_csv('data/Clean_article_text.csv')
    seen_urls = previous_article_df['url'].values
except:
    seen_urls = []

cleaned_content = []

In [6]:
for i in range(article_df.shape[0]):
    row = article_df.iloc[i]
    content = row['content']
    title = row['title']
    #print(title, content)
    url = row['url']
    if url not in seen_urls:
        try:
            cleaned_text = filter_text(title, content)
            if cleaned_text:
                cleaned_content.append(cleaned_text)
            else:
                cleaned_content.append('Error')
        except:
            cleaned_content.append('Error')
        seen_urls.append(url)

In [112]:
urls = article_df['url'].values.tolist()
article_unique = article_df.drop_duplicates(subset=['url'],keep='first')
article_unique['cleaned_text'] = cleaned_content
article_unique['word_count'] = article_unique['cleaned_text'].apply(lambda x: len(x.split()))
clean_text_df = article_unique.loc[article_unique['word_count'] > 10]
clean_text_df.to_csv('data/Clean_article_text.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  article_unique['cleaned_text'] = cleaned_content
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  article_unique['word_count'] = article_unique['cleaned_text'].apply(lambda x: len(x.split()))
