In [2]:

import pandas as pd
import re
import json
import time
import numpy as np
import spacy

In [3]:
article_df = pd.read_csv('data/complete_article_data.csv')
article_df.head()

Unnamed: 0,url,title,outlet,bias,date,content
0,https://www.bbc.com/arabic/articles/c4g3rqvpnq...,Trump signs order to impose tariffs on goods i...,BBC News,center,2025-02-01,صدر الصورة، Getty Images أعلنت كندا فرض رسوم ج...
1,https://www.thehindu.com/news/international/tr...,Trump signs executive order to impose tariffs ...,The Hindu,leanLeft,2025-02-01,"February 2, 2025e-Paper \n\t\t\t\t\t\t\t\t\tTh..."
2,https://www.news18.com/world/us-imposes-tariff...,"Canada Imposes 25% Tariffs Against US; Mexico,...",News18 India,leanRight,2025-02-01,Canada imposed 25 per cent tariffs on $155 bil...
3,https://apnews.com/article/trump-tariffs-mexic...,What do Trump's executive orders say on tariff...,Associated Press News,leanLeft,2025-02-01,Copyright 2025 The Associated Press. All Right...
4,https://www.ndtv.com/world-news/donald-trump-i...,Explained: What Are Tariffs Imposed By Donald ...,NDTV,leanRight,2025-02-01,US President Donald Trump signed an order impo...


In [4]:
article_df['content'].values[1].strip()
article_df['content'] = article_df['content'].apply(lambda x: x.strip())

article_df.head()

Unnamed: 0,url,title,outlet,bias,date,content
0,https://www.bbc.com/arabic/articles/c4g3rqvpnq...,Trump signs order to impose tariffs on goods i...,BBC News,center,2025-02-01,صدر الصورة، Getty Images أعلنت كندا فرض رسوم ج...
1,https://www.thehindu.com/news/international/tr...,Trump signs executive order to impose tariffs ...,The Hindu,leanLeft,2025-02-01,"February 2, 2025e-Paper \n\t\t\t\t\t\t\t\t\tTh..."
2,https://www.news18.com/world/us-imposes-tariff...,"Canada Imposes 25% Tariffs Against US; Mexico,...",News18 India,leanRight,2025-02-01,Canada imposed 25 per cent tariffs on $155 bil...
3,https://apnews.com/article/trump-tariffs-mexic...,What do Trump's executive orders say on tariff...,Associated Press News,leanLeft,2025-02-01,Copyright 2025 The Associated Press. All Right...
4,https://www.ndtv.com/world-news/donald-trump-i...,Explained: What Are Tariffs Imposed By Donald ...,NDTV,leanRight,2025-02-01,US President Donald Trump signed an order impo...


In [9]:
article_df['content'][1]

"February 2, 2025e-Paper \n\t\t\t\t\t\t\t\t\tThe Hindu On Books\n\t\t\t\t\t\t\t\t\tBooks of the week, reviews, excerpts, new titles and features.\n \n\t\t\t\t\t\t\t\t\tData Point\n\t\t\t\t\t\t\t\t\tDecoding the headlines with facts, figures, and numbers\n \n\t\t\t\t\t\t\t\t\tFirst Day First Show\n\t\t\t\t\t\t\t\t\tNews and reviews from the world of cinema and streaming.\n \n\t\t\t\t\t\t\t\t\tHealth Matters\n\t\t\t\t\t\t\t\t\tRamya Kannan writes to you on getting to good health, and staying there\n \n\t\t\t\t\t\t\t\t\tThe View From India\n\t\t\t\t\t\t\t\t\tLooking at World Affairs from the Indian perspective.\n \n\t\t\t\t\t\t\t\t\tScience For All\n\t\t\t\t\t\t\t\t\tThe weekly newsletter from science writers takes the jargon out of science and puts the fun in!\n \n\t\t\t\t\t\t\t\t\tKarnataka Today\n\t\t\t\t\t\t\t\t\tYour daily dose of news highlights from Karnataka\n \n\t\t\t\t\t\t\t\t\tToday's Cache\n\t\t\t\t\t\t\t\t\tYour download of the top 5 technology stories of the day.\n February 

In [97]:
def filter_text(title: str, text: str):
    nlp1 = spacy.load("en_core_web_sm")
    nlp2 = spacy.load("en_core_web_sm")
    text = re.sub(r'\n\s*\n', '\n', text)  # Remove excess blank lines
    text = re.sub(r'\t\s*\t', '\t', text)
    text = text.encode("utf-8", errors='ignore').decode("utf-8") #Replacing unicode characters
    text = text.strip()
    doc: spacy.tokens.doc.Doc = nlp1(text)
    title = nlp2(title)
    first_subject = False
    relevant_lines = []
    article_flags = []

    #Making a list of relevant subjects/nouns in the title
    for word in title:
        if word.pos_ == 'PROPN':
            article_flags.append(word)
        elif word.dep_ in ['nsubj', 'ROOT']:
            article_flags.append(word)
        elif word.ent_id_ in ['PERSON', 'ORG', 'GPE']:
            article_flags.append(word)
    article_flags = [str(x).lower() for x in article_flags]

    #Keeping all sentences after first appearance of relevant word
    doc_sents = [x for x in doc.sents]
    #print(doc_sents)
    doc_reverse = list(reversed(doc_sents))
    for sentence in doc_sents:
        if not first_subject:
            for word in sentence:
                word = str(word).lower()
                if word in article_flags:
                    first_subject = True
                    relevant_lines.append(str(sentence))
        else:
            relevant_lines.append(str(sentence))
    
    #Removing all ending text after the last mention of a relevant entity
    last_subject=False
    for ind, sentence in enumerate(doc_reverse):
        if not last_subject:
            for word in sentence:
                word = str(word).lower()
                if word in article_flags:
                    last_subject = True
                    last_subj_ind = ind

    relevant_lines = relevant_lines[:-last_subj_ind]
    seen_lines = []
    for sentence in relevant_lines:
        if sentence in seen_lines:
            continue
        seen_lines.append(sentence.strip())
    #print(seen_lines, relevant_lines)

    return ' '.join(seen_lines)



In [100]:
cleaned_content = []
seen_urls = []


In [104]:
for i in range(article_df.shape[0]):
    row = article_df.iloc[i]
    content = row['content']
    title = row['title']
    #print(title, content)
    url = row['url']
    if url not in seen_urls:
        try:
            cleaned_text = filter_text(title, content)
            if cleaned_text:
                cleaned_content.append(cleaned_text)
            else:
                cleaned_content.append('Error')
        except:
            cleaned_content.append('Error')
        seen_urls.append(url)

In [112]:
urls = article_df['url'].values.tolist()
article_unique = article_df.drop_duplicates(subset=['url'],keep='first')
article_unique['cleaned_text'] = cleaned_content
article_unique['word_count'] = article_unique['cleaned_text'].apply(lambda x: len(x.split()))
clean_text_df = article_unique.loc[article_unique['word_count'] > 10]
clean_text_df.to_csv('data/Clean_article_text.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  article_unique['cleaned_text'] = cleaned_content
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  article_unique['word_count'] = article_unique['cleaned_text'].apply(lambda x: len(x.split()))
