# File Description:
##### • This file holds the code to pre-process the dataset, in particular the text of the article column.

# Importing necessary libraries

In [1]:
import polars, re

In [11]:
# Read the raw news CSV (relative path) into a polars DataFrame.
dataset = polars.read_csv('../../datasets/News dataset.csv')
# Preview the first rows to confirm the columns parsed as expected.
dataset.head()

article,highlights,id
str,str,str
"""LONDON, England (Reuters) -- H…","""Harry Potter star Daniel Radcl…","""42c027e4ff9730fbb3de84c1af0d2c…"
"""Editor's note: In our Behind t…","""Mentally ill inmates in Miami …","""ee8871b15c50d0db17b0179a6d2bea…"
"""MINNEAPOLIS, Minnesota (CNN) -…","""NEW: ""I thought I was going to…","""06352019a19ae31e527f37f7571c6d…"
"""WASHINGTON (CNN) -- Doctors re…","""Five small polyps found during…","""24521a2abb2e1f5e34e6824e0f9e56…"
"""(CNN) -- The National Footbal…","""NEW: NFL chief, Atlanta Falcon…","""7fe70cc8b12fab2d0a258fababf7d9…"


In [12]:
# (row count, column count) of the loaded frame.
dataset.shape

(311971, 3)

# Objectives:
<ol type='a'>
    <li>Checking for missing values</li>
    <li>Removing unwanted columns</li>
    <li>Removing any unwanted spaces from the article column</li>
    <li>Replacing curly apostrophes and special quotes with plain ASCII apostrophes (e.g., ’ → ')</li>
    <li>Removing apostrophes if necessary for normalization</li>
    <li>Removing author name and publishing/update date as they are noisy data</li>
    <li>Removing legal disclaimers, copyright notices, and redistribution warnings</li>
    <li>Removing attribution or contribution credits (e.g., “CNN’s John Doe contributed to this report.”)</li>
    <li>Removing email addresses and social media handles (e.g., @CNN)</li>
    <li>Removing hyperlinks and web URLs from the text</li>
    <li>Removing source tags and image credits like (Reuters), (AP), (Getty)</li>
    <li>Removing redundant phrases like “Scroll down for video”, “Read more at”, “Related Articles”</li>
    <li>Removing text enclosed in square or curly brackets (e.g., [Editor’s note])</li>
    <li>Removing HTML tags if present</li>
    <li>Normalizing multiple dots, pipes, and extra spaces or newlines</li>
</ol>



# Checking for missing values

In [13]:
# Number of null entries per column.
dataset.null_count()

article,highlights,id
u32,u32,u32
0,0,0


# Removing unwanted columns

In [14]:
# Drop the 'id' column, keeping only 'article' and 'highlights'.
# NOTE(review): this rebinds `dataset`, so re-running the cell after 'id' is
# already gone will likely raise a column-not-found error — confirm, and
# re-run from the load cell if that happens.
dataset = dataset.drop('id')
dataset.head()

article,highlights
str,str
"""LONDON, England (Reuters) -- H…","""Harry Potter star Daniel Radcl…"
"""Editor's note: In our Behind t…","""Mentally ill inmates in Miami …"
"""MINNEAPOLIS, Minnesota (CNN) -…","""NEW: ""I thought I was going to…"
"""WASHINGTON (CNN) -- Doctors re…","""Five small polyps found during…"
"""(CNN) -- The National Footbal…","""NEW: NFL chief, Atlanta Falcon…"


# Performing Objectives (from c to o)

In [24]:
# Typographic quotes mapped to their plain ASCII equivalents.
_QUOTE_TABLE = str.maketrans({
    '\u2019': "'",  # right single quote / curly apostrophe
    '\u2018': "'",  # left single quote
    '\u201c': '"',  # left double quote
    '\u201d': '"',  # right double quote
})

# Ordered (compiled pattern, replacement) pairs applied by clean_article.
# Order matters: markup is stripped before URLs (otherwise the URL rule tears
# `<a href="http...">` apart and leaves a dangling `<a href="`), and whitespace
# is normalized last so gaps left by earlier removals are collapsed.
_CLEANING_RULES = tuple(
    (re.compile(pattern, flags), replacement)
    for pattern, flags, replacement in (
        # HTML tags. A tag name must follow '<' so comparisons such as
        # "x < 5 and y > 3" are left alone.
        (r'</?[A-Za-z][^<>]*>', 0, ''),

        # "By Author" bylines. Only at the start of a line or right after a
        # pipe, and never spanning a line break, so a mid-sentence "by ..."
        # is not swallowed.
        (r'(?:^|(?<=\|))[ \t]*by[ \t]+[\w \t.,]+(?:\r?\n|\|)',
         re.IGNORECASE | re.MULTILINE, ''),

        # PUBLISHED/UPDATED timestamps. The keyword must be followed by ':'
        # or 'at' so ordinary prose ("was published in 1999") and the legal
        # disclaimer handled below are not hit. A stray " ." separator
        # directly after the colon is tolerated.
        (r'\b(?:PUBLISHED|UPDATED|Last updated)(?:[ \t]*:|[ \t]+at\b)'
         r'(?:[ \t]*\.(?=[ \t]))?.*?(?:\n|\.|$)',
         re.IGNORECASE, ''),

        # Legal disclaimers, copyright notices and redistribution warnings.
        (r'(?:e-?mail to a friend|all rights reserved'
         r'|this material may not be published|copyright \d{4}|©\s*\d{4})'
         r'.*?(?:$|\n|\.)',
         re.IGNORECASE, ''),

        # Email addresses, then bare @handles.
        (r'\S+@\S+', 0, ''),
        (r'@\w+', 0, ''),

        # URLs.
        (r'http\S+', 0, ''),
        (r'www\.\S+', 0, ''),

        # Source tags and image credits.
        # NOTE(review): "(CNN)" is not removed here; add it if that tag
        # should be stripped as well.
        (r'\(Reuters\)|\(AP\)|\(Getty.*?\)', 0, ''),
        (r'Scroll down for .*?(?:\.|\n)', re.IGNORECASE, ''),

        # Redundant navigation phrases. Bounded at the end of the sentence or
        # line so the remainder of a single-line article is not deleted.
        (r'\b(?:Read more at|Full story|Related Articles)\b.*?(?:\n|\.|$)',
         re.IGNORECASE, ''),

        # Attribution/contribution credits: remove the whole credit sentence,
        # not just the trigger phrase. The lookbehind restricts attempts to
        # sentence starts, which keeps the scan linear.
        (r'(?:\A|(?<=[.\n]))[^.\n]*\bcontributed to this report\b[^.\n]*\.?',
         re.IGNORECASE, ''),
        (r'\bwith (?:additional )?reporting by\b[^.\n]*\.?',
         re.IGNORECASE, ''),
        (r'\breporting by\b[^.\n]*?\bediting by\b[^.\n]*\.?',
         re.IGNORECASE, ''),

        # Text in square or curly brackets.
        (r'\[.*?\]|\{.*?\}', 0, ''),

        # Punctuation and whitespace normalization.
        (r'\.{2,}', 0, '.'),
        (r'\s*\|\s*', 0, ' '),
        (r'\s+', 0, ' '),
    )
)


def clean_article(text):
    """Strip boilerplate and noise from a single news article.

    Curly quotes are converted to ASCII quotes, then the rules in
    ``_CLEANING_RULES`` are applied in order (markup, bylines, timestamps,
    legal notices, emails/handles, URLs, source tags, navigation phrases,
    contribution credits, bracketed text), and finally runs of dots, pipes
    and whitespace are collapsed.

    Parameters
    ----------
    text : str or None
        Raw article text.

    Returns
    -------
    str or None
        The cleaned text with surrounding whitespace removed. ``None`` and
        the empty string are returned unchanged.
    """
    if not text:
        return text

    text = text.translate(_QUOTE_TABLE)
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)

    return text.strip()

In [32]:
# Run every article through clean_article and store the result back in the
# 'article' column; 'highlights' is left as it is.
dataset = dataset.with_columns(
    polars.col('article')
    .map_elements(clean_article, return_dtype=polars.Utf8)
    .alias('article')
)

dataset.head()

article,highlights
str,str
"""LONDON, England -- Harry Potte…","""Harry Potter star Daniel Radcl…"
"""Editor's note: In our Behind t…","""Mentally ill inmates in Miami …"
"""MINNEAPOLIS, Minnesota (CNN) -…","""NEW: ""I thought I was going to…"
"""WASHINGTON (CNN) -- Doctors re…","""Five small polyps found during…"
"""(CNN) -- The National Football…","""NEW: NFL chief, Atlanta Falcon…"


In [33]:
# Write the cleaned frame to its own CSV, separate from the raw input file.
dataset.write_csv('../../datasets/Cleaned News dataset.csv')