# Import the necessary libraries and data:

In [1]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import nltk
import sys
import re

# Make sure the NLTK corpora used by this notebook are installed. Each corpus
# is downloaded only when it cannot be found locally, so re-running the cell is
# cheap, and a fresh environment does not fail later with a LookupError when
# the stopword list or the WordNet lemmatizer is first used.
for resource_path, package_id in (
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('corpora/omw-1.4', 'omw-1.4'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id, quiet=True)


# Put the parent directory on the import path; the guard keeps repeated runs
# of this cell from appending the same entry again.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
# Read the raw news-classification dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/MN-DS-news-classification.csv")

# Find missing values:

In [3]:
# Print each column's dtype and non-null count to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10917 entries, 0 to 10916
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   data_id           10917 non-null  int64 
 1   id                10917 non-null  object
 2   date              10917 non-null  object
 3   source            10917 non-null  object
 4   title             10917 non-null  object
 5   content           10917 non-null  object
 6   author            7605 non-null   object
 7   url               10917 non-null  object
 8   published         10917 non-null  object
 9   published_utc     10917 non-null  int64 
 10  collection_utc    10917 non-null  int64 
 11  category_level_1  10917 non-null  object
 12  category_level_2  10917 non-null  object
dtypes: int64(3), object(10)
memory usage: 1.1+ MB


In [4]:
# Count the missing values in every column
df.isna().sum()

data_id                0
id                     0
date                   0
source                 0
title                  0
content                0
author              3312
url                    0
published              0
published_utc          0
collection_utc         0
category_level_1       0
category_level_2       0
dtype: int64

# Feature Selection:

### Strategy:
We will select only the columns relevant for text classification and remove metadata or unbalanced features.

Columns:
- `author` the author's name doesn't define the subject of the news  (REMOVE)
- `data_id, id` are just unique identifiers  (REMOVE)
- `date, published, published_utc, collection_utc` are temporal data. They don't help us classify the text (REMOVE)
- `url` Does not contain useful information for the model  (REMOVE)
- `source` the model reads the text to make a prediction, not its source (REMOVE)
- `category_level_2` has too many classes: Level 1 has 17 categories (manageable, robust), whereas Level 2 has 109 sub-categories. It is much harder for a model to pick the right class out of 109 than out of 17. (REMOVE)

In [5]:
# Keep only the two text fields and the level-1 label; .copy() gives a frame
# that can be modified without touching df.
df_final = df.loc[:, ['title', 'content', 'category_level_1']].copy()

In [6]:
# Drop rows that repeat an earlier (title, content) pair and report the effect
initial_rows = len(df_final)
df_final = df_final.drop_duplicates(subset=['title', 'content'])
rows_after_dedup = len(df_final)

print(
    f"Initial rows: {initial_rows}",
    f"Duplicates removed: {initial_rows - rows_after_dedup}",
    f"Rows remaining for training: {rows_after_dedup}",
    sep="\n",
)

# Show the ten most frequent target labels and their counts
print("\nTarget Class Distribution (Top 10):")
print(df_final['category_level_1'].value_counts().iloc[:10])

Initial rows: 10917
Duplicates removed: 461
Rows remaining for training: 10456

Target Class Distribution (Top 10):
category_level_1
society                    1081
politics                    887
sport                       882
conflict, war and peace     793
science and technology      766
religion and belief         755
health                      686
labour                      587
environment                 579
human interest              578
Name: count, dtype: int64


# Text Preprocessing:
We apply advanced text cleaning to prepare the data for the TF-IDF vectorizer.

**Techniques applied:**
1.  **Lowercasing:** To ensure consistency (e.g., "Apple" == "apple").
2.  **Regex Cleaning:** Keeping only letters (a-z), removing numbers and punctuation.
3.  **Stopwords Removal:** Removing common words (e.g., "the", "is") that add noise.
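4.  **Lemmatization:** Reducing words to their dictionary form (e.g., "articles" -> "article") to reduce dimensionality and help limit overfitting. Note that `WordNetLemmatizer.lemmatize` treats every token as a noun unless a part-of-speech tag is passed, so verb forms such as "running" are left unchanged by this pipeline.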

In [7]:
# Build the shared NLP helpers once so the cleaning function can reuse them:
# a set of English stopwords (fast membership tests) and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
def advanced_clean_text(text, stopword_set=None, lemmatize=None):
    """
    Normalize one document into a space-separated string of lemmatized tokens.

    The text is lowercased, every character other than a-z and whitespace is
    replaced by a space, the result is split on whitespace, stopwords are
    dropped, and each remaining token is lemmatized.

    Args:
        text: Raw document. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as an empty document.
        stopword_set: Optional container of tokens to discard. Defaults to the
            notebook-level ``stop_words``.
        lemmatize: Optional callable mapping a token to its lemma. Defaults to
            ``lemmatizer.lemmatize`` of the notebook-level ``lemmatizer``.

    Returns:
        str: The cleaned tokens joined by single spaces ("" if none remain).
    """

    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" and leak into the vocabulary. (NaN is the only float
    # that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Fall back to the notebook-level tools only when no override is given
    if stopword_set is None:
        stopword_set = stop_words
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize

    text = str(text).lower()

    # Keep only letters (digits and punctuation become spaces so that the
    # words around them stay separated)
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Tokenize on whitespace, drop stopwords, lemmatize what is left
    clean_words = [
        lemmatize(word)
        for word in text.split()
        if word not in stopword_set
    ]

    # Join back into a single string
    return ' '.join(clean_words)

In [9]:
# Merge headline and body so the model sees one document per article
df_final['raw_text'] = df_final['title'].str.cat(df_final['content'], sep=" ")

# Clean every document; the progress message is printed first because this
# step may take a moment.
print("Processing text (Lemmatization & Stopwords)...")
df_final['clean_text'] = df_final['raw_text'].apply(advanced_clean_text)

Processing text (Lemmatization & Stopwords)...


# Save the cleaned data:

In [10]:
# Write the title, the cleaned text and the label to CSV, without the index
df_final[['title', 'clean_text', 'category_level_1']].to_csv(
    path_or_buf="../data/news_data_preprocessed_final.csv", index=False
)