In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from wordcloud import WordCloud
import nltk
import spacy
from nltk.corpus import stopwords
from collections import Counter
import seaborn as sns
import pickle



Topic Modeling
Objective: Discover the underlying themes or topics present in the news articles.
Approach:
Apply unsupervised learning techniques like Latent Dirichlet Allocation (LDA) or Non-Negative Matrix Factorization (NMF) to uncover topics.

In [5]:
# Make sure the NLTK stop-word corpus is available locally, then keep the
# English entries in a set so membership checks are constant-time.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaisy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Path of the training CSV, resolved relative to the notebook's working directory.
file_path = "train.csv"

# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)


In [22]:
# Preview the first five rows (the pandas default) to check the loaded columns.
df.head(n=5)

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [24]:
# Flag rows whose 'article' or 'highlights' value is missing (NaN/None) or
# consists only of whitespace. Missing values are mapped to '' first: with a
# bare `.str.strip() == ''` the comparison is False for NaN, so rows with
# missing text would not be counted as empty.
empty_article_rows = df['article'].fillna('').str.strip() == ''
empty_highlights_rows = df['highlights'].fillna('').str.strip() == ''

# A row counts as empty when either of the two columns is empty
empty_rows = empty_article_rows | empty_highlights_rows

# Report how many such rows were found
print(f"Number of empty rows: {empty_rows.sum()}")

Number of empty rows: 0


In [10]:
# Instantiate spaCy's 'en_core_web_sm' pipeline once; it is kept in the
# module-level name `nlp` for later cells.
nlp = spacy.load(name='en_core_web_sm')

In [11]:
# Initial Exploration and Cleaning
# (original author's note: "5hrs 29mins" -- presumably the wall-clock time of
# a full cleaning run; TODO confirm)
def clean_text(text):
    """Normalise one document: lowercase, keep letters only, drop stop words, lemmatize.

    Relies on two module-level names defined in earlier cells: ``stop_words``
    (a set of English stop words) and ``nlp`` (the loaded spaCy pipeline).

    Args:
        text: Raw document text. Any non-string value (for example the NaN
            pandas produces for a missing CSV cell) is treated as empty.

    Returns:
        str: The lemmas of the remaining words joined by single spaces, or
        ``''`` when the input is not a string or nothing survives filtering.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Replace every run of non-letter characters (digits, punctuation,
    # whitespace) with a single space. Substituting a space instead of deleting
    # keeps neighbouring words apart ("well-known" -> "well known") and splits
    # contractions ("don't" -> "don t") into fragments that the NLTK English
    # stop-word list includes, so they are filtered below.
    text = re.sub(r'[^a-z]+', ' ', text)

    # Tokenize on whitespace and remove stopwords
    words = [word for word in text.split() if word not in stop_words]
    if not words:
        # Nothing left to lemmatize; skip the spaCy call entirely.
        return ''
    text = ' '.join(words)

    # Lemmatization. Only token lemmas are read, so the dependency parser and
    # named-entity recogniser are skipped for this call; the shared `nlp`
    # object itself is left unchanged for other users.
    doc = nlp(text, disable=['parser', 'ner'])
    lemmatized_text = ' '.join([token.lemma_ for token in doc])

    return lemmatized_text

# Run the cleaning function over both text columns, overwriting them in place.
df['article'] = df['article'].apply(clean_text)
df['highlights'] = df['highlights'].apply(clean_text)

# Tokenization and Padding
# The tokenizer is configured with num_words=10000 and '<OOV>' as the
# out-of-vocabulary token, and is fitted on the cleaned highlights.
# NOTE(review): the articles are not part of the fit -- confirm this is intended.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10_000)
tokenizer.fit_on_texts(df['highlights'])

# Cleaned texts as plain Python lists: articles are the inputs, highlights the targets.
X, y = df['article'].to_list(), df['highlights'].to_list()
