## Data Cleaning 

In [None]:
# Data Manipulation
import pandas as pd
import re
from datetime import datetime
# Data Visualization
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
# Statistical Modeling
import statsmodels.api as sm
# Stopwords and lemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Natural Language Processing (NLP)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
# Web Scraping
from bs4 import BeautifulSoup
# NLP with Pre-trained Models
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Location of the Excel workbook, relative to the notebook's working directory
file_path = "data/Headlines.xlsx"
# Load the Excel file into a pandas DataFrame
# (with no sheet_name given, pandas reads the first sheet)
df = pd.read_excel(file_path)
# Display the first few rows
df.head()

In [None]:
# Initialize stop words and lemmatizer
# English stop words, held in a set for fast membership tests in clean_text
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance, reused for every word that is cleaned
lemmatizer = WordNetLemmatizer()

In [None]:
# Function to clean text
def clean_text(text):
    """Normalize a raw headline into a lowercase, lemmatized string.

    The text is stripped of HTML markup, every character that is not an
    ASCII letter or whitespace is replaced by a space, the result is
    lowercased, English stop words are dropped and the remaining words are
    lemmatized with the module-level WordNet lemmatizer.

    Args:
        text: Raw headline markup/text. Values that are not ``str``/``bytes``
            (for example the float NaN pandas uses for an empty cell) are
            treated as empty input instead of raising.

    Returns:
        The cleaned words joined by single spaces; an empty string when
        nothing survives cleaning or the input was not text.
    """
    # Missing cells reach .apply() as NaN/None; BeautifulSoup cannot parse
    # those, so short-circuit rather than crash the whole column.
    if not isinstance(text, (str, bytes)):
        return ""

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Replace special characters and numbers with a space (not the empty
    # string) so words joined only by punctuation, e.g. "well-known" or
    # "stocks,bonds", are split apart instead of being fused into one token.
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    # Convert to lowercase (stop_words entries are lowercase)
    text = text.lower()

    # Remove stop words and lemmatize; split() also collapses the extra
    # whitespace introduced by the substitution above.
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    # Join words back into a single string
    return " ".join(words)

# Apply the cleaning function to every value of the 'title' column and store
# the result in a new 'cleaned_title' column (the 'title' column is not modified)
df['cleaned_title'] = df['title'].apply(clean_text)

In [None]:
# Break every cleaned headline into a list of sentences with NLTK, then turn
# each sentence into its own list of word tokens (so 'Tokens' is a list of lists).
# NOTE(review): 'cleaned_title' has already had all non-letter characters
# removed by clean_text, so sent_tokenize has no sentence punctuation to
# split on here -- confirm that is the intent.
df['Sentences'] = df['cleaned_title'].apply(sent_tokenize)
df['Tokens'] = df['Sentences'].apply(
    lambda sentence_list: list(map(word_tokenize, sentence_list))
)

# Preview the DataFrame with the new 'Sentences' and 'Tokens' columns
df.head()