In [66]:

import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
import pandas as pd
def fetch_headlines_for_date(date, timeout=30, trailing_to_skip=26):
    """Scrape the heading texts from one day's NYT "Today's Headlines" page.

    Parameters
    ----------
    date : str
        Date fragment interpolated into the URL path. The driver loop in this
        notebook passes it formatted as ``'%Y/%m/%d'``.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a single stalled
        connection would block the whole multi-year scrape indefinitely.
    trailing_to_skip : int, optional
        How many heading texts to drop from the end of the page. The default
        of 26 is the value that was previously hard-coded (presumably it
        trims footer/navigation headings -- TODO confirm against a live page).
        Values <= 0 keep every heading.

    Returns
    -------
    list of str
        Non-empty, whitespace-stripped texts of every ``<h2>``/``<h3>`` tag in
        document order, minus the trailing ``trailing_to_skip`` entries. Empty
        when the page has no more headings than are being skipped.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or (via ``raise_for_status``) any
        4xx/5xx response.
    """
    url = f'https://www.nytimes.com/issue/todaysheadlines/{date}/todays-headlines'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Surface HTTP errors to the caller

    soup = BeautifulSoup(response.content, 'html.parser')

    # Headlines on this page live in h2/h3 tags; drop tags with no text
    texts = (tag.get_text(strip=True) for tag in soup.find_all(['h2', 'h3']))
    headlines = [text for text in texts if text]

    # Compute an explicit end index: a plain ``[:-n]`` slice would return an
    # empty list for n == 0 instead of the full list.
    keep = max(len(headlines) - max(trailing_to_skip, 0), 0)
    return headlines[:keep]

# Scrape window: every calendar day from 2020-01-01 through 2023-12-31
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 12, 31)
total_days = (end_date - start_date).days + 1  # inclusive of both endpoints

# One output row per (date, headline) pair
with open('nytimes_headlines_2020_to_2023.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['date', 'headline'])
    writer.writeheader()

    for day_offset in range(total_days):
        current_date = start_date + timedelta(days=day_offset)
        date_str = current_date.strftime('%Y/%m/%d')
        print(f"Fetching headlines for {date_str}")

        # Best-effort: a failed day is reported and skipped so the run continues
        try:
            for headline in fetch_headlines_for_date(date_str):
                writer.writerow({'date': date_str, 'headline': headline})
        except Exception as e:
            print(f"Failed to fetch headlines for {date_str}: {e}")

print("Headlines have been saved to nytimes_headlines_2020_to_2023.csv")


Fetching headlines for 2020/01/01
Fetching headlines for 2020/01/02
Fetching headlines for 2020/01/03
Fetching headlines for 2020/01/04
Fetching headlines for 2020/01/05
Fetching headlines for 2020/01/06
Fetching headlines for 2020/01/07
Fetching headlines for 2020/01/08
Fetching headlines for 2020/01/09
Fetching headlines for 2020/01/10
Fetching headlines for 2020/01/11
Fetching headlines for 2020/01/12
Fetching headlines for 2020/01/13
Fetching headlines for 2020/01/14
Fetching headlines for 2020/01/15
Fetching headlines for 2020/01/16
Fetching headlines for 2020/01/17
Fetching headlines for 2020/01/18
Fetching headlines for 2020/01/19
Fetching headlines for 2020/01/20
Fetching headlines for 2020/01/21
Fetching headlines for 2020/01/22
Fetching headlines for 2020/01/23
Fetching headlines for 2020/01/24
Fetching headlines for 2020/01/25
Fetching headlines for 2020/01/26
Fetching headlines for 2020/01/27
Fetching headlines for 2020/01/28
Fetching headlines for 2020/01/29
Fetching headl

Fetching headlines for 2020/08/29
Fetching headlines for 2020/08/30
Fetching headlines for 2020/08/31
Fetching headlines for 2020/09/01
Fetching headlines for 2020/09/02
Fetching headlines for 2020/09/03
Fetching headlines for 2020/09/04
Fetching headlines for 2020/09/05
Fetching headlines for 2020/09/06
Fetching headlines for 2020/09/07
Fetching headlines for 2020/09/08
Fetching headlines for 2020/09/09
Fetching headlines for 2020/09/10
Fetching headlines for 2020/09/11
Fetching headlines for 2020/09/12
Fetching headlines for 2020/09/13
Fetching headlines for 2020/09/14
Fetching headlines for 2020/09/15
Fetching headlines for 2020/09/16
Fetching headlines for 2020/09/17
Fetching headlines for 2020/09/18
Fetching headlines for 2020/09/19
Fetching headlines for 2020/09/20
Fetching headlines for 2020/09/21
Fetching headlines for 2020/09/22
Fetching headlines for 2020/09/23
Fetching headlines for 2020/09/24
Fetching headlines for 2020/09/25
Fetching headlines for 2020/09/26
Fetching headl

Fetching headlines for 2021/04/27
Fetching headlines for 2021/04/28
Fetching headlines for 2021/04/29
Fetching headlines for 2021/04/30
Fetching headlines for 2021/05/01
Fetching headlines for 2021/05/02
Fetching headlines for 2021/05/03
Fetching headlines for 2021/05/04
Fetching headlines for 2021/05/05
Fetching headlines for 2021/05/06
Fetching headlines for 2021/05/07
Fetching headlines for 2021/05/08
Fetching headlines for 2021/05/09
Fetching headlines for 2021/05/10
Fetching headlines for 2021/05/11
Fetching headlines for 2021/05/12
Fetching headlines for 2021/05/13
Fetching headlines for 2021/05/14
Fetching headlines for 2021/05/15
Fetching headlines for 2021/05/16
Fetching headlines for 2021/05/17
Fetching headlines for 2021/05/18
Fetching headlines for 2021/05/19
Fetching headlines for 2021/05/20
Fetching headlines for 2021/05/21
Fetching headlines for 2021/05/22
Fetching headlines for 2021/05/23
Fetching headlines for 2021/05/24
Fetching headlines for 2021/05/25
Fetching headl

Fetching headlines for 2021/12/24
Fetching headlines for 2021/12/25
Fetching headlines for 2021/12/26
Fetching headlines for 2021/12/27
Fetching headlines for 2021/12/28
Fetching headlines for 2021/12/29
Fetching headlines for 2021/12/30
Fetching headlines for 2021/12/31
Fetching headlines for 2022/01/01
Fetching headlines for 2022/01/02
Fetching headlines for 2022/01/03
Fetching headlines for 2022/01/04
Fetching headlines for 2022/01/05
Fetching headlines for 2022/01/06
Fetching headlines for 2022/01/07
Fetching headlines for 2022/01/08
Fetching headlines for 2022/01/09
Fetching headlines for 2022/01/10
Fetching headlines for 2022/01/11
Fetching headlines for 2022/01/12
Fetching headlines for 2022/01/13
Fetching headlines for 2022/01/14
Fetching headlines for 2022/01/15
Fetching headlines for 2022/01/16
Fetching headlines for 2022/01/17
Fetching headlines for 2022/01/18
Fetching headlines for 2022/01/19
Fetching headlines for 2022/01/20
Fetching headlines for 2022/01/21
Fetching headl

Fetching headlines for 2022/08/22
Fetching headlines for 2022/08/23
Fetching headlines for 2022/08/24
Fetching headlines for 2022/08/25
Fetching headlines for 2022/08/26
Fetching headlines for 2022/08/27
Fetching headlines for 2022/08/28
Fetching headlines for 2022/08/29
Fetching headlines for 2022/08/30
Fetching headlines for 2022/08/31
Fetching headlines for 2022/09/01
Fetching headlines for 2022/09/02
Fetching headlines for 2022/09/03
Fetching headlines for 2022/09/04
Fetching headlines for 2022/09/05
Fetching headlines for 2022/09/06
Fetching headlines for 2022/09/07
Fetching headlines for 2022/09/08
Fetching headlines for 2022/09/09
Fetching headlines for 2022/09/10
Fetching headlines for 2022/09/11
Fetching headlines for 2022/09/12
Fetching headlines for 2022/09/13
Fetching headlines for 2022/09/14
Fetching headlines for 2022/09/15
Fetching headlines for 2022/09/16
Fetching headlines for 2022/09/17
Fetching headlines for 2022/09/18
Fetching headlines for 2022/09/19
Fetching headl

Fetching headlines for 2023/04/20
Fetching headlines for 2023/04/21
Fetching headlines for 2023/04/22
Fetching headlines for 2023/04/23
Fetching headlines for 2023/04/24
Fetching headlines for 2023/04/25
Fetching headlines for 2023/04/26
Fetching headlines for 2023/04/27
Fetching headlines for 2023/04/28
Fetching headlines for 2023/04/29
Fetching headlines for 2023/04/30
Fetching headlines for 2023/05/01
Fetching headlines for 2023/05/02
Fetching headlines for 2023/05/03
Fetching headlines for 2023/05/04
Fetching headlines for 2023/05/05
Fetching headlines for 2023/05/06
Fetching headlines for 2023/05/07
Fetching headlines for 2023/05/08
Fetching headlines for 2023/05/09
Fetching headlines for 2023/05/10
Fetching headlines for 2023/05/11
Fetching headlines for 2023/05/12
Fetching headlines for 2023/05/13
Fetching headlines for 2023/05/14
Fetching headlines for 2023/05/15
Fetching headlines for 2023/05/16
Fetching headlines for 2023/05/17
Fetching headlines for 2023/05/18
Fetching headl

Fetching headlines for 2023/12/13
Fetching headlines for 2023/12/14
Fetching headlines for 2023/12/15
Fetching headlines for 2023/12/16
Fetching headlines for 2023/12/17
Fetching headlines for 2023/12/18
Fetching headlines for 2023/12/19
Fetching headlines for 2023/12/20
Fetching headlines for 2023/12/21
Fetching headlines for 2023/12/22
Fetching headlines for 2023/12/23
Fetching headlines for 2023/12/24
Fetching headlines for 2023/12/25
Fetching headlines for 2023/12/26
Fetching headlines for 2023/12/27
Fetching headlines for 2023/12/28
Fetching headlines for 2023/12/29
Fetching headlines for 2023/12/30
Fetching headlines for 2023/12/31
Headlines have been saved to nytimes_headlines_2020_to_2023.csv


In [102]:
from collections import defaultdict

def merge_sentences_by_date(input_csv, output_csv, text_column=None):
    """Collapse a headlines CSV to one row per date.

    All text values that share a ``date`` are joined with a single space, in
    file order, and written under the ``merged_headlines`` column.

    Parameters
    ----------
    input_csv : str
        Path of a CSV with a ``date`` column and a text column.
    output_csv : str
        Path of the CSV to write. May equal ``input_csv``: the input is read
        completely and closed before the output is opened.
    text_column : str, optional
        Name of the text column to merge. When omitted, ``merged_headlines``
        is used if the header has it, otherwise ``headline`` (the column the
        scraping cell writes). This lets the function run both on the raw
        scrape and on an already-merged file.

    Raises
    ------
    KeyError
        If a data row lacks the ``date`` column or the resolved text column.
    """
    date_sentences = defaultdict(list)

    with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)

        if text_column is None:
            header = reader.fieldnames or []
            # Fall back to the historical name so a missing column still
            # surfaces as the same KeyError as before.
            text_column = next(
                (name for name in ('merged_headlines', 'headline') if name in header),
                'merged_headlines',
            )

        for row in reader:
            date = row['date']
            sentence = row[text_column]
            if sentence is None:
                # Short row: DictReader fills missing fields with None,
                # which ' '.join() cannot handle.
                continue
            date_sentences[date].append(sentence)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['date', 'merged_headlines'])
        writer.writeheader()
        writer.writerows(
            {'date': date, 'merged_headlines': ' '.join(sentences)}
            for date, sentences in date_sentences.items()
        )

# Example usage
# NOTE: input and output are the same path, so this cell replaces the file's
# contents with the one-row-per-date version. Later cells read the merged
# data from this same path.
input_csv = 'nytimes_headlines_2020_to_2023.csv'
output_csv = 'nytimes_headlines_2020_to_2023.csv'

# Called once: a second pass over an already-merged file only rewrites the
# same rows, so the duplicate call was redundant.
merge_sentences_by_date(input_csv, output_csv)

print(f"Merged headlines have been saved to {output_csv}")

Merged headlines have been saved to nytimes_headlines_2020_to_2023.csv


In [103]:
# Load the CSV at input_csv and display it as the cell's output
pd.read_csv(input_csv)

Unnamed: 0,date,merged_headlines
0,2020/01/01,Top News Protesters Attack U.S. Embassy in Ira...
1,2020/01/02,Top News Pro-Iranian Protesters End Siege of U...
2,2020/01/03,Top News U.S. Strike in Iraq Kills Qassim Sule...
3,2020/01/04,Top News U.S. and Iran Exchange More Threats a...
4,2020/01/05,"Top News As Tensions With Iran Escalated, Trum..."
...,...,...
1455,2023/12/27,Top News The Covenant Parents Aren’t Going to ...
1456,2023/12/28,Top News Skepticism Grows Over Israel’s Abilit...
1457,2023/12/29,Top News Maine Joins Colorado in Finding Trump...
1458,2023/12/30,Top News Where Was the Israeli Military? How A...


In [104]:
# Load the headlines CSV by its literal path and display it as the cell's output
pd.read_csv('nytimes_headlines_2020_to_2023.csv')

Unnamed: 0,date,merged_headlines
0,2020/01/01,Top News Protesters Attack U.S. Embassy in Ira...
1,2020/01/02,Top News Pro-Iranian Protesters End Siege of U...
2,2020/01/03,Top News U.S. Strike in Iraq Kills Qassim Sule...
3,2020/01/04,Top News U.S. and Iran Exchange More Threats a...
4,2020/01/05,"Top News As Tensions With Iran Escalated, Trum..."
...,...,...
1455,2023/12/27,Top News The Covenant Parents Aren’t Going to ...
1456,2023/12/28,Top News Skepticism Grows Over Israel’s Abilit...
1457,2023/12/29,Top News Maine Joins Colorado in Finding Trump...
1458,2023/12/30,Top News Where Was the Israeli Military? How A...


In [105]:
def adjust_weekend_headlines(input_csv, output_csv):
    """Fold Saturday/Sunday headlines into the following Monday.

    Reads a CSV with ``date`` and ``merged_headlines`` columns, removes the
    weekend rows, and appends their text (Saturday first, then Sunday, each
    separated by one space) to the row of the Monday that follows. If that
    Monday has no row of its own, one is created holding just the weekend
    text. The result is sorted by date and written to ``output_csv`` without
    an index column.

    Parameters
    ----------
    input_csv : str
        Path of the per-date headlines CSV.
    output_csv : str
        Path of the weekday-only CSV to write.

    Notes
    -----
    Weekend rows whose ``merged_headlines`` is missing are ignored rather
    than raising. Columns other than ``date``/``merged_headlines`` are kept
    for existing rows and left empty on newly created Monday rows.
    """
    data = pd.read_csv(input_csv, parse_dates=['date'])

    # parse_dates can silently leave the column as object; force datetime
    data['date'] = pd.to_datetime(data['date'])
    data = data.sort_values(by='date', kind='stable')

    is_weekend = data['date'].dt.weekday.isin([5, 6])

    # Text carried forward, keyed by the Monday it belongs to.
    weekend = data.loc[is_weekend, ['date', 'merged_headlines']].dropna(
        subset=['merged_headlines']
    )
    days_ahead = 7 - weekend['date'].dt.weekday  # Saturday (5) -> 2, Sunday (6) -> 1
    target_monday = weekend['date'] + pd.to_timedelta(days_ahead, unit='D')
    weekend_text = (
        weekend['merged_headlines'].astype(str).groupby(target_monday).agg(' '.join)
    )

    # Explicit copy so the assignment below never touches a view of `data`
    weekday_data = data.loc[~is_weekend].copy()

    # Append the carried text to Mondays that already have a row
    carried = weekday_data['date'].map(weekend_text)
    has_carry = carried.notna()
    if has_carry.any():
        own_text = weekday_data.loc[has_carry, 'merged_headlines']
        extra_text = carried[has_carry]
        # A Monday with no text of its own takes the weekend text as-is
        weekday_data.loc[has_carry, 'merged_headlines'] = (
            own_text + ' ' + extra_text
        ).where(own_text.notna(), extra_text)

    # Create rows (in one concat) for Mondays absent from the input
    orphaned = weekend_text[~weekend_text.index.isin(weekday_data['date'])]
    if not orphaned.empty:
        new_rows = pd.DataFrame(
            {'date': orphaned.index, 'merged_headlines': orphaned.to_numpy()}
        )
        weekday_data = pd.concat([weekday_data, new_rows], ignore_index=True)

    weekday_data = weekday_data.sort_values(by='date', kind='stable').reset_index(drop=True)
    weekday_data.to_csv(output_csv, index=False, encoding='utf-8')

In [106]:
# Source: the per-date merged headlines; destination: the weekday-only file
input_csv1 = 'nytimes_headlines_2020_to_2023.csv'
output_csv1 = 'final_headlines_weekday.csv'

adjust_weekend_headlines(input_csv1, output_csv1)
print(f"Weekend headlines have been adjusted and saved to {output_csv1}")

Weekend headlines have been adjusted and saved to final_headlines_weekday.csv


In [107]:
# Load the CSV that adjust_weekend_headlines wrote to output_csv1 and display it
pd.read_csv(output_csv1)

Unnamed: 0,date,merged_headlines
0,2020-01-01,Top News Protesters Attack U.S. Embassy in Ira...
1,2020-01-02,Top News Pro-Iranian Protesters End Siege of U...
2,2020-01-03,Top News U.S. Strike in Iraq Kills Qassim Sule...
3,2020-01-06,Top News Iran Ends Nuclear Limits as Killing o...
4,2020-01-07,Top News Khamenei Wants to Put Iran’s Stamp on...
...,...,...
1038,2023-12-26,Top News Netanyahu Visits Gaza as Palestinians...
1039,2023-12-27,Top News The Covenant Parents Aren’t Going to ...
1040,2023-12-28,Top News Skepticism Grows Over Israel’s Abilit...
1041,2023-12-29,Top News Maine Joins Colorado in Finding Trump...
