In [1]:
from datasets import load_dataset

# Fetch the LA Times linked-headlines dataset from the Hugging Face Hub.
ds = load_dataset(path="Astris/LA-Times-Linked-Headlines")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 3721395/3721395 [00:00<00:00, 5009105.07 examples/s]


In [3]:
from datasets import load_dataset
import pandas as pd
import re

# Show the dataset summary (splits, features, row counts).
print(ds)

# Work with whichever split is listed first, converted to a pandas DataFrame.
split = list(ds)[0]
df = ds[split].to_pandas()

# Quick look at the schema and the first few rows.
print(f"Columns: {df.columns.tolist()}")
print(df.head(3))

# Regex for dates written as YYYY-MM-DD or YYYY/MM/DD (e.g. embedded in a URL).
# The three groups capture year, month and day, in that order.
date_pattern = re.compile(r"(\d{4})[/-](\d{2})[/-](\d{2})")

# Scan the text columns in order and keep the first one whose sample
# (up to 2000 non-null values) contains at least one date-like substring.
date_col = None
for col in df.columns:
    # Accept classic object columns as well as dedicated pandas string dtypes.
    is_text = pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
    if not is_text:
        continue
    sample = df[col].dropna().astype(str).head(2000)
    # Call re.search directly instead of Series.str.contains: str.contains
    # emits a UserWarning for patterns that contain capture groups, and the
    # groups are still needed later to pull out year/month/day.
    if sample.map(lambda text: date_pattern.search(text) is not None).any():
        date_col = col
        break

if date_col is None:
    raise ValueError("No date-like pattern found in any string column. Pick a column manually.")

print("Using date from column:", date_col)

# Extract date from that column
def extract_date(s, pattern=None):
    """Return the first date embedded in ``s`` as a pandas Timestamp.

    Parameters
    ----------
    s : object
        Value to search; it is converted with ``str()`` first, so non-string
        values (including ``None``) are accepted.
    pattern : re.Pattern, optional
        Compiled regex whose three groups are year, month and day, in that
        order. Defaults to the notebook-level ``date_pattern``.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed date, or ``NaT`` when nothing matches or the matched
        digits do not form a valid calendar date.
    """
    # Resolve the default at call time so the notebook-level regex is used
    # unless the caller supplies its own.
    regex = date_pattern if pattern is None else pattern
    m = regex.search(str(s))
    if m is None:
        return pd.NaT
    # An explicit format pins the interpretation to year-month-day, so the
    # string is never subject to format inference; impossible dates such as
    # month 13 are coerced to NaT.
    return pd.to_datetime("-".join(m.groups()), format="%Y-%m-%d", errors="coerce")

# Pull the (year, month, day) groups out of every value in one vectorized
# pass; a row-by-row .apply with a scalar pd.to_datetime call is very slow on
# millions of rows. Rows with no match yield NaN and become NaT below.
date_parts = df[date_col].astype(str).str.extract(date_pattern.pattern)
df["parsed_date"] = pd.to_datetime(
    date_parts[0] + "-" + date_parts[1] + "-" + date_parts[2],
    format="%Y-%m-%d",  # explicit format: no inference, invalid dates -> NaT
    errors="coerce",
)
df = df.dropna(subset=["parsed_date"])

# The counts below are grouped on keys derived from `parsed_date` rather than
# on new `year` / `month` columns, so any `year` / `month` columns already
# present in the frame are not overwritten with link-derived values.
parsed_dates = df["parsed_date"]

# Counts per year
year_key = parsed_dates.dt.year.rename("year")
year_counts = parsed_dates.groupby(year_key).size().reset_index(name="count")

# Counts per month (YYYY-MM)
month_key = parsed_dates.dt.to_period("M").rename("month")
month_counts = parsed_dates.groupby(month_key).size().reset_index(name="count")
month_counts["month"] = month_counts["month"].astype(str)

print("\nCounts per year:")
print(year_counts)

print("\nCounts per month:")
print(month_counts)


DatasetDict({
    train: Dataset({
        features: ['title', 'link', 'month', 'year'],
        num_rows: 3721395
    })
})
Columns: ['title', 'link', 'month', 'year']
                                               title  \
0  Anthony Davis scores 41 points as Lakers hold ...   
1  Prep basketball roundup: Carter Bryant dominat...   
2  Trevor Zegras is injured as Ducks hold off Pre...   

                                                link  month  year  
0  https://www.latimes.com/sports/lakers/story/20...      1  2024  
1  https://www.latimes.com/sports/highschool/stor...      1  2024  
2  https://www.latimes.com/sports/hockey/ducks/st...      1  2024  


  if sample.str.contains(date_pattern).any():
  if sample.str.contains(date_pattern).any():


Using date from column: link

Counts per year:
    year   count
0   1962       1
1   1970      30
2   1973       1
3   1977       1
4   1984       1
5   1985  109007
6   1986  125008
7   1987  131646
8   1988  139709
9   1989  146757
10  1990  163095
11  1991  144411
12  1992  139827
13  1993  124413
14  1994  119479
15  1995  111158
16  1996  107707
17  1997   54851
18  1998       3
19  1999       3
20  2000       1
21  2002       6
22  2003       5
23  2004       2
24  2005       5
25  2006     466
26  2007    9169
27  2008   26167
28  2009   33140
29  2010   40366
30  2011   40108
31  2012   18678
32  2013    4911
33  2014       1
34  2015       2
35  2017       7
36  2018       2
37  2019   19183
38  2020   40349
39  2021   35118
40  2022   33013
41  2023   30468
42  2024     622
43  2050       1

Counts per month:
       month  count
0    1962-10      1
1    1970-02      4
2    1970-03      3
3    1970-04      4
4    1970-05      6
..       ...    ...
329  2023-10   2520
330  2023

In [4]:
from datasets import load_dataset
import pandas as pd

# Reload the dataset and keep only the two calendar columns of the train split.
ds = load_dataset("Astris/LA-Times-Linked-Headlines")
d = ds["train"].select_columns(["year", "month"])

# Convert to pandas and force both columns to numeric; values that cannot be
# parsed as numbers become NaN.
df = d.to_pandas()
for col in ("year", "month"):
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Basic validity filter: keep rows with a numeric year and a month in 1..12.
df = df.dropna(subset=["year", "month"])
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not raise SettingWithCopyWarning when some rows
# were actually filtered out.
df = df[df["month"].between(1, 12)].copy()

# Counts per year
year_counts = df["year"].value_counts().sort_index()
print("Counts per year:")
print(year_counts)

# Counts per month (YYYY-MM)
# Build "YYYY-MM-01" strings from the numeric columns, parse them as dates,
# then bucket the dates into monthly periods and count each bucket.
year_text = df["year"].astype(int).astype(str)
month_text = df["month"].astype(int).astype(str).str.zfill(2)
df["year_month"] = pd.to_datetime(year_text + "-" + month_text + "-01")

month_periods = df["year_month"].dt.to_period("M")
month_counts = month_periods.value_counts().sort_index()
print("\nCounts per month:")
print(month_counts)


Counts per year:
year
1914        1
1915        1
1919        1
1923        1
1926        2
        ...  
2020    45120
2021    38816
2022    35862
2023    32545
2024      662
Name: count, Length: 89, dtype: int64

Counts per month:
year_month
1914-12       1
1915-05       1
1919-10       1
1923-03       1
1926-11       2
           ... 
2023-09    2724
2023-10    2688
2023-11    2504
2023-12    2351
2024-01     662
Freq: M, Name: count, Length: 596, dtype: int64
