In [None]:
import pandas as pd
import spacy
from spacy.cli import download

In [None]:
# Optional (Colab only): mount Google Drive at /content/drive so the cells
# below can read and write files there. Skip this cell when reading files locally.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Either read files locally or build your own directory on Google Drive.
news_path = "/content/drive/MyDrive/<Your Directory!>/news data"
file_name = "/rollingstone_music_news"

# The data is split into ten CSV chunks named <file_name><start>_<start+100>.csv
# (0_100, 100_200, ..., 900_1000). Read every chunk first and concatenate once:
# calling pd.concat inside the loop re-copies all accumulated rows on each pass.
chunk_frames = [
    pd.read_csv(f"{news_path}{file_name}{start}_{start + 100}.csv")
    for start in range(0, 1000, 100)
]
news = pd.concat(chunk_frames)

In [None]:
# Load the small English spaCy pipeline. Only download it when it is not
# installed yet, so re-running the notebook does not hit the network (and does
# not trigger the "restart runtime" warning) every time.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:  # spaCy raises OSError when the model package cannot be found
    download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

# Helper that extracts person names from a single text field (title and summary are processed separately below)
def extract_person_names(text):
    """
    Extract PERSON entities from the text using spaCy's NER.

    Parameters
    ----------
    text : str or missing value
        Text to analyse. Missing values (None, NaN, ...) are accepted.

    Returns
    -------
    list of str
        The text of every entity labelled "PERSON", in document order.
        Duplicates are kept. An empty list is returned for missing or
        blank input.
    """
    if pd.isnull(text):  # Handle missing values
        return []

    # Blank strings cannot contain entities; skip the NER pipeline entirely.
    if isinstance(text, str) and not text.strip():
        return []

    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

# Apply NER on the title and summary fields
news['extracted_names_title'] = news['title'].apply(extract_person_names)
news['extracted_names_summary'] = news['summary'].apply(extract_person_names)

# Combine and deduplicate extracted names for each row.
# sorted(set(...)) instead of list(set(...)): set iteration order for strings
# can change between interpreter runs, which would make the saved CSV differ
# from run to run. Iterating over the two columns directly also avoids the
# slow row-wise DataFrame.apply(axis=1).
news['extracted_names'] = [
    sorted(set(title_names + summary_names))
    for title_names, summary_names in zip(
        news['extracted_names_title'], news['extracted_names_summary']
    )
]

# Create a set of all unique names across the dataset
all_unique_names = {
    name for names_list in news['extracted_names'] for name in names_list
}

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Adjust your path.
# Persist the articles together with the extracted-name columns; the
# DataFrame index is not written to the file.
news.to_csv(
    path_or_buf="/content/drive/My Drive/<Your Directory!>/rollingstone_news.csv",
    index=False,
)

In [None]:
# Adjust your path
drive_path = '/content/drive/My Drive/<Your Directory!>/extracted_names.txt'
# Save the file to Google Drive, one name per line.
# - encoding="utf-8": the names contain non-ASCII characters (e.g. curly
#   quotes), so do not rely on the platform's default encoding.
# - sorted(): a set has no stable order; sorting makes the file reproducible.
with open(drive_path, "w", encoding="utf-8") as f:
    f.writelines(f"{name}\n" for name in sorted(all_unique_names))

print(f"File saved to {drive_path}")

In [None]:
# Preview a bounded, alphabetically sorted sample instead of dumping the whole
# set (whose display order is arbitrary). Widen the slice to inspect more names.
sorted(all_unique_names)[:50]

{'BBC Concert',
 'Eva Apio',
 'Daughter Excluded From',
 'Kelis',
 'Kid Cudi',
 "Sheryl Crow's",
 'Jagari',
 'Fetty Wap Pleads Guilty',
 'Becky G Cries',
 'Arena Tour',
 'Ian Bairnson',
 'Betty',
 'Rapper Said',
 'Ally Brooke',
 "Bonnie Raitt Showcase New LP '",
 'Jamie xx',
 'Vevo',
 'Found Dead',
 'Taemin',
 'Molly Shannon',
 'Tom Morello',
 'Kurtis Blow',
 'Depeche Mode Showcase',
 'Crow',
 'Zach Bryan Returns',
 'Cissy Houston',
 'Jimmy Iovine Accuser Drops Sexual Abuse Lawsuit',
 'Jarre',
 'Chester Bennington’s',
 'Recover R. Kelly',
 'Taylor Swift Superfan',
 'Olivia Rodrigo',
 'Bruce Springsteen Joins',
 "Sweet Ballad '",
 'Rap Lyrics',
 'Doc McGhee',
 'Lorde',
 'Jeff Landry',
 'Rihanna Soundtracks ‘',
 "Deryck Whibley's",
 'Joan Jett',
 'Spencer Sutherland',
 'Cardi B Appears',
 'P. Diddy',
 'Tierra Whack',
 'Don Henley’s',
 'Foster Hudson',
 'Robert Kelly',
 'AP',
 'Drew Dixon',
 "D'Angelo Join Audible's",
 'Frontman James Hetfield',
 'Lizzo Is Always ‘Chasing',
 'Moon',
 'Sey

In [None]:
# Display the DataFrame, including the extracted-name columns, as the cell output.
news

Unnamed: 0,title,link,summary,category,extracted_names_title,extracted_names_summary,extracted_names
0,Taylor Swift Awards Eras Tour Crew With $197 M...,https://www.rollingstone.com/music/music-news/...,The Eras Tour also earned more than $2 billion...,,"[Taylor Swift Awards Eras, Bonuses]",[],"[Bonuses, Taylor Swift Awards Eras]"
1,Miley Cyrus Celebrates Golden Globe Nomination...,https://www.rollingstone.com/music/music-news/...,"The record, which appears in the Pamela Anders...",,[Miley],[Pamela Anderson-led],"[Pamela Anderson-led, Miley]"
2,Gracie Abrams Announces U.S. ‘Secret of Us Del...,https://www.rollingstone.com/music/music-news/...,"Following the singer's Eras Tour opening run, ...",,[Gracie Abrams],[],[Gracie Abrams]
3,Azealia Banks Demands Apology and $1 Million F...,https://www.rollingstone.com/music/music-news/...,"Last week, Banks said she would be taking lega...",,[Matty Healy],[Banks],"[Matty Healy, Banks]"
4,Liam Payne’s Girlfriend Kate Cassidy Shares Jo...,https://www.rollingstone.com/music/music-news/...,Cassidy previously paid homage to Payne in an ...,,[Liam Payne’s],[],[Liam Payne’s]
...,...,...,...,...,...,...,...
1005,Sylvan Esso Straddle Line Between Pleasure and...,https://www.rollingstone.com/music/music-news/...,Electronic pop duo drop first new music since ...,Music,[Sylvan Esso Straddle Line],[],[Sylvan Esso Straddle Line]
1006,Seventeen Plot North American ‘[Be the Sun]’ Tour,https://www.rollingstone.com/music/music-news/...,K-pop group’s 12-date trek launches in support...,Music,[],[],[]
1007,Pearl Jam Cancel Final U.S. Tour Dates After J...,https://www.rollingstone.com/music/music-news/...,"""Our attention to staying inside the bubble ha...",Music,"[Pearl Jam Cancel Final, Jeff Ament]",[],"[Jeff Ament, Pearl Jam Cancel Final]"
1008,Elton John to Get the Official Documentary Tre...,https://www.rollingstone.com/music/music-news/...,Film will be centered around the musician's on...,Music,[Elton John],[],[Elton John]
