In [None]:
import ast

import pandas as pd
from textblob import TextBlob

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used by the later cells
# all live under that mount point. If you are not on Colab, skip this cell and
# point those paths at local files instead.
drive.mount('/content/drive')

In [None]:
# Load the Rolling Stone news CSV into the `news` DataFrame.
# Adjust news_path to wherever the file lives on your Drive (or local disk).
news_path = "/content/drive/MyDrive//news data/rollingstone_news.csv"
news = pd.read_csv(news_path)

In [None]:
# Preview the first five rows of the raw news data
news.head(5)

Unnamed: 0,title,link,summary,category,extracted_names_title,extracted_names_summary,extracted_names
0,Taylor Swift Awards Eras Tour Crew With $197 M...,https://www.rollingstone.com/music/music-news/...,The Eras Tour also earned more than $2 billion...,,"['Taylor Swift Awards Eras', 'Bonuses']",[],"['Bonuses', 'Taylor Swift Awards Eras']"
1,Miley Cyrus Celebrates Golden Globe Nomination...,https://www.rollingstone.com/music/music-news/...,"The record, which appears in the Pamela Anders...",,['Miley'],['Pamela Anderson-led'],"['Pamela Anderson-led', 'Miley']"
2,Gracie Abrams Announces U.S. ‘Secret of Us Del...,https://www.rollingstone.com/music/music-news/...,"Following the singer's Eras Tour opening run, ...",,['Gracie Abrams'],[],['Gracie Abrams']
3,Azealia Banks Demands Apology and $1 Million F...,https://www.rollingstone.com/music/music-news/...,"Last week, Banks said she would be taking lega...",,['Matty Healy'],['Banks'],"['Matty Healy', 'Banks']"
4,Liam Payne’s Girlfriend Kate Cassidy Shares Jo...,https://www.rollingstone.com/music/music-news/...,Cassidy previously paid homage to Payne in an ...,,['Liam Payne’s'],[],['Liam Payne’s']


In [None]:
# Read spotify data
# Adjust your path
spotify_path = "/content/drive/MyDrive//spotify data/"
file_name = "spotify_artists_data_part"

# The artist data is split across three files: <file_name>1.csv .. <file_name>3.csv.
# Read every part first and concatenate once: calling pd.concat inside the loop
# re-copies all previously loaded rows on every iteration.
# The per-file row index is kept (no ignore_index), exactly as before.
artist_parts = [
    pd.read_csv(spotify_path + file_name + str(part_no) + ".csv")
    for part_no in range(1, 4)
]
artists = pd.concat(artist_parts)

In [None]:
# Preview the first five rows of the combined Spotify artist data
artists.head(5)

Unnamed: 0,artist,popularity,followers,genres
0,Taylor Swift,100,127838324,pop
1,The Weeknd,96,95971547,"canadian contemporary r&b, canadian pop, pop"
2,Bad Bunny,96,87698874,"reggaeton, trap latino, urbano latino"
3,Drake,96,94133053,"canadian hip hop, canadian pop, hip hop, pop r..."
4,Ed Sheeran,90,117757199,"pop, singer-songwriter pop, uk pop"


#### Data Preprocessing

News Data Cleaning

In [None]:
def _parse_name_list(raw):
    """Turn one `extracted_names` cell into a de-duplicated list of names.

    Parameters
    ----------
    raw : str, list, tuple, set or missing value
        As read from the CSV the cell is the text of a Python list literal,
        e.g. "['Matty Healy', 'Banks']". An already-parsed collection is also
        accepted so that this cell can be re-run without crashing.

    Returns
    -------
    list
        The names with duplicates removed, in first-seen order. Missing
        cells yield an empty list.
    """
    if isinstance(raw, (list, tuple, set)):
        names = raw
    elif pd.isnull(raw):
        return []
    else:
        # literal_eval only accepts Python literals; eval() would execute
        # whatever code happens to be stored in the CSV cell.
        names = ast.literal_eval(raw)
    # dict.fromkeys de-duplicates while keeping a stable order; list(set(...))
    # produced an order that could change from run to run.
    return list(dict.fromkeys(names))


# 1. Drop rows where both title and summary are missing.
#    Copy so the column assignments below act on an independent frame
#    (avoids pandas' SettingWithCopyWarning).
news = news.dropna(subset=['title', 'summary'], how='all').copy()

# 2. Clean and deduplicate extracted names
news['extracted_names'] = news['extracted_names'].apply(_parse_name_list)

# 3. Normalize text columns (lowercase)
news['title'] = news['title'].str.lower()
news['summary'] = news['summary'].str.lower()

# 4. Filter rows with valid artist mentions
news = news[news['extracted_names'].apply(lambda names: len(names) > 0)]

# 5. Explode news data to create a row for each artist in `extracted_names`.
#    Exploded rows keep the index label of the article they came from.
news_exploded = news.explode('extracted_names')
news_exploded = news_exploded.rename(columns={'extracted_names': 'artist'})

In [None]:
# Normalize and clean the 'artist' column in news data

# Descriptor words that the NER step tends to glue onto artist names
# (e.g. "Taylor Swift Awards Eras"). A frozenset gives O(1) membership tests
# and is built once instead of on every call.
_INVALID_ARTIST_WORDS = frozenset({
    'awards', 'led', 'crew', 'tour', 'concert', 'bonus', 'bonuses',
    'seeks', 'dismissal', 'eras',
})


def clean_artist_names(artist, invalid_words=_INVALID_ARTIST_WORDS, strip_possessive=True):
    """Normalize one NER-extracted name so it can be matched against artist names.

    Parameters
    ----------
    artist : object
        The raw extracted name. Anything that is not a ``str`` is discarded.
    invalid_words : collection of str, optional
        Lower-case whole words to remove from the name. Defaults to the
        descriptor list used by this notebook.
    strip_possessive : bool, optional
        When True (default), remove a trailing possessive "'s" so that
        "Liam Payne’s" becomes "liam payne" rather than "liam payne's".
        Pass False to keep the suffix (e.g. for names that really end in "'s").

    Returns
    -------
    str or None
        The cleaned, lower-cased name, or None when the input is not a string
        or the cleaned result is purely numeric or shorter than two characters.
    """
    if not isinstance(artist, str):
        return None  # Discard non-string entries

    # Lowercase, trim, and map the typographic apostrophe to a plain one
    artist = artist.lower().strip().replace("’", "'")

    # Remove unwanted descriptors; only whole whitespace-separated words match
    artist = ' '.join(word for word in artist.split() if word not in invalid_words)

    # Drop the possessive after descriptor removal, so a name that ends in
    # "'s" only once the descriptors are gone is handled too
    if strip_possessive and artist.endswith("'s"):
        artist = artist[:-2].rstrip()

    # Remove numeric or short entries
    if artist.isnumeric() or len(artist) < 2:
        return None

    return artist

# 6. Run every exploded name through the cleaning function
news_exploded['artist'] = news_exploded['artist'].apply(clean_artist_names)

# 7. Keep one row per (title, artist) combination, then
# 8. discard the rows whose name was rejected by the cleaning step (None)
news_exploded = (
    news_exploded
    .drop_duplicates(subset=['title', 'artist'])
    .dropna(subset=['artist'])
)

In [None]:
# Spot-check the article with index label 0: raw NER output next to the cleaned name
example_row = news_exploded.loc[0]
print('Title:\n', example_row['title'])
print("Extracted Names by NER:\n", example_row['extracted_names_title'].lower())
print("Final artist name after pre-processing:\n", example_row['artist'])

Title:
 taylor swift awards eras tour crew with $197 million in bonuses
Extracted Names by NER:
 ['taylor swift awards eras', 'bonuses']
Final artist name after pre-processing:
 taylor swift


Spotify Data Cleaning

In [None]:
# 1. Drop rows with missing essential fields.
#    Copy so the column assignments below act on an independent frame
#    (avoids pandas' SettingWithCopyWarning).
artists = artists.dropna(subset=['artist', 'popularity', 'followers']).copy()

# 2. Normalize genres column (lowercase and consistent formatting).
#    regex=False makes the ', ' -> ',' substitution an explicit literal
#    replacement; the default for `regex` differs between pandas versions.
artists['genres'] = (
    artists['genres']
    .str.lower()
    .str.replace(', ', ',', regex=False)
    .str.strip()
)

# 3. Normalize artist names (strip whitespace and lowercase)
artists['artist'] = artists['artist'].str.strip().str.lower()

# 4. Keep one row per artist name (the first occurrence wins)
artists = artists.drop_duplicates(subset=['artist'])

In [None]:
# Display the cleaned Spotify table (the rendered output elides the middle rows)
artists

Unnamed: 0,artist,popularity,followers,genres
0,taylor swift,100,127838324,pop
1,the weeknd,96,95971547,"canadian contemporary r&b,canadian pop,pop"
2,bad bunny,96,87698874,"reggaeton,trap latino,urbano latino"
3,drake,96,94133053,"canadian hip hop,canadian pop,hip hop,pop rap,rap"
4,ed sheeran,90,117757199,"pop,singer-songwriter pop,uk pop"
...,...,...,...,...
2630,saturnine,27,729,
2633,bon appétit musique,27,3482,
2634,set da trend,45,104172,bronx drill
2635,grace jones,55,343145,art pop


#### Sentiment Analysis for News Articles

In [None]:
# Sentiment helper
def analyze_sentiment(text):
    """Return the TextBlob polarity of `text`.

    Missing text (anything `pd.isnull` reports as null) is treated as
    neutral and scores 0.
    """
    return 0 if pd.isnull(text) else TextBlob(text).sentiment.polarity

# Score the title and the summary separately; a row's score is their sum
title_polarity = news_exploded['title'].apply(analyze_sentiment)
summary_polarity = news_exploded['summary'].apply(analyze_sentiment)
news_exploded['sentiment_score'] = title_polarity + summary_polarity


def _sentiment_label(score):
    """Map a sentiment score to 'positive', 'negative' or 'neutral'."""
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'


# Bucket each score by its sign
news_exploded['sentiment_category'] = news_exploded['sentiment_score'].apply(_sentiment_label)

In [None]:
# Per-artist summary of the news rows
def _count_positive(categories):
    """Number of rows labelled 'positive' within one artist's group."""
    return (categories == 'positive').sum()


def _count_negative(categories):
    """Number of rows labelled 'negative' within one artist's group."""
    return (categories == 'negative').sum()


rows_by_artist = news_exploded.groupby('artist')
artist_news_stats = rows_by_artist.agg(
    number_of_articles=('sentiment_score', 'count'),
    positive_articles=('sentiment_category', _count_positive),
    negative_articles=('sentiment_category', _count_negative),
    average_sentiment_score=('sentiment_score', 'mean'),
)
# Turn the group key back into a regular 'artist' column
artist_news_stats = artist_news_stats.reset_index()

In [None]:
# Join Spotify artists with their news statistics; the inner join keeps only
# artists that appear in both tables
final_artist_data = artists.merge(artist_news_stats, how='inner', on='artist')

# Write the combined table to Drive
# Adjust your path
output_path = '/content/drive/MyDrive//final_artist_statistics.csv'
final_artist_data.to_csv(output_path, index=False)
print("Final artist statistics saved to 'final_artist_statistics.csv'.")

Final artist statistics saved to 'final_artist_statistics.csv'.


In [None]:
# Display the merged artist table (the rendered output elides the middle rows)
final_artist_data

Unnamed: 0,artist,popularity,followers,genres,number_of_articles,positive_articles,negative_articles,average_sentiment_score
0,taylor swift,100,127838324,pop,122,70,32,0.162915
1,bad bunny,96,87698874,"reggaeton,trap latino,urbano latino",24,2,22,-0.391230
2,drake,96,94133053,"canadian hip hop,canadian pop,hip hop,pop rap,rap",17,7,6,0.040255
3,ed sheeran,90,117757199,"pop,singer-songwriter pop,uk pop",54,31,16,0.097984
4,billie eilish,96,103636544,"art pop,pop",80,42,18,0.110963
...,...,...,...,...,...,...,...,...
2212,louis cato,28,7783,,1,0,1,-0.341270
2213,gillis,1,22,,1,0,1,-0.130556
2214,saturnine,27,729,,1,1,0,0.136364
2215,grace jones,55,343145,art pop,3,1,1,0.112500
