IMPORT SECTION

In [None]:
import re
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import csv
import json
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
import nltk

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('cmudict')
# nltk.download('vader_lexicon')

from nltk.corpus import cmudict
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Spotify API credentials. The SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET
# environment variables take precedence; the literals are only a fallback so
# the notebook keeps working where the variables are not set.
# NOTE(review): secrets committed to source should be rotated and the literal
# fallbacks removed once every environment provides the variables.
SPOTIPY_CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID") or "e4d756536f474fdf8fdee07780ea76d1"
SPOTIPY_CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET") or "b75eb5a3ebb14541b61e77626285cfd8"

# Shared Spotify client used by the Spotify helper functions in this file.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_CLIENT_SECRET
    )
)

# General genres: used to filter genre words and as the search terms of the
# fetch loop. Order matters because the fetch loop iterates it as written.
GENRE_LIST = [
    'pop', 'rock', 'hip-hop', 'rap',
    'r&b', 'soul', 'electronic', 'dance',
    'country', 'jazz', 'classical', 'reggae',
    'alternative', 'indie', 'folk', 'metal',
    'punk', 'blues', 'world', 'funk',
    'disco', 'gospel',
]

GENERAL UTILS

In [None]:
def fix_genre(genre_string):
    """Translate a known genre spelling or sub-genre to its canonical name.

    The lookup is case-insensitive. Strings that have no mapping are returned
    unchanged (original casing preserved).
    """
    mapping = {'hip hop': 'hip-hop', 'hip pop': 'hip-hop', 'metalcore': 'metal'}
    # Look up with the same lower-cased key used for matching; looking up the
    # raw string returned None for inputs such as "Hip Hop".
    return mapping.get(genre_string.lower(), genre_string)


def split_list_items(list_items):
    """Split every string on whitespace and return all words in one flat list.

    Each word is passed through ``fix_genre`` before being collected.
    """
    return [fix_genre(token) for entry in list_items for token in entry.split()]


def remove_non_genres(word_list):
    """Return the words of ``word_list`` that appear in GENRE_LIST, in order."""
    kept = []
    for word in word_list:
        if word in GENRE_LIST:
            kept.append(word)
    return kept


def get_general_genre(song_genres):
    """Return the most frequent general genre found in ``song_genres``.

    Every entry is normalised with ``fix_genre``, split into words, and only
    words present in GENRE_LIST are counted.

    Returns:
        The most common matching genre, or None when ``song_genres`` is empty
        or none of its words is a known genre (previously an IndexError).
    """
    translated = [fix_genre(item) for item in song_genres]
    clean_list = remove_non_genres(split_list_items(translated))
    if not clean_list:
        # most_common(1) would be an empty list here, so there is no winner.
        return None

    return Counter(clean_list).most_common(1)[0][0]

def purify_text(text):
    """Turn free text into a lower-case, dash-separated slug.

    '&' becomes 'and' and 'é' becomes 'e'; parenthesised parts and every
    character that is not a letter, digit or whitespace are dropped;
    whitespace runs collapse into a single dash.
    """
    replacements = str.maketrans({'&': 'and', 'é': 'e'})
    cleaned = text.translate(replacements)

    # First strip "(...)" groups, then any remaining special characters.
    for pattern in (r'\([^)]*\)', r'[^a-zA-Z0-9\s]'):
        cleaned = re.sub(pattern, '', cleaned)

    # Collapse whitespace runs (and trim the ends) before inserting dashes.
    collapsed = ' '.join(cleaned.split())
    slug = re.sub(r'\s', '-', collapsed)
    return slug.lower()


def save_lyrics(txt, file_path):
    """Write ``txt`` to ``file_path``, replacing any existing file.

    Missing parent directories are created first, so saving into a folder
    that does not exist yet no longer fails.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w') as file:
        file.write(txt)


def remove_file(file_path):
    """Delete ``file_path`` on a best-effort basis; never raises OSError.

    A file that does not exist is ignored silently. Any other failure
    (e.g. permissions) is reported on stdout instead of being swallowed.
    """
    try:
        os.remove(file_path)
    except FileNotFoundError:
        # Nothing to delete.
        return
    except OSError as e:
        print(f"Could not remove '{file_path}': {e}")
        return

    print("File removed successfully")


def get_file_extension(filepath):
    """Return the lower-cased extension of ``filepath`` ('.json' or '.csv').

    The check is case-insensitive, so 'data.CSV' is accepted and reported as
    '.csv'.

    Raises:
        ValueError: if the extension is neither '.json' nor '.csv'.
    """
    _, file_extension = os.path.splitext(filepath)
    # Normalise the case so callers can keep comparing against '.json'/'.csv'.
    file_extension = file_extension.lower()

    if file_extension not in ('.json', '.csv'):
        raise ValueError("Unsupported file format. Only 'json' and 'csv' formats are supported.")

    return file_extension


def save_dataset(data, filepath, overwrite=False):
    """Save ``data`` to ``filepath`` as JSON or CSV, chosen by the extension.

    Args:
        data: a pandas DataFrame, or (for CSV) a list of dict records; for
            JSON any object ``json.dump`` accepts.
        filepath: destination path ending in '.json' or '.csv'.
        overwrite: allow replacing an existing file.

    Raises:
        ValueError: unsupported extension (from ``get_file_extension``).
        FileExistsError: the file exists and ``overwrite`` is false.
    """
    file_extension = get_file_extension(filepath)

    # Check if the file already exists and overwrite is not enabled
    if os.path.exists(filepath) and not overwrite:
        raise FileExistsError("File already exists. Set 'overwrite' to True to overwrite the file.")

    # Save data as JSON
    if file_extension == '.json':
        if isinstance(data, pd.DataFrame):
            data.to_json(filepath, orient='records')
        else:
            with open(filepath, 'w') as file:
                json.dump(data, file)

    # Save data as CSV
    elif file_extension == '.csv':
        if isinstance(data, pd.DataFrame):
            data.to_csv(filepath, index=False)
        else:
            # Records may carry different keys (e.g. tracks without lyrics
            # features), so the header is the union of all keys in
            # first-seen order and missing values are written as ''.
            fieldnames = list(dict.fromkeys(key for entry in data for key in entry))
            with open(filepath, 'w', newline='') as file:
                writer = csv.DictWriter(file, fieldnames=fieldnames, restval='')
                # An empty dataset produces an empty file instead of crashing
                # on data[0].
                if fieldnames:
                    writer.writeheader()
                writer.writerows(data)

    print(f"Data successfully saved as '{filepath}'.")


def load_dataset(filepath):
    """Read a '.json' or '.csv' file and return its content as a DataFrame.

    CSV rows are read with ``csv.DictReader``, so every CSV value arrives as
    a string. Returns None for any other extension reported by
    ``get_file_extension``.
    """
    extension = get_file_extension(filepath)

    if extension == '.json':
        with open(filepath, 'r') as handle:
            records = json.load(handle)
        return pd.DataFrame(records)

    if extension == '.csv':
        with open(filepath, 'r', newline='') as handle:
            records = list(map(dict, csv.DictReader(handle)))
        return pd.DataFrame(records)

    return None


SPOTIFY UTILS

In [None]:
def fetch_tracks(search_query=None, limit=10, offset=0):
    """Run a Spotify track search and return the list of track items.

    A ``None`` query is sent as an empty string. Returns an empty list when
    the response has no "tracks"/"items" entries.
    """
    query_text = "" if search_query is None else search_query
    print(f"SEARCH QUERY: {query_text}")

    result = spotify.search(query_text, limit=limit, offset=offset, type="track")
    tracks_section = result.get("tracks", {})
    return tracks_section.get("items", [])


def extract_tracks_data(track, audio_features=False):
    """Build a flat dict describing ``track`` (a Spotify track object).

    The first listed artist is looked up through the Spotify client to get
    the artist name and genres. When ``audio_features`` is true, the audio
    feature values are added as floats; a feature that is unavailable is
    stored as None instead of raising.
    """
    print(f"EXTRACTING TRACK FEATURES, AUDIO FEATURES = {audio_features}")
    artists = track.get("artists")[0]
    artist_id = artists.get("id")
    artist_info = spotify.artist(artist_id)
    artist_genres = artist_info.get("genres", [])

    track_dict = {
        "name": str(track.get("name")),
        "artists": str(artist_info.get("name")),
        "release_date": track.get("album", {}).get("release_date"),
        "genres": list(artist_genres),
        "common_genre": get_general_genre(artist_genres),
        "duration": int(track.get("duration_ms")),
        "popularity": int(track.get("popularity"))
    }

    if audio_features:
        feature_keys = ["danceability", "energy", "key", "loudness", "speechiness", "acousticness",
                        "instrumentalness", "liveness", "valence", "tempo"]
        features_list = spotify.audio_features(track.get("uri"))
        # NOTE(review): guards against an empty result or a None entry for a
        # track without audio analysis - confirm against the Spotipy docs.
        track_audio_features = (features_list[0] if features_list else None) or {}
        for key in feature_keys:
            value = track_audio_features.get(key)
            track_dict[key] = float(value) if value is not None else None

    return track_dict


GENIUS CRAWLING UTILS

In [None]:
def fetch_lyrics(artist, song_title, timeout=10):
    """Download the lyrics of a song from its Genius page.

    Args:
        artist: artist name; slugified with ``purify_text`` for the URL.
        song_title: song title; slugified the same way.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        A list with one text chunk per lyrics container found on the page
        (possibly empty), or None when the page could not be fetched.
    """
    # Format the artist and song title for the URL
    print(f"\nARTIST: {artist}")
    print(f"SONG TITLE: {song_title}")
    artist = purify_text(artist)
    song_title = purify_text(song_title)
    url = f"https://genius.com/{artist}-{song_title}-lyrics"

    print(f"Fetching: {url}")
    try:
        # A timeout keeps one unresponsive request from stalling the crawl.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Network problems are treated like a missing page: report and
        # return None so the caller can carry on with the next song.
        print(f"Request failed: {e}")
        print("Lyrics data not found")
        return None

    # Check if the request was successful
    if response.status_code == 200:
        print("I got something")
        # Use BeautifulSoup to parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the lyrics section
        lyrics_divs = soup.find_all('div', attrs={"data-lyrics-container": "true"})

        return [d.get_text('\n') for d in lyrics_divs]

    print("Lyrics data not found")
    # Return None if lyrics couldn't be fetched
    return None

LYRICS FEATURES UTILS

In [None]:
def vers_chorus_count(song_lines):
    """Count section headers in the lyrics lines.

    A line counts as a verse when, after stripping whitespace, it starts
    with "[Verse", and as a chorus when it starts with "[Chorus".

    Returns:
        A ``(verse_count, chorus_count)`` tuple.
    """
    stripped_lines = [raw.strip() for raw in song_lines]
    choruses = sum(1 for text in stripped_lines if text.startswith("[Chorus"))
    verses = sum(1 for text in stripped_lines if text.startswith("[Verse"))
    return verses, choruses

def count_stop_words(lyrics):
    """Count the tokens of ``lyrics`` that are English stop words.

    Tokens come from ``word_tokenize`` and are compared case-insensitively
    against NLTK's English stop-word list.
    """
    english_stopwords = set(stopwords.words('english'))
    hits = 0
    for token in word_tokenize(lyrics):
        if token.lower() in english_stopwords:
            hits += 1
    return hits


def load_song_lyrics(text_file):
    """Return the full text content of the file at ``text_file``."""
    with open(text_file, 'r') as handle:
        return handle.read()


def count_words(lyrics, unique=False):
    """Count the whitespace-separated words in ``lyrics``.

    Args:
        lyrics: text to analyse.
        unique: when true, count distinct words instead of all words.

    Splitting on any whitespace run means repeated spaces and empty text do
    not produce empty "words": empty text counts 0.
    """
    words = lyrics.split()
    return len(set(words)) if unique else len(words)


def average_word_length(lyrics):
    """Return the mean length of the whitespace-separated words in ``lyrics``.

    Returns 0 when the text contains no words. Splitting on any whitespace
    run keeps repeated spaces from adding zero-length words that would drag
    the average down.
    """
    words = lyrics.split()
    if not words:
        return 0

    return sum(len(word) for word in words) / len(words)


def sentiment_analysis(lyrics):
    """Return the ``polarity_scores`` of a new SentimentIntensityAnalyzer for ``lyrics``."""
    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(lyrics)


def find_rhymes_count(lyrics):
    """Count dictionary words that rhyme with the unique words of ``lyrics``.

    Two words are treated as rhyming when the last two phonemes of their
    first CMUdict pronunciation are equal. For every unique lyrics word
    found in the dictionary, the number of *other* dictionary words sharing
    that ending is added to the total. Words missing from the dictionary
    are skipped.
    """
    # Per the NLTK docs, cmudict.dict() maps each word to a LIST of
    # pronunciations, each a list of phonemes. The phonemes must therefore be
    # sliced inside a pronunciation; slicing the outer list compared whole
    # pronunciation variants instead.
    pronunciations = cmudict.dict()

    # Index the dictionary once by ending so each lyrics word is a single
    # lookup instead of a scan over the whole dictionary.
    ending_counts = Counter(
        tuple(variants[0][-2:]) for variants in pronunciations.values() if variants
    )

    unique_words = set(word_tokenize(lyrics.lower()))

    rhymes_count = 0
    for word in unique_words:
        variants = pronunciations.get(word)
        if not variants:
            continue
        # Subtract one so the word does not count as rhyming with itself.
        rhymes_count += ending_counts[tuple(variants[0][-2:])] - 1

    return rhymes_count


def extract_lyrics_features(song_lyrics):
    """Compute text statistics for the lyrics of one song.

    Returns:
        A dict with ``lines_count``, ``word_count``, ``unique_words``,
        ``stopwords_count``, ``avg_word_length``, ``chorus_count`` and
        ``verse_count``, plus every entry returned by ``sentiment_analysis``.
    """
    lines = song_lyrics.split('\n')
    lines_count = len(lines)
    lyrics_no_new_lines = ' '.join(lines)
    word_count = count_words(lyrics_no_new_lines)
    unique_words = count_words(lyrics_no_new_lines, unique=True)
    avg_word_length = average_word_length(lyrics_no_new_lines)
    stopwords_count = count_stop_words(lyrics_no_new_lines)
    # vers_chorus_count returns (verses, choruses), in that order.
    vers_cnt, chorus_cnt = vers_chorus_count(lines)
    sentiment = sentiment_analysis(lyrics_no_new_lines)
    # rhymes = find_rhymes_count(song_lyrics)
    # rhymes_perc = round(rhymes / word_count, 2)

    features = {
        "lines_count": lines_count,
        "word_count": word_count,
        "unique_words": unique_words,
        "stopwords_count": stopwords_count,
        "avg_word_length": avg_word_length,
        # The two counts were previously stored under each other's key.
        "chorus_count": chorus_cnt,
        "verse_count": vers_cnt,
        **sentiment
    }
    return features


def is_file_valid(file_path):
    """Return True when ``file_path`` exists and its size is greater than zero."""
    if not os.path.exists(file_path):
        return False

    return os.stat(file_path).st_size > 0

DATA CURATION UTILS

In [None]:
def clean_dataset(df: pd.DataFrame):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Rows containing any missing value are removed, then only the first row
    of each ("name", "artists") pair is kept.
    """
    without_missing = df.dropna()
    return without_missing.drop_duplicates(subset=["name", "artists"])

def repair_numeric_missing_vals(df, numeric_cols):
    """Return a copy of ``df`` with missing values in ``numeric_cols`` filled by each column's median.

    Columns not listed in ``numeric_cols`` are not filled.
    """
    medians = {}
    for column in numeric_cols:
        medians[column] = df[column].median()

    return df.fillna(value=medians)

PLOT UTILS

In [None]:
def one_dim_plot(sr, plot_type, axis):
    """Draw ``sr`` on ``axis`` as a 'bar', 'pie' or 'line' plot.

    ``sr.index`` supplies the labels / x values and ``sr.values`` the
    heights. Any other ``plot_type`` only prints a message.
    """
    if plot_type == 'bar':
        axis.bar(sr.index, sr.values)
        return

    if plot_type == 'pie':
        axis.pie(sr.values, labels=sr.index, autopct='%1.1f%%')
        return

    if plot_type == 'line':
        axis.plot(sr.index, sr.values, marker='o')
        return

    print("Invalid plot type. Please choose 'bar', 'pie', or 'line'.")


def get_frequent_elements(df, col_name, num_top_elements):
    """Return the counts of the ``num_top_elements`` most frequent values of ``df[col_name]``.

    The resulting Series is sorted by its index (the values), not by count.
    """
    frequencies = df[col_name].value_counts()
    top_values = frequencies.nlargest(num_top_elements)
    return top_values.sort_index()


def plot_frequent_elements(df, df_params):
    """Plot the most frequent values of several columns side by side.

    Args:
        df: data to analyse.
        df_params: DataFrame with one row per subplot and the columns
            'col_name', 'plot_type' and 'num_top_elements'.
    """
    # squeeze=False always yields a 2-D array of axes, so a single-row
    # ``df_params`` no longer fails on indexing a bare Axes object.
    _, axs = plt.subplots(1, len(df_params), figsize=(20, 5), squeeze=False)

    # Pair axes with rows by position: the index labels of ``df_params`` are
    # not guaranteed to be 0..n-1.
    for axis, (_, row) in zip(axs[0], df_params.iterrows()):
        col_name = row['col_name']
        sr = get_frequent_elements(df, col_name, row['num_top_elements'])
        one_dim_plot(sr, row['plot_type'], axis)
        axis.set_xlabel(col_name)
        axis.set_ylabel('Frequency')

FETCH SONGS DATA USING SPOTIFY API

In [None]:
def fetch_track_data(track_obj, source_genre=None):
    """Collect Spotify metadata and lyrics features for one track.

    The lyrics are cached as a text file under "song_lyrics"; when no valid
    cached file exists they are crawled from Genius and saved first.

    Args:
        track_obj: Spotify track object.
        source_genre: genre the track was searched under. When omitted, the
            module-level variable ``genre`` is used.

    Returns:
        The track dict, extended with the lyrics features when lyrics text
        is available.
    """
    track_data = extract_tracks_data(track_obj, audio_features=False)
    if source_genre is None:
        # Backward compatibility: existing callers pass only the track and
        # rely on the module-level loop variable ``genre``.
        source_genre = genre
    track_data["source_genre"] = source_genre
    lyrics_file = os.path.join("song_lyrics", f"{purify_text(track_data.get('artists'))}-{purify_text(track_data.get('name'))}.txt")

    # RENAME FILES FOR CONSISTENCY
    old_file_path = os.path.join("song_lyrics", f"{track_data.get('artists')}-{track_data.get('name')}.txt")
    if is_file_valid(old_file_path):
        os.rename(old_file_path, lyrics_file)

    if not is_file_valid(lyrics_file):
        print("LYRICS FILE IS NOT FOUND OR NOT VALID")
        # CRAWL GENIUS TO EXTRACT LYRICS TEXT FILE
        lyrics = fetch_lyrics(track_data.get('artists'), track_data.get('name'))
        # fetch_lyrics returns one text chunk per lyrics container; join them
        # with a newline so the last line of one chunk is not glued to the
        # first line (often a "[Chorus]"/"[Verse]" header) of the next.
        lyrics_to_save = '\n'.join(lyrics) if lyrics else ""
        save_lyrics(lyrics_to_save, lyrics_file)

    lyrics_text = load_song_lyrics(lyrics_file)
    if lyrics_text:
        # EXTRACT LYRICS FEATURES
        track_data.update(extract_lyrics_features(lyrics_text))
    # TODO: dont save (continue) if lyrics not found

    return track_data


# Fetch `song_per_genre` tracks for every genre in GENRE_LIST, build one
# record per track (metadata + lyrics features) and save everything as CSV.
query = "year:2000-2023"
dataset = []
song_per_genre = 10
# Requests are paged in chunks of at most 50 tracks. Ceil division is used so
# a count that is not a multiple of 50 (e.g. 75) still gets its last, partial
# page; truncating division dropped it.
offset_range = -(-song_per_genre // 50) or 1
print(f"GOING TO FETCH {song_per_genre*len(GENRE_LIST)}")
for genre in GENRE_LIST:
    for offset in range(offset_range):
        limit = min(50, song_per_genre-50*offset)
        print(f"\nFETCHING {song_per_genre} {genre.upper()} SONGS, LIMIT: {limit}")
        try:
            res = fetch_tracks(query + f" genre:{genre}", limit=limit, offset=offset*50)
        except Exception as e:
            # Top-level boundary of the crawl: report and move on.
            print(f"FAILED FETCHING TRACKS: {e}")
            continue

        if not res:
            print(f"FAILED TO FETCH {genre.upper()} SONGS")
            continue

        for index, track in enumerate(res):
            print(f"\nPARSING SONG DATA ({index+1})")
            try:
                data = fetch_track_data(track)
            except Exception as e:
                # Handle failures per track so one bad song does not discard
                # the remaining tracks of the batch.
                print(f"FAILED FETCHING TRACKS: {e}")
                continue

            dataset.append(data)
            print("SONG DATA SAVED")

save_dataset(dataset, "dataset_2.csv", True)

In [None]:
# Load the saved dataset into a DataFrame (rebinding `dataset`) and display
# its shape.
# NOTE(review): the fetch cell saves to "dataset_2.csv" while this cell reads
# "dataset.csv" - confirm the file names are intentionally different.
dataset = load_dataset("dataset.csv")
dataset.shape

CLEAN DATASET

In [None]:
# Run clean_dataset on the loaded data and display the shape of the result.
clean = clean_dataset(dataset)
clean.shape

In [None]:
# Persist the cleaned frame, overwriting "dataset.csv", then display its shape.
save_dataset(clean, "dataset.csv", overwrite=True)
clean.shape

EDA SECTION