DATA CRAWLING

In [43]:
import re

import matplotlib.pyplot as plt
import seaborn as sns
import csv
import json
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
import nltk

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('cmudict')
# nltk.download('vader_lexicon')

from nltk.corpus import cmudict
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

SPOTIPY_CLIENT_ID = "e4d756536f474fdf8fdee07780ea76d1"
SPOTIPY_CLIENT_SECRET = "b75eb5a3ebb14541b61e77626285cfd8"
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_CLIENT_SECRET
    )
)

GENERAL UTILS

In [52]:
def clean_string(string):
    return re.sub(r'[^a-zA-Z0-9\s-]', '', string)


def save_lyrics(txt, file_path):
    with open(file_path, 'w') as file:
        file.write(txt)


def get_file_extension(filepath):
    # Get the file extension from the file path
    _, file_extension = os.path.splitext(filepath)

    # Check if the file extension is supported
    if file_extension not in ['.json', '.csv']:
        raise ValueError("Unsupported file format. Only 'json' and 'csv' formats are supported.")

    return file_extension


def save_dataset(data, filepath, overwrite=False):
    file_extension = get_file_extension(filepath)

    # Check if the file already exists and overwrite is not enabled
    if os.path.exists(filepath) and not overwrite:
        raise FileExistsError("File already exists. Set 'overwrite' to True to overwrite the file.")

    # Save data as JSON
    if file_extension == '.json':
        if isinstance(data, pd.DataFrame):
            data.to_json(filepath, orient='records')

        else:
            with open(filepath, 'w') as file:
                json.dump(data, file)

    # Save data as CSV
    elif file_extension == '.csv':
        if isinstance(data, pd.DataFrame):
            data.to_csv(filepath, index=False)

        else:
            with open(filepath, 'w', newline='') as file:
                writer = csv.writer(file)
                # Assuming the dictionary has consistent keys across all entries
                writer.writerow(data[0].keys())  # Write header
                writer.writerows([entry.values() for entry in data])

    print(f"Data successfully saved as '{filepath}'.")


def load_dataset(filepath):
    file_extension = get_file_extension(filepath)

    # Load data from JSON
    if file_extension == '.json':
        with open(filepath, 'r') as file:
            data = json.load(file)

    # Load data from CSV
    elif file_extension == '.csv':
        with open(filepath, 'r', newline='') as file:
            reader = csv.DictReader(file)
            data = [dict(row) for row in reader]

    else:
        return

    # Convert data to DataFrame
    df = pd.DataFrame(data)
    return df


SPOTIFY UTILS

In [3]:
def fetch_tracks(search_query=None, limit=10, offset=0):
    if search_query is None:
        search_query = ""
    print(f"search_query: {search_query}")
    response = spotify.search(search_query, limit=limit, offset=offset, type="track")
    return response.get("tracks", {}).get("items", [])


def extract_tracks_data(track):
    track_uri = track.get("uri")
    artists = track.get("artists")[0]
    artist_id = artists.get("id")
    artist_info = spotify.artist(artist_id)
    artist_genres = artist_info.get("genres", [])
    track_audio_features = spotify.audio_features(track_uri)[0]
    track_dict = {
        "artists": artist_info.get("name"),
        "genres": artist_genres,
        "release_date": track.get("album", {}).get("release_date"),
        **{key: track.get(key) for key in
           ["name", "popularity", "uri", "duration_ms"]},
        **{key: track_audio_features.get(key) for key in
           ["danceability", "energy", "key", "loudness", "speechiness", "acousticness", "instrumentalness",
            "liveness", "valence", "tempo"]}
    }
    return track_dict


GENIUS CRAWLING UTILS

In [58]:
def fetch_lyrics(artist, song_title):
    # Format the artist and song title for the URL
    print(f"\nARTIST: {artist}")
    print(f"SONG TITLE: {song_title}")
    artist = clean_string(artist).lower().replace(" ", "-")
    song_title = clean_string(song_title).lower().replace(" ", "-")
    url = f"https://genius.com/{artist}-{song_title}-lyrics"

    print(f"Fetching: {url}")
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        print("I got something")
        # Use BeautifulSoup to parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the lyrics section
        lyrics_divs = soup.find_all('div', attrs={"data-lyrics-container": "true"})

        lyrics_lines = [d.get_text('\n') for d in lyrics_divs]

        return lyrics_lines

    # Return None if lyrics couldn't be fetched
    return None

LYRICS FEATURES UTILS

In [59]:

def load_song_lyrics(text_file):
    with open(text_file, 'r') as tf:
        lyrics = tf.read()

    return lyrics


def count_words(lyrics):
    words = word_tokenize(lyrics)
    return len(words)


def average_word_length(lyrics):
    words = word_tokenize(lyrics)
    total_length = sum(len(word) for word in words)
    return total_length / len(words) if len(words) > 0 else 0


def sentiment_analysis(lyrics):
    sid = SentimentIntensityAnalyzer()
    sentiment = sid.polarity_scores(lyrics)
    return sentiment


def find_rhymes_count(lyrics):
    pronunciations = cmudict.dict()
    words = word_tokenize(lyrics.lower())
    unique_words = set(words)

    rhymes_count = 0
    for word in unique_words:
        try:
            phonemes = pronunciations[word]
        except KeyError:
            continue
        rhyming_words = []
        for rhyme_word, rhyme_phonemes in pronunciations.items():
            if rhyme_word != word and rhyme_phonemes[-2:] == phonemes[-2:]:
                rhyming_words.append(rhyme_word)
        rhymes_count += len(rhyming_words)

    return rhymes_count


def extract_lyrics_info(song_lyrics):
    lyrics_info = []
    verse_num = 0
    chorus_num = 0
    stopwords_list = set(stopwords.words('english'))

    lines = song_lyrics.split('\n')
    lines_count = len(lines)
    lyrics_text = ' '.join(lines)  # Combine all lines into a single string

    word_count = count_words(lyrics_text)
    avg_word_length = average_word_length(lyrics_text)
    # sentiment = sentiment_analysis(lyrics_text)
    rhymes = find_rhymes_count(song_lyrics)
    rhymes_perc = round(rhymes / word_count, 2)
    stopwords_count = sum(1 for word in word_tokenize(lyrics_text) if word.lower() in stopwords_list)

    for line in lines:
        line = line.strip()
        if line.startswith("[Chorus"):
            chorus_num += 1
        elif line.startswith("[Verse"):
            verse_num += 1

    lyrics_info.append(
        [lines_count, word_count, avg_word_length, rhymes, rhymes_perc, chorus_num, verse_num, stopwords_count])
    return lyrics_info

DATA CURATION UTILS

In [60]:
def clean_dataset(df: pd.DataFrame):
    clean_df = df.copy()
    clean_df.dropna(inplace=True)
    clean_df.drop_duplicates(inplace=True)

    return clean_df

def repair_numeric_missing_vals(df, numeric_cols):
    repaired_vals_map = {col: df[col].median() for col in numeric_cols}
    repaired_df = df.fillna(value=repaired_vals_map)

    return repaired_df

PLOT UTILS

In [61]:
def one_dim_plot(sr, plot_type, axis):
    if plot_type == 'bar':
        axis.bar(sr.index, sr.values)
    elif plot_type == 'pie':
        axis.pie(sr.values, labels=sr.index, autopct='%1.1f%%')
    elif plot_type == 'line':
        axis.plot(sr.index, sr.values, marker='o')
    else:
        print("Invalid plot type. Please choose 'bar', 'pie', or 'line'.")


def get_frequent_elements(df, col_name, num_top_elements):
    return df[col_name].value_counts().nlargest(num_top_elements).sort_index()


def plot_frequent_elements(df, df_params):
    fig, axs = plt.subplots(1, len(df_params), figsize=(20, 5))
    for i, row in df_params.iterrows():
        col_name = row['col_name']
        plot_type = row['plot_type']
        num_top_elements = row['num_top_elements']
        sr = get_frequent_elements(df, col_name, num_top_elements)
        one_dim_plot(sr, plot_type, axs[i])
        axs[i].set_xlabel(col_name)
        axs[i].set_ylabel('Frequency')

FETCH SONGS DATA USING SPOTIFY API

In [None]:
query = "year:2000-2023"
genres = [
    "Pop",
    "Rock",
    "Hip-Hop/Rap",
    "R&B/Soul",
    "Electronic/Dance",
    "Country",
    "Jazz",
    "Classical",
    "Reggae",
    "Alternative",
    "Indie",
    "Folk",
    "Metal",
    "Punk",
    "Blues",
    "Latin",
    "World",
    "Funk",
    "Disco",
    "Gospel"
]

dataset = []
for offset in range(10):
    for genre in genres:
        try:
            res = fetch_tracks(query + f" genre:{genre}", 50, offset=offset*50)
            for track in res:
                data = extract_tracks_data(track)
                dataset.append(data)
                print(f"{data}\n")
        except Exception as e:
            print(f"FAILED FETCHING TRACKS: {e}")

save_dataset(dataset, "genRecoBigDataset.csv", False)

search_query: year:2000-2023 genre:Pop
{'artists': 'PinkPantheress', 'genres': ['bedroom pop'], 'release_date': '2023-02-03', 'name': "Boy's a Liar Pt. 2", 'popularity': 95, 'uri': 'spotify:track:6AQbmUe0Qwf5PZnt4HmTXv', 'duration_ms': 131013, 'danceability': 0.696, 'energy': 0.809, 'key': 5, 'loudness': -8.254, 'speechiness': 0.05, 'acousticness': 0.252, 'instrumentalness': 0.000128, 'liveness': 0.248, 'valence': 0.857, 'tempo': 132.962}

{'artists': 'Taylor Swift', 'genres': ['pop'], 'release_date': '2019-08-23', 'name': 'Cruel Summer', 'popularity': 95, 'uri': 'spotify:track:1BxfuPKGuaTgP7aM0Bbdwr', 'duration_ms': 178426, 'danceability': 0.552, 'energy': 0.702, 'key': 9, 'loudness': -5.707, 'speechiness': 0.157, 'acousticness': 0.117, 'instrumentalness': 2.06e-05, 'liveness': 0.105, 'valence': 0.564, 'tempo': 169.994}

{'artists': 'FIFTY FIFTY', 'genres': ['k-pop girl group'], 'release_date': '2023-02-24', 'name': 'Cupid - Twin Ver.', 'popularity': 99, 'uri': 'spotify:track:7FbrGaHY

In [69]:
dataset = load_dataset("genRecoBigDataset.csv")
dataset.shape

(6732, 17)

CLEAN DATASET

In [70]:
clean = clean_dataset(dataset)
clean.shape

(5590, 17)

In [71]:
clean.head()

Unnamed: 0,artists,genres,release_date,name,popularity,uri,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,PinkPantheress,['bedroom pop'],2023-02-03,Boy's a Liar Pt. 2,95,spotify:track:6AQbmUe0Qwf5PZnt4HmTXv,131013,0.696,0.809,5,-8.254,0.05,0.252,0.000128,0.248,0.857,132.962
1,Taylor Swift,['pop'],2019-08-23,Cruel Summer,95,spotify:track:1BxfuPKGuaTgP7aM0Bbdwr,178426,0.552,0.702,9,-5.707,0.157,0.117,2.06e-05,0.105,0.564,169.994
2,FIFTY FIFTY,['k-pop girl group'],2023-02-24,Cupid - Twin Ver.,99,spotify:track:7FbrGaHYVDmfr7KoLIZnQ7,174253,0.783,0.592,11,-8.332,0.0331,0.435,4.15e-06,0.347,0.726,120.018
3,Taylor Swift,['pop'],2022-10-21,Anti-Hero,95,spotify:track:0V3wPSX9ygBnCm8psDIegu,200690,0.637,0.643,4,-6.571,0.0519,0.13,1.8e-06,0.142,0.533,97.008
4,Drake,"['canadian hip hop', 'canadian pop', 'hip hop'...",2023-04-07,Search & Rescue,91,spotify:track:7aRCf5cLOFN1U7kvtChY1G,272112,0.817,0.44,10,-8.482,0.0734,0.0603,1.42e-06,0.33,0.544,142.024


ENRICH DATASET

In [74]:
for index, row in clean.iterrows():
    artist = row["artists"]
    track_name = row["name"]
    file_path = f"song_lyrics/{artist}-{track_name}.txt"
    if os.path.exists(file_path):
        print("File exists, skipping.")
        continue

    lyr = fetch_lyrics(artist, track_name)
    if lyr:
        save_lyrics(lyr[0], file_path)
    else:
        print(f"FAILED TO FETCH LYRICS FOR: {track_name} BY {artist}")

File exists, skipping.
File exists, skipping.

ARTIST: FIFTY FIFTY
SONG TITLE: Cupid - Twin Ver.
Fetching: https://genius.com/fifty-fifty-cupid---twin-ver-lyrics
FAILED TO FETCH LYRICS FOR: Cupid - Twin Ver. BY FIFTY FIFTY
File exists, skipping.

ARTIST: Drake
SONG TITLE: Search & Rescue
Fetching: https://genius.com/drake-search--rescue-lyrics
FAILED TO FETCH LYRICS FOR: Search & Rescue BY Drake
File exists, skipping.
File exists, skipping.
File exists, skipping.
File exists, skipping.

ARTIST: Rema
SONG TITLE: Calm Down (with Selena Gomez)
Fetching: https://genius.com/rema-calm-down-with-selena-gomez-lyrics
FAILED TO FETCH LYRICS FOR: Calm Down (with Selena Gomez) BY Rema
File exists, skipping.

ARTIST: Drake
SONG TITLE: Rich Flex
Fetching: https://genius.com/drake-rich-flex-lyrics
FAILED TO FETCH LYRICS FOR: Rich Flex BY Drake
File exists, skipping.
File exists, skipping.

ARTIST: Drake
SONG TITLE: Jimmy Cooks (feat. 21 Savage)
Fetching: https://genius.com/drake-jimmy-cooks-feat-21-s

FileNotFoundError: [Errno 2] No such file or directory: 'song_lyrics/Radiohead-Weird Fishes/ Arpeggi.txt'

In [75]:
print(len(os.listdir("song_lyrics")))

2671


EDA SECTION