IMPORT SECTION

In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Lift the pandas display limits (columns, rows, total width) by setting each
# option to None, so frames shown later in the notebook are not truncated.
for display_option in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(display_option, None)

# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('cmudict')
# nltk.download('vader_lexicon')
# nltk.download('words')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [3]:
from utils.curation_utils import transfer_to_categorical, remove_duplicates_and_drop_na, repair_numeric_missing_vals, \
    outlier_detection_iqr, my_dist_to_avg
from utils.plot_utils import plot_frequent_elements, plot_cross_tabulation, get_highly_correlated_cols, \
    transfer_str_to_numeric_vals
from utils.spotify_utils import fetch_tracks, fetch_track_data
from utils.general_utls import GENRE_LIST, save_dataset, load_dataset

FETCH SONGS DATA USING SPOTIFY API

In [4]:
# Fetch configuration: output file, search query, and tracks wanted per genre.
dataset_file = "dataset_test.csv"
query = "year:1990-2023"
song_per_genre = 1


def pages_needed(total, page_size=50):
    """Return how many search pages are needed to cover `total` tracks.

    The count is rounded UP so that a trailing partial page is still
    requested: 120 tracks -> 3 pages (the fetch loop then asks for 50, 50
    and 20 via its `min(50, song_per_genre-50*offset)` limit). Rounding down
    would silently drop those last 20 tracks.

    Never returns less than 1, so the fetch loop always runs at least once
    per genre.
    """
    return max(1, int(-(-total // page_size)))


# Number of 50-track pages the fetch loop requests for each genre.
offset_range = pages_needed(song_per_genre)
# Announce how many tracks are requested in total (per-genre quota x genres).
total_tracks = song_per_genre*len(GENRE_LIST)
print(f"GOING TO FETCH {total_tracks} TRACKS")

GOING TO FETCH 22 TRACKS


In [5]:
# Build one record per track: for every genre, request `offset_range` pages of
# search results and enrich each returned track with `fetch_track_data`.
# Failures are reported and skipped (best effort) rather than aborting the run.
dataset = []
for genre in GENRE_LIST:
    for offset in range(offset_range):
        # At most 50 tracks per request; the last page may be smaller.
        limit = min(50, song_per_genre-50*offset)
        try:
            print(f"\nFETCHING {song_per_genre} {genre.upper()} SONGS, LIMIT: {limit}")
            res = fetch_tracks(query + f" genre:{genre}", limit=limit, offset=offset*50)
        except Exception as e:
            # The search request itself failed: skip this page only.
            print(f"FAILED FETCHING TRACKS: {e}")
            continue

        if not res:
            print(f"FAILED TO FETCH {genre.upper()} SONGS")
            continue

        for index, track in enumerate(res):
            # Each track gets its own handler so that one bad track does not
            # discard the remaining tracks of the same page.
            try:
                print(f"\nPARSING SONG DATA ({index+1})")
                data = {"source_genre": genre}
                track_data = fetch_track_data(track)
                data.update(track_data)
                dataset.append(data)
                print("SONG DATA SAVED")
            except Exception as e:
                print(f"FAILED PARSING SONG DATA ({index+1}): {e}")

# Persist whatever was collected, including partial results.
save_dataset(dataset, dataset_file, False)


FETCHING 1 POP SONGS, LIMIT: 1
SEARCH QUERY: year:1990-2023 genre:pop

PARSING SONG DATA (1)
EXTRACTING TRACK FEATURES, AUDIO FEATURES = False
INIT HANDLER FOR FILE: song_lyrics/taylor-swift-cruel-summer.txt
EXTRACTED FEATURES:
{'line_cnt': 79, 'word_cnt': 467, 'unique_words_ratio': 0.302, 'stop_words_ratio': 0.445, 'slang_words_ratio': 0.013, 'intro_cnt': 1, 'outro_cnt': 1, 'verse_cnt': 2, 'chorus_cnt': 2, 'neg': 0.171, 'neu': 0.646, 'pos': 0.183, 'compound': -0.8188}
SONG DATA SAVED

FETCHING 1 ROCK SONGS, LIMIT: 1
SEARCH QUERY: year:1990-2023 genre:rock

PARSING SONG DATA (1)
EXTRACTING TRACK FEATURES, AUDIO FEATURES = False
INIT HANDLER FOR FILE: song_lyrics/the-neighbourhood-sweater-weather.txt
EXTRACTED FEATURES:
{'line_cnt': 65, 'word_cnt': 356, 'unique_words_ratio': 0.374, 'stop_words_ratio': 0.528, 'slang_words_ratio': 0.02, 'intro_cnt': 0, 'outro_cnt': 0, 'verse_cnt': 2, 'chorus_cnt': 3, 'neg': 0.064, 'neu': 0.814, 'pos': 0.123, 'compound': 0.9829}
SONG DATA SAVED

FETCHING 

In [None]:
# df = load_dataset(dataset_file)
# recalculate_dataset(df)
# save_dataset(df, dataset_file, True)

In [None]:
# Read the saved dataset back from `dataset_file`, then report its shape and
# its per-column summary.
dataset = load_dataset(dataset_file)
print("SHAPE: {}\n".format(dataset.shape))
dataset.info()

CLEAN DATASET

In [None]:
# Cleaning pipeline, one named stage per step:
#   1. drop duplicates / missing rows
#   2. run IQR outlier detection with `my_dist_to_avg`
#   3. repair numeric missing values using the numeric columns of stage 1
dup_na_removed = remove_duplicates_and_drop_na(dataset)
outliers = outlier_detection_iqr(dup_na_removed, my_dist_to_avg)
# Taken after the outlier step on purpose, to keep the original evaluation order.
numeric_part = dup_na_removed.select_dtypes(include='number')
repaired = repair_numeric_missing_vals(outliers, numeric_part)
repaired.info()

In [None]:
# Pairwise scatter plots of the repaired frame on a 20x20 inch figure.
scatter_matrix_size = (20, 20)
pd.plotting.scatter_matrix(repaired, figsize=scatter_matrix_size)

In [None]:
# Bin edges used to turn numeric columns into categorical ranges.
#
# Column names follow the feature names printed by the fetch step
# ('line_cnt', 'word_cnt', 'unique_words_ratio', 'stop_words_ratio',
# 'chorus_cnt', 'verse_cnt'); the previous 'unique_words', 'stopwords_perc',
# 'chorus_count' and 'verse_count' keys did not match them.
# NOTE(review): confirm the names against `repaired.info()`.
numeric_to_bin_dict = {
    "duration": [0, 150000, 300000, 450000, 600000, 750000],
    "popularity": [0, 20, 40, 60, 80, 100],
    "line_cnt": [0, 50, 100, 150, 200, 250],
    "word_cnt": [0, 300, 600, 900, 1200, 1500],
    # Ratio features are fractions (e.g. 0.302, 0.445 in the fetch log), so
    # they are binned on a 0..1 scale instead of a count scale.
    "unique_words_ratio": [0, 0.2, 0.4, 0.6, 0.8, 1.0],
    # Lower edge sits just below zero, mirroring the original -1 edge --
    # presumably so an exact 0 still lands in the first bin; TODO confirm
    # against transfer_to_categorical.
    "stop_words_ratio": [-0.01, 0.2, 0.4, 0.6, 0.8, 1.0],
}

categorical_cols = ['chorus_cnt', 'verse_cnt']

# Surface naming drift early instead of failing deep inside the helper.
missing_bin_cols = [name for name in [*numeric_to_bin_dict, *categorical_cols]
                    if name not in repaired.columns]
if missing_bin_cols:
    print(f"WARNING: COLUMNS NOT FOUND IN DATASET: {missing_bin_cols}")

transferred = transfer_to_categorical(repaired, numeric_to_bin_dict, categorical_cols)

In [None]:
# Inspect the result of the categorical transfer (output of `.info()`).
transferred.info()

EDA SECTION

In [None]:
# One row per requested plot: (plot type, column to count, number of top elements).
frequent_plot_specs = [
    ('bar', 'common_genre', 6),
    ('line', 'release_date', 6),
    ('pie', 'artists', 6),
]
df_params = pd.DataFrame(frequent_plot_specs,
                         columns=['plot_type', 'col_name', 'num_top_elements'])
plot_frequent_elements(transferred, df_params)

In [None]:
# Default figure size (width, height in inches) for the plots that follow.
plt.rcParams.update({"figure.figsize": (18, 6)})

In [None]:
# Bar chart of how many rows fall under each 'common_genre' value.
print("\nCommon Genre Distribution:")
genre_ax = sns.countplot(data=transferred, x='common_genre')
plt.show()

In [None]:
# Histogram of each numeric feature.
# Names follow the binning cell ('line_cnt', 'word_cnt') and the feature names
# printed by the fetch step ('unique_words_ratio', 'stop_words_ratio'); the
# previous 'lines_count' / 'word_count' / 'unique_words' / 'stopwords_count'
# matched neither.
# NOTE(review): confirm against `transferred.info()`.
numerical_cols = ['duration', 'popularity', 'line_cnt', 'word_cnt', 'unique_words_ratio',
                  'stop_words_ratio']

# Report absent columns once and skip them, so one renamed feature does not
# abort the remaining plots with a KeyError.
missing_numerical_cols = [name for name in numerical_cols if name not in transferred.columns]
if missing_numerical_cols:
    print(f"SKIPPING MISSING COLUMNS: {missing_numerical_cols}")

for col in numerical_cols:
    if col in missing_numerical_cols:
        continue
    sns.histplot(transferred[col])
    plt.title(col + " Distribution")
    plt.show()

In [None]:
# Histogram of each sentiment score column, one figure per column.
sentiment_cols = ['neg', 'neu', 'pos', 'compound']
for col in sentiment_cols:
    hist_ax = sns.histplot(transferred[col])
    hist_ax.set_title(f"{col} Distribution")
    plt.show()

In [None]:
# Cross-tabulate release date and popularity against the genre column.
cross_tab_cols = ['release_date', 'popularity']
cross_tab_target = 'common_genre'
plot_cross_tabulation(transferred, cross_tab_cols, cross_tab_target)

In [None]:
# Unpack the two results of the correlation helper. The next cell sorts
# `correlations` and uses each `tuple_arr` entry as a pair of column positions
# into `transferred.columns`.
correlations, tuple_arr = get_highly_correlated_cols(transferred)

In [None]:
# List the correlated column pairs from weakest to strongest value
# (ascending order of `correlations`).
cols_for_correlations = transferred.columns
indx_sort = np.argsort(correlations)
for n_correlation in indx_sort:
    # Each entry is a pair of positional indices into the column index.
    col_lt, col_rt = tuple_arr[n_correlation]
    col_name_lt = cols_for_correlations[col_lt]
    col_name_rt = cols_for_correlations[col_rt]
    title = "corr('{}', '{}')={:4.2f}".format(col_name_lt, col_name_rt,
                                              correlations[n_correlation])
    print(title)

In [None]:
# Replace `transferred` with the output of the string-to-numeric conversion and
# show its summary.
# NOTE(review): the name is rebound to its own transformed value, so re-running
# this cell feeds already-converted data back into the helper -- confirm the
# helper is idempotent, or re-run the binning cell first.
transferred = transfer_str_to_numeric_vals(transferred)
transferred.info()

In [None]:
# Split the data with 'common_genre' as the target, fit a "KNN" classifier and
# predict on the held-out part.
# NOTE(review): `split_to_train_and_test` and `get_classifier_obj` are not
# imported in any cell shown above -- on a fresh kernel this cell raises
# NameError. Add the import to the import cell once the home module is confirmed.
# NOTE(review): 0.2 and 5 look like the test fraction and a seed -- confirm
# against the helper's signature.
X_train, X_test, y_train, y_test = split_to_train_and_test(transferred, 'common_genre', 0.2, 5)

# Classifier hyper-parameters handed to the factory as-is.
params = {'n_neighbors':47}
clf = get_classifier_obj("KNN",params)
clf.fit(X_train, y_train)
y_predicted = clf.predict(X_test)

In [None]:
# Score the predictions from the previous cell; only accuracy is currently
# enabled, the other metrics are kept below as commented-out calls.
# NOTE(review): `calc_evaluation_val` is not imported in any cell shown above --
# confirm where it is defined before relying on Restart & Run All.
accuracy_val = calc_evaluation_val("accuracy", y_test, y_predicted)
print(accuracy_val)
# precision_val = calc_evaluation_val("precision", y_test, y_predicted)
# print(precision_val)
# recall_val = calc_evaluation_val("recall", y_test, y_predicted)
# print(recall_val)
# f1_val = calc_evaluation_val("f1", y_test, y_predicted)
# print(f1_val)
# confusion_matrix_val = calc_evaluation_val("confusion_matrix", y_test, y_predicted)