In [20]:
import numpy as np
import pandas as pd
import os

# Resolve the datasets folder relative to the directory the kernel was
# started in, then load the track table and the movie table from it.
cwd = os.getcwd()
datasets_dir = os.path.join(cwd, "datasets")

df = pd.read_csv(os.path.join(datasets_dir, "updated_music.csv"))
movies = pd.read_csv(os.path.join(datasets_dir, "movies.csv"))

In [22]:
# NOTE: this expression is evaluated but never shown, because a notebook
# only echoes the last expression of a cell.
df.shape

# frac=1 samples every row (a full shuffle); the fixed random_state makes
# the shuffled order reproducible. The reindex keeps the original column order.
shuffled_df = df.sample(frac=1, random_state=42).reindex(columns=df.columns)

In [23]:
# Preview the first rows of the shuffled frame.
shuffled_df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,movie_genre
113186,113186,6KwkVtXm8OUp2XffN5k7lY,Hillsong Worship,No Other Name,No Other Name,50,440247,False,0.369,0.598,...,1,0.0304,0.00511,0.0,0.176,0.0466,148.014,4,world-music,Biography
42819,42819,2dp5I5MJ8bQQHDoFaNRFtX,Internal Rot,Grieving Birth,Failed Organum,11,93933,False,0.171,0.997,...,1,0.118,0.00521,0.801,0.42,0.0294,122.223,4,grindcore,Horror
59311,59311,5avw06usmFkFrPjX8NxC40,Zhoobin Askarieh;Ali Sasha,Noise A Noise 20.4-1,"Save the Trees, Pt. 1",0,213578,False,0.173,0.803,...,0,0.144,0.613,0.00191,0.195,0.0887,75.564,3,iranian,Family
91368,91368,75hT0hvlESnDJstem0JgyR,Bryan Adams,All I Want For Christmas Is You,Merry Christmas,0,151387,False,0.683,0.511,...,1,0.0279,0.406,0.000197,0.111,0.598,109.991,3,rock,Action
61000,61000,4bY2oZGA5Br3pTE1Jd1IfY,Nogizaka46,バレッタ TypeD,月の大きさ,57,236293,False,0.555,0.941,...,0,0.0481,0.484,0.0,0.266,0.813,92.487,4,j-idol,Drama


In [24]:
# Model inputs are listed explicitly instead of being "everything that was
# not dropped". That keeps stray index columns written by earlier CSV
# round-trips (e.g. 'Unnamed: 0.1') out of the feature matrix, and it pins
# the column order that the prediction cells must reproduce when they build
# a feature vector from the Spotify audio-features response.
FEATURE_COLUMNS = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature',
]

X = shuffled_df[FEATURE_COLUMNS]
# Target: the movie genre associated with each track.
y = shuffled_df['movie_genre']

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

# X holds the input features and y the target labels.

# The rows were already shuffled with a fixed seed, so an ordered split
# (shuffle=False) simply trains on the first 70% and holds out the last 30%.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

# Fit a random-forest classifier; random_state keeps the forest reproducible.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
predictions = rf_classifier.predict(X_test)

from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.31625730994152046


In [26]:
# Show the labels predicted for the held-out rows.
print(predictions)

['Comedy' 'Action' 'Biography' ... 'Thriller' 'Romance' 'Musical']


In [27]:
# Preview the first rows of the feature matrix X.
X.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
113186,0.369,0.598,7,-6.984,1,0.0304,0.00511,0.0,0.176,0.0466,148.014,4
42819,0.171,0.997,7,-3.586,1,0.118,0.00521,0.801,0.42,0.0294,122.223,4
59311,0.173,0.803,9,-10.071,0,0.144,0.613,0.00191,0.195,0.0887,75.564,3
91368,0.683,0.511,6,-5.598,1,0.0279,0.406,0.000197,0.111,0.598,109.991,3
61000,0.555,0.941,9,-3.294,0,0.0481,0.484,0.0,0.266,0.813,92.487,4


In [28]:
# Importance score of each input column, as reported by the fitted forest.
feature_importances = rf_classifier.feature_importances_

# Pair every column name with its score and rank the pairs from the most
# to the least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

In [29]:
# Print the feature-importance table, sorted from highest to lowest score.
print(feature_importance_df)

             Feature  Importance
6       acousticness    0.111970
10             tempo    0.106327
0       danceability    0.106194
9            valence    0.104505
5        speechiness    0.103198
1             energy    0.102069
3           loudness    0.100979
8           liveness    0.093300
7   instrumentalness    0.081454
2                key    0.061753
4               mode    0.017540
11    time_signature    0.010710


In [30]:
import getpass
import os
import re

import requests

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Audio features fed to the classifier, in the column order it was trained on.
MODEL_FEATURES = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature',
]
# Maximum number of track IDs accepted by one audio-features request.
AUDIO_FEATURES_BATCH = 100

# SECURITY: API credentials must never be committed in a notebook. They are
# read from the environment and only prompted for (without echo) when they
# are missing. Any credentials that were previously hard-coded here should
# be treated as leaked and rotated in the Spotify developer dashboard.
for _credential in ("SPOTIPY_CLIENT_ID", "SPOTIPY_CLIENT_SECRET"):
    if not os.environ.get(_credential):
        os.environ[_credential] = getpass.getpass(f"{_credential}: ")

# SpotifyClientCredentials() picks the two values up from the environment.
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

playlistURL = input("Yo, put in your playlist URL, mayte")
# Extract the 22-character playlist ID instead of slicing fixed offsets, so
# that share links with a query string or locale prefix,
# "spotify:playlist:<id>" URIs and bare IDs are all accepted, e.g.
# https://open.spotify.com/playlist/37i9dQZF1EQnqst5TRi17F?si=b9e46d45123e454e
_id_match = (
    re.search(r"playlist[/:]([0-9A-Za-z]{22})", playlistURL)
    or re.fullmatch(r"\s*([0-9A-Za-z]{22})\s*", playlistURL)
)
if _id_match is None:
    raise ValueError(f"Could not find a Spotify playlist ID in {playlistURL!r}")
playlistID = _id_match.group(1)

# Fetch the playlist tracks, following pagination until the last page.
results = sp.playlist_tracks(playlistID)
tracks = results['items']
while results['next']:
    results = sp.next(results)
    tracks.extend(results['items'])

# Some playlist entries have no usable track or track ID; skip them rather
# than crash on a None lookup.
track_ids = [
    item['track']['id']
    for item in tracks
    if item.get('track') and item['track'].get('id')
]

# Request audio features in batches instead of one HTTP call per track.
# Entries for which the API returns no features (None) are skipped.
feature_rows = []
for _start in range(0, len(track_ids), AUDIO_FEATURES_BATCH):
    _batch = sp.audio_features(track_ids[_start:_start + AUDIO_FEATURES_BATCH]) or []
    feature_rows.extend(
        [features[name] for name in MODEL_FEATURES]
        for features in _batch
        if features is not None
    )

# Count how many tracks fall into each predicted movie genre.
genres_playlist = {}
if feature_rows:
    # A DataFrame with the training column names keeps the feature order
    # explicit and avoids scikit-learn's "X does not have valid feature
    # names" warning.
    predicted_genres = rf_classifier.predict(pd.DataFrame(feature_rows, columns=MODEL_FEATURES))
    for _pos in range(len(predicted_genres)):
        # Keys keep the "['Genre']" form (string of a one-element array)
        # because the recommendation cell uses them verbatim as file names.
        label = str(predicted_genres[_pos:_pos + 1])
        genres_playlist[label] = genres_playlist.get(label, 0) + 1



In [31]:
# Print how many playlist tracks were assigned to each predicted genre.
print(genres_playlist)

{"['Sci-Fi']": 9, "['Musical']": 5, "['Action']": 22, "['Drama']": 9, "['Romance']": 4, "['History']": 1}


In [44]:
# The three genres predicted most often for the playlist. sorted() is stable,
# so ties keep the order in which the genres were first seen.
top_3_genres = sorted(genres_playlist, key=genres_playlist.get, reverse=True)[:3]

print(top_3_genres)

for genre in top_3_genres:
    # The genre key (e.g. "['Action']") is used verbatim as the file stem.
    # A dedicated name is used so the `movies` frame loaded at the top of the
    # notebook is not overwritten.
    genre_movies = pd.read_csv(os.path.join(cwd, "datasets", "splitted", f"{genre}.csv"))

    # Pick up to five random titles; capping n avoids the ValueError that
    # sample() raises when a genre file holds fewer than five rows.
    movie_rows = genre_movies.sample(n=min(5, len(genre_movies)))

    movie_names = movie_rows['movie_name'].tolist()

    print(movie_names)

["['Action']", "['Sci-Fi']", "['Drama']"]
['The Homecoming of Jimmy Whitecloud', 'Acorralado', 'The Magnificent Butcher', 'M.M.M. 83', 'Black Site']
['Sci-Fighter', 'Alien Theory', 'Caçadores de Espécies e o Símbolo Secreto', 'Time Twister', 'Hide and Seek']
['Eri Naam Dosti', 'Scarlight', 'Recreation', 'Survival Zone', 'Les égouts du paradis']


In [33]:
track_name = "Stay In My Corner"

# Look the track up by name and take the first search hit.
results = sp.search(q='track:' + track_name, type='track')

matches = results['tracks']['items']
if not matches:
    # Fail with a clear message instead of a bare IndexError on [0].
    raise ValueError(f"No Spotify track found for {track_name!r}")
track = matches[0]

# Extract track ID
track_id = track['id']

# Get audio features of the track; the entry is None when none are available.
_features_response = sp.audio_features(track_id)
audio_features = _features_response[0] if _features_response else None
if audio_features is None:
    raise ValueError(f"No audio features available for track {track_id!r}")

# Build the feature vector in the column order the classifier was trained on,
# rather than relying on the key order of the API response.
song_data_key = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature',
]
song_data = [audio_features[name] for name in song_data_key]

print(song_data_key)
print(song_data)

# A one-row DataFrame with named columns matches the training input and
# avoids scikit-learn's missing-feature-names warning.
genre = rf_classifier.predict(pd.DataFrame([song_data], columns=song_data_key))
print(type(genre))

['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
[0.437, 0.689, 11, -6.003, 0, 0.0343, 0.0409, 0.00824, 0.147, 0.467, 143.451, 4]
<class 'numpy.ndarray'>


