## Data Preparation

In [12]:
# Import required libraries 
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler,OneHotEncoder

# Raise the column display limit to 200 so wide frames render without elision
pd.options.display.max_columns = 200

In [13]:
# Load the raw Spotify dataset from the Resources folder into a DataFrame.
# NOTE(review): the saved output of this cell ends with a warning-traceback
# line; if that is a pandas DtypeWarning (mixed-type columns), consider passing
# an explicit `dtype=` here — TODO confirm against a fresh run.
spotify_df = pd.read_csv(
    filepath_or_buffer=Path("../Resources/spotify_data.csv"),
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
# Keep only the columns that contain no missing values; any column with at
# least one null is dropped. (Per the original author's note, the affected
# columns were created by the dataset's creators using their NLP processing.)
# `dropna` returns a new DataFrame, so no explicit `.copy()` is needed.
spotify_df = spotify_df.dropna(axis="columns", how="any")

In [15]:
# Drop unnecessary/irrelevant/redundant columns.
# Each label is listed exactly once (the original list repeated 'Cluster').
# NOTE: `drop` raises KeyError for a missing label, so re-running this cell
# without re-running the cells above it will fail once the columns are gone.
spotify_df = spotify_df.drop(
    columns=[
        'Album/Single',
        'Genre',
        'Genre_new',
        'Explicit',
        'Album',
        'Release_date',
        'Cluster',
        'Popu_max',
        'Popularity',
    ]
)

# Preview the first rows of the trimmed frame
spotify_df.head()

Unnamed: 0,Country,Uri,Title,Artist,Track_number,Tracks_in_album,danceability,energy,key,loudness,mode,speechiness,acoustics,instrumentalness,liveliness,valence,tempo,duration_ms,time_signature,Explicit_false,Explicit_true,album,compilation,single,bolero,boy band,country,dance/electronic,else,funk,hip hop,house,indie,jazz,k-pop,latin,metal,opm,pop,r&b/soul,rap,reggae,reggaeton,rock,trap,bing_norm_negative,bing_norm_neutral,bing_norm_positive,Argentina,Australia,Austria,Belgium,Brazil,Canada,Chile,Colombia,Costa Rica,Denmark,Ecuador,Finland,France,Germany,Global,Indonesia,Ireland,Italy,Malaysia,Mexico,Netherlands,New Zealand,Norway,Peru,Philippines,Poland,Portugal,Singapore,Spain,Sweden,Switzerland,Turkey,UK,USA,Top10_dummy,Top50_dummy
0,Global,https://open.spotify.com/track/6FyRXC8tJUh863J...,adan y eva,Paulo Londra,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,258639,4,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
1,USA,https://open.spotify.com/track/6FyRXC8tJUh863J...,adan y eva,Paulo Londra,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,258639,4,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Argentina,https://open.spotify.com/track/6FyRXC8tJUh863J...,adan y eva,Paulo Londra,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,258639,4,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3,Belgium,https://open.spotify.com/track/6FyRXC8tJUh863J...,adan y eva,Paulo Londra,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,258639,4,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Switzerland,https://open.spotify.com/track/6FyRXC8tJUh863J...,adan y eva,Paulo Londra,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,258639,4,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [16]:
# Persist the prepared frame to disk; the row index is written as the first
# (unnamed) column, which is the pandas default made explicit here.
spotify_df.to_csv(path_or_buf="../Resources/spotify_data_prepared.csv", index=True)

## Extracting US data

In [17]:
# Remove the track Uri column from the full dataset
spotify_df = spotify_df.drop(columns=['Uri'])

# Build the US-only frame in one chain:
#   1. keep the rows whose Country is "USA"
#   2. drop the Country column and the three bing_norm_* columns
#   3. take an independent copy so later edits do not touch spotify_df
spotify_us_df = (
    spotify_df
    .loc[spotify_df['Country'] == "USA"]
    .drop(columns=["Country", "bing_norm_negative", "bing_norm_neutral", "bing_norm_positive"])
    .copy()
)

# Preview the first rows of the US subset
spotify_us_df.head()

Unnamed: 0,Title,Artist,Track_number,Tracks_in_album,danceability,energy,key,loudness,mode,speechiness,acoustics,instrumentalness,liveliness,valence,tempo,duration_ms,time_signature,Explicit_false,Explicit_true,album,compilation,single,bolero,boy band,country,dance/electronic,else,funk,hip hop,house,indie,jazz,k-pop,latin,metal,opm,pop,r&b/soul,rap,reggae,reggaeton,rock,trap,Argentina,Australia,Austria,Belgium,Brazil,Canada,Chile,Colombia,Costa Rica,Denmark,Ecuador,Finland,France,Germany,Global,Indonesia,Ireland,Italy,Malaysia,Mexico,Netherlands,New Zealand,Norway,Peru,Philippines,Poland,Portugal,Singapore,Spain,Sweden,Switzerland,Turkey,UK,USA,Top10_dummy,Top50_dummy
1,adan y eva,Paulo Londra,1,1,0.767,0.709,1,-4.47,1,0.336,0.323,0.0,0.0676,0.72,171.993,258639,4,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
17,it wont kill ya,The Chainsmokers - Louane,7,12,0.572,0.53,6,-8.521,0,0.0654,0.0647,0.000169,0.127,0.12,170.138,217613,4,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
40,hymn,Kesha,1,1,0.488,0.538,6,-4.974,1,0.076,0.147,0.0,0.305,0.38,67.037,205600,4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
59,arrows,Foo Fighters,7,11,0.515,0.917,1,-7.312,1,0.0417,0.000209,0.000476,0.145,0.409,121.988,266187,4,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
81,talking to myself,Linkin Park,3,10,0.593,0.712,2,-6.325,0,0.0286,0.00184,0.0,0.128,0.473,124.013,231307,4,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [18]:
# Save the US-only frame to a csv file.
# The original wrote `spotify_df` (all countries) to this path; the file name
# and the section show the US subset is what is meant to be saved.
spotify_us_df.to_csv("../Resources/spotify_us_df.csv")