### Popmodel EDA
###### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

import sweetviz as sv

from sklearn.preprocessing import StandardScaler, OrdinalEncoder

###### Notebook Settings

In [None]:
# Raise pandas' display limits so wide and long frames are printed in full.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
pd.set_option("display.width", 1000)

# Seaborn grid style first, then the matplotlib defaults used by every plot:
# large square figures and a bigger base font.
sns.set_style("whitegrid")
plt.rcParams.update({
    "figure.figsize": (12.0, 12.0),
    "font.size": 14,
})

###### Feature Engineering

In [None]:
# Read CSVs
# data: the track table (one row per track/genre pair, see the de-duplication
# further down). add_infos: supplementary per-track columns used below
# ("id", "explicit", "track_number", "album").
data = pd.read_csv("SpotifyData.csv")
add_infos = pd.read_csv("additional_infos.csv")

In [None]:
# Adjust inconsistent genre naming: the same genre appears with a typographic
# apostrophe and with a straight apostrophe; fold both spellings into one label.
children_variants = ['Children’s Music', "Children's Music"]
data.loc[data["genre"].isin(children_variants), "genre"] = "Children Music"

In [None]:
# A track listed under several genres occupies several rows. Collapse those
# rows into one comma-separated genre string per track id.
genre_df = (
    data.groupby(["track_id"])["genre"]
    .apply(", ".join)
    .reset_index()
)

In [None]:
# Keep one row per track id, order the rows by track id and renumber the index
# from 0. All three steps modify `data` in place; the track_id column itself
# is kept.
data.drop_duplicates(subset="track_id", inplace=True)
data.sort_values(by="track_id", inplace=True)
data.reset_index(drop=True, inplace=True)

In [None]:
#Assign Features
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# in-place column assignments made through X are also visible through `data`
# for as long as X is not rebound to a new object.
X = data

In [None]:
# Replace the genre column with the joined per-track genre string.
# The lookup is keyed on track_id instead of relying on X and genre_df sharing
# the same row order and index, so a different sort order or a filtered frame
# cannot silently attach genres to the wrong track.
X["genre"] = X["track_id"].map(genre_df.set_index("track_id")["genre"])

In [None]:
# Genres are encoded separately because one row can carry several
# comma-separated genres: str.get_dummies yields one indicator column per
# genre. key, mode and time_signature are one-hot encoded the regular way.
genre_dummies = X["genre"].str.get_dummies(sep=", ")
categorical_dummies = pd.get_dummies(X[["key", "mode", "time_signature"]])
X = pd.concat([X, genre_dummies, categorical_dummies], axis=1)

# The source columns are redundant once their indicator columns exist.
X = X.drop(columns=["key", "mode", "time_signature", "genre"])

In [None]:
# Add the explicit flag and track number from the external dataset.
#
# add_infos is sorted by id so that its k-th row describes the k-th row of X.
# The index is renumbered after sorting: sort_values keeps the pre-sort labels,
# and any label-based access (column assignment into X, add_infos["album"][i])
# would otherwise fall back to the unsorted row order.
add_infos.sort_values("id", inplace=True)
add_infos.reset_index(drop=True, inplace=True)

# Fail loudly unless the ids agree row by row. np.array_equal is False for
# differing lengths as well as for any mismatching element.
if not np.array_equal(add_infos["id"].values, X["track_id"].values):
    raise ValueError("add_infos ids do not match X track ids row by row")

# .values makes the assignment positional, independent of either index.
X["explicit"] = add_infos["explicit"].values
X["track_number"] = add_infos["track_number"].values

In [None]:
# Get the release year from the stringified album record and store it as a
# new feature.
# The four characters starting 16 positions after the first "release_date"
# are read as the year (presumably the text looks like
# 'release_date': 'YYYY-... -- confirm against the data).
# Rows are visited in their current order (not by index label), so the result
# lines up positionally with add_infos as it is sorted.
release_years = []
for album in add_infos["album"]:
    year_start = album.find("release_date") + 16
    release_years.append(int(album[year_start:year_start + 4]))
X["release_date"] = release_years

# Year 0 marks an unknown release date. Replace it with the rounded mean of the
# known years only; including the zeros would pull the imputed year down.
unknown_year = X["release_date"] == 0
if unknown_year.any() and not unknown_year.all():
    known_mean = X.loc[~unknown_year, "release_date"].mean()
    X.loc[unknown_year, "release_date"] = int(round(known_mean))

In [None]:
# Get the total track count from the stringified album record and store it as
# a new feature.
# The number starts 15 positions after the first "total_tracks" and runs for as
# many digits as follow, so any width and any terminating character work.
# Rows are visited in their current order (not by index label), so the result
# lines up positionally with add_infos as it is sorted.
total_tracks = []
for album in add_infos["album"]:
    digits_start = album.find("total_tracks") + 15
    digits_end = digits_start
    while digits_end < len(album) and album[digits_end] in "0123456789":
        digits_end += 1
    # int() raises ValueError if no digits were found at the expected position.
    total_tracks.append(int(album[digits_start:digits_end]))
X["total_tracks"] = total_tracks

In [None]:
# Get the artist id from the stringified album record and store it as a new
# feature.
# The pattern anchors on a quoted `id` key followed by a quoted 22-character
# value. A bare search for "id" would also hit any other text that merely
# contains those two letters and then slice an unrelated fragment.
# NOTE(review): this takes the first `id` key in the album text; presumably
# that is the first artist's id -- confirm against the data.
artist_ids = add_infos["album"].str.extract(
    r"""['"]id['"]: ['"]([^'"]{22})""", expand=False
)
if artist_ids.isna().any():
    raise ValueError(
        f"could not find an id in {int(artist_ids.isna().sum())} album record(s)"
    )
# .values assigns positionally, matching add_infos' current row order.
X["artist_id"] = artist_ids.values

In [None]:
# Ordinal-encode the text-valued columns (each distinct value becomes a number).
# Columns that are no longer present -- genre, key, mode and time_signature are
# dropped once they have been one-hot encoded -- are skipped instead of raising
# KeyError, so the cell works both before and after the one-hot step.
# The encoder is refitted for every column, so `lenc` ends up holding the
# categories of the last column that was encoded.
lenc = OrdinalEncoder()

ordinal_columns = [
    "artist_id",
    "track_name",
    "artist_name",
    "genre",
    "key",
    "mode",
    "time_signature",
]
for column in ordinal_columns:
    if column not in X.columns:
        continue
    encoded = lenc.fit_transform(X[column].values.reshape(-1, 1))
    X[column] = encoded.ravel()

In [None]:
# Display the columns that still have object dtype.
X.select_dtypes("object")

In [None]:
# Scale all features to zero mean and unit variance.
# fit_transform returns a plain array; wrap it back into a DataFrame with the
# original column names and index so the following cells can keep using
# DataFrame methods (select_dtypes, groupby, ...) on X.
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Sweetviz: build a report for `data` and show it as HTML.
my_report = sv.analyze(data)
my_report.show_html()

In [None]:
# Sweetviz: build a report for the feature table X and show it as HTML.
my_report = sv.analyze(X)
my_report.show_html()

##### EDA

In [None]:
# Pairwise correlations between the numeric columns of `data`, as a heatmap.
# Text columns are excluded explicitly: on pandas >= 2.0 DataFrame.corr() no
# longer skips them silently and raises on non-numeric data instead.
corr = data.select_dtypes(include=["number", "bool"]).corr()

# Diverging palette centred on 0 with the colour scale fixed to [-1, 1].
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Rotate the column labels so long feature names do not overlap. The trailing
# semicolon keeps the notebook from echoing the returned label list.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Convert every column that still has object dtype to numbers. to_numeric is
# applied row by row (axis=1); values that cannot be parsed become NaN
# (errors='coerce').
# NOTE(review): this needs X to be a DataFrame -- confirm it is not run after a
# step that turned X into a plain array.
cols = X.select_dtypes("object").columns
X[cols] = X[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [None]:
# Display the current feature table.
X

In [None]:
# NOTE(review): a GroupBy object has no `unstack` method (that is a
# Series/DataFrame method), so this line looks like it raises AttributeError.
# It probably needs an aggregation between groupby() and unstack(); the
# intended one is not evident from this cell -- confirm with the author.
X.groupby('genre').unstack()

In [None]:
# Average popularity per genre. The aggregated column is labelled 'sum' even
# though it holds the mean.
df1 = X.groupby(by=["genre"])["popularity"].mean().reset_index(name="sum")

# Show the genres ordered by that value, laid out as one wide row per field.
df1.sort_values(by="sum").transpose()



In [None]:
# Highest popularity per genre (column labelled 'sum'), ordered by that value
# and transposed so that df1 has one row for the genres and one for the values.
df1 = (
    X.groupby(by=["genre"])["popularity"]
    .max()
    .reset_index(name="sum")
    .sort_values(by="sum")
    .transpose()
)
df1


In [None]:

# Bar chart of the per-genre popularity values held in df1.
# df1 is stored transposed (a 'genre' row and a 'sum' row), which seaborn
# cannot plot as categories against values. Turn it back into long form -- one
# row per genre -- and name the axes explicitly. A df1 that already has a
# 'genre' column is used as is.
plot_df = df1 if "genre" in df1.columns else df1.transpose()
# Transposing a mixed frame leaves object dtype; the bar heights must be numeric.
plot_df = plot_df.astype({"sum": float})
sns.barplot(data=plot_df, x="genre", y="sum")


In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

# Word cloud of the track titles of all rows whose genre is exactly "Pop".
# NOTE(review): if `data` carries the joined multi-genre strings, only tracks
# that are listed under Pop and nothing else match here -- confirm that this is
# the intended selection.
# Missing titles are dropped and the rest cast to str, because str.join raises
# TypeError on NaN.
pop_titles = data.loc[data["genre"] == "Pop", "track_name"].dropna().astype(str)
text = " ".join(pop_titles)

# Default stop words plus title boilerplate that would otherwise dominate.
stopwords = set(STOPWORDS)
stopwords.update(["Remix", "Remastered", "feat", "Version", "Remaster", "Acoustic", "Flat Major", "Instrumental"])

# The image is used as the mask that shapes the cloud. The file is closed as
# soon as its pixels have been copied into the array.
with Image.open("spotify.png") as mask_image:
    mask = np.array(mask_image)

# Create the word cloud and fill it from the collected titles.
wc = WordCloud(background_color="black", max_words=10000, mask=mask,
               stopwords=stopwords)
wc.generate(text)

# Store to file
wc.to_file("Pop.png")

# Show
plt.figure(figsize=[20,10])
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
