In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

In [2]:
def export_classifier_df_to_csv(df: pd.DataFrame, filename: str):
    df.to_csv("../data/" + filename)

In [3]:
# prepare new data for hypothesis testing
def add_to_classifier_df(df, classifier_name, prediction_results):
    df[classifier_name] = np.array(prediction_results)

def export_classifier_to_df(classifier_dataframe: pd.DataFrame, name: str):
    classifier_dataframe.to_csv('../data/' + name)

## APPROACH 1: Predict rank OF top 200 chart FROM viral 50 rank 
## (from viral 50 rank -> top 200 rank)
The goal of predicting the rank of the top 200 chart from the viral 50 rank is to use the popularity of a song on the viral chart as a means of predicting how well it will perform on the overall top 200 chart.

In [4]:
df_weekly_ranks = pd.read_csv("../data/weekly_ranks.csv")
df_weekly_ranks = df_weekly_ranks.drop(columns=['trend_viral_50'])
df_weekly_ranks.drop("Unnamed: 0",axis=1,inplace=True)
df_weekly_ranks.sample(10)

Unnamed: 0,title,date_viral,viral_50_rank,date_top200,top_200_rank
174,P*$$Y Fairy (OTW),2020-03-01,35,2020-03-08,175
1028,Freaks,2021-04-18,47,2021-04-25,152
1131,As the World Caves In,2021-06-06,21,2021-06-13,113
94,The Box,2020-01-26,2,2020-02-02,1
237,Sunday Best,2020-03-22,17,2020-03-29,15
916,telepatía,2021-02-21,6,2021-02-28,4
392,Roses - Imanbek Remix,2020-05-24,22,2020-05-31,13
953,Cloud 9,2021-03-14,24,2021-03-21,122
1095,4 Da Gang (with Roddy Ricch),2021-05-16,44,2021-05-23,50
618,Midnight Sky,2020-09-13,27,2020-09-20,46


In [5]:
df_classifier_approach_1 = pd.DataFrame()

In [6]:
xtrain, xtest, ytrain, ytest = train_test_split(df_weekly_ranks["viral_50_rank"].values,df_weekly_ranks["top_200_rank"].values, train_size=0.8)
xtrain = xtrain.reshape(-1,1)
xtest = xtest.reshape(-1,1)

In [7]:
# add truth values to dataframe
add_to_classifier_df(df_classifier_approach_1, 'truth', ytest)

### We are using SVM with a linear SVC to predict the rank of the top 200 chart based on the viral 50 rank.

In [8]:
# test with svm
svm_linear = SVC(kernel='linear')

svm_fit_linear = svm_linear.fit(xtrain, ytrain)
ypred = svm_fit_linear.predict(xtest)

In [9]:
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ypred, ytest)}')
print(f'r^2 score of support vector machines with linear kernel is: {svm_fit_linear.score(xtrain, ytrain)}')

accuracy of support vector machines with linear kernel is: 0.02258064516129032
r^2 score of support vector machines with linear kernel is: 0.025889967637540454


### We are using Support Vector Machine (SVM) with a polynomial kernel (SVC = poly) to predict the rank of the top 200 chart from the viral 50 rank.

In [10]:
svm_poly = SVC(kernel='poly')

svm_fit_poly = svm_poly.fit(xtrain, ytrain)
ypred = svm_fit_poly.predict(xtest)

In [11]:
print(f'accuracy of support vector machines with polynomial kernel is: {accuracy_score(ypred, ytest)}')
print(f'r^2 score of support vector machines with polynomial kernel is: {svm_fit_poly.score(xtrain, ytrain)}')

accuracy of support vector machines with polynomial kernel is: 0.025806451612903226
r^2 score of support vector machines with polynomial kernel is: 0.02831715210355987


In [12]:
poly_degree = 3
polyn_reg = make_pipeline(PolynomialFeatures(poly_degree),LinearRegression()) # create polynomial regression model

polyn_reg = polyn_reg.fit(xtrain, ytrain)
ypred = polyn_reg.predict(xtest)

In [13]:
polyn_reg.score(xtrain,ytrain)# R² score for training data

0.004664919469423712

## APPROACH 2: Predict rank OF top 200 chart FROM viral 50 trend and rank
## (from viral 50 rank & trend -> top 200 rank)
We try to predict the rank of the top 200 chart from the viral 50 trend and rank to gain insight into which songs are likely to rise in popularity.

In [14]:
df_weekly_ranks_with_trend = pd.read_csv("../data/weekly_ranks.csv")
df_weekly_ranks_with_trend.drop("Unnamed: 0",axis=1,inplace=True)
df_weekly_ranks_with_trend.sample(10)

Unnamed: 0,title,date_viral,viral_50_rank,trend_viral_50,date_top200,top_200_rank
1526,It's Called: Freefall,2021-12-12,36,MOVE_DOWN,2021-12-19,177
697,Dreams - 2004 Remaster,2020-10-18,42,MOVE_DOWN,2020-10-25,14
892,Heat Waves,2021-02-14,42,MOVE_UP,2021-02-21,18
442,ROCKSTAR (feat. Roddy Ricch),2020-06-21,41,MOVE_UP,2020-06-28,1
1261,Beggin',2021-08-15,25,SAME_POSITION,2021-08-22,11
1037,Slumber Party (feat. Princess Nokia),2021-04-18,18,MOVE_DOWN,2021-04-25,115
946,drivers license,2021-03-07,42,MOVE_DOWN,2021-03-14,3
1410,Cold Heart - PNAU Remix,2021-10-24,32,MOVE_DOWN,2021-10-31,34
1151,Straightenin,2021-06-13,37,MOVE_UP,2021-06-20,64
1019,Slumber Party (feat. Princess Nokia),2021-04-11,23,SAME_POSITION,2021-04-18,109


In [15]:
# prepare new data for hypothesis testing
def add_to_classifier_df(df, classifier_name, prediction_results):
    df[classifier_name] = np.array(prediction_results)

def export_classifier_to_df(classifier_dataframe: pd.DataFrame, name: str):
    classifier_dataframe.to_csv('../data/' + name)

In [16]:
df_classifier_approach_2 = pd.DataFrame()

In [17]:
xtrain, xtest, ytrain, ytest = train_test_split(df_weekly_ranks_with_trend[["viral_50_rank", "trend_viral_50"]], df_weekly_ranks_with_trend["top_200_rank"], train_size=0.8)

In [18]:
# add truth values to dataframe
add_to_classifier_df(df_classifier_approach_2, 'truth', ytest)

### We are using SVM with a linear SVC to predict the rank of the top 200 chart based on the viral 50 trend and rank.

In [19]:
svm_linear = Pipeline([('ohe', OneHotEncoder()), ('clf', SVC(kernel='linear'))])

svm_fit_linear = svm_linear.fit(xtrain, ytrain)
ypred = svm_fit_linear.predict(xtest)

# add to classifier dataframe
add_to_classifier_df(df_classifier_approach_2, 'SVC_linear', ypred)

In [20]:
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ypred, ytest)}')
print(f'r^2 score of support vector machines with linear kernel is: {svm_fit_linear.score(xtrain, ytrain)}')

accuracy of support vector machines with linear kernel is: 0.00967741935483871
r^2 score of support vector machines with linear kernel is: 0.1156957928802589


### We are using Support Vector Machine (SVM) to predict our results, with the radial basis function (rbf) kernel in the Support Vector Classifier (SVC) algorithm.

In [21]:
svm_linear_poly = Pipeline([('ohe', OneHotEncoder()), ('clf', SVC(kernel='rbf'))])

svm_fit_poly = svm_linear_poly.fit(xtrain, ytrain)
ypred = svm_fit_poly.predict(xtest)

# add to classifier dataframe
add_to_classifier_df(df_classifier_approach_2, 'SVC_rbf', ypred)

In [22]:
print(f'accuracy of support vector machines with polynomial kernel is: {accuracy_score(ypred, ytest)}')
print(f'r^2 score of support vector machines with polynomial kernel is: {svm_fit_poly.score(xtrain, ytrain)}')

accuracy of support vector machines with polynomial kernel is: 0.016129032258064516
r^2 score of support vector machines with polynomial kernel is: 0.12540453074433658


## APPROACH 3: Predict trend  OF  top 200  FROM  viral 50 chart position
## (from viral 50 rank -> top 200 trend)
We try to predict the trend of the top 200 songs by analyzing their position on the Viral 50 chart. By monitoring changes in popularity and engagement, we can anticipate which songs will rise to the top and which will fall out of favor.

In [23]:
def choose_freq_for_classification(freq):
    if freq == 1:
        return pd.read_csv('../data/daily_ranks_top_200.csv')
    elif freq == 3:
        return pd.read_csv('../data/3_days_ranks_top_200.csv')
    elif freq == 7:
        return pd.read_csv('../data/weekly_ranks_top_200.csv')
    else:
        print('Wrong input, taking weekly granularity!')
        return pd.read_csv('../data/weekly_ranks_top_200.csv')

In [24]:
df_trends_top_200 = choose_freq_for_classification(7)
df_trends_top_200

Unnamed: 0.1,Unnamed: 0,title,date_viral,viral_50_rank,date_top200,top_200_rank,trend_top_200
0,0,Adore You,2020-01-05,10,2020-01-12,18,MOVE_UP
1,1,"All I Want - From ""High School Musical: The Mu...",2020-01-05,2,2020-01-12,66,MOVE_UP
2,2,Ayy Macarena,2020-01-05,20,2020-01-12,143,MOVE_UP
3,3,Ballin' (with Roddy Ricch),2020-01-05,38,2020-01-12,13,SAME_POSITION
4,4,Blinding Lights,2020-01-05,12,2020-01-12,15,MOVE_UP
...,...,...,...,...,...,...,...
1541,9,SAD GIRLZ LUV MONEY Remix (feat. Kali Uchis an...,2021-12-19,47,2021-12-26,166,NEW_ENTRY
1542,10,Super Gremlin,2021-12-19,23,2021-12-26,6,MOVE_UP
1543,11,Surface Pressure,2021-12-19,6,2021-12-26,62,NEW_ENTRY
1544,12,We Don't Talk About Bruno,2021-12-19,1,2021-12-26,27,NEW_ENTRY


In [25]:
df_classifier_approach_3 = pd.DataFrame()

In [26]:
xtrain_3, xtest_3, ytrain_3, ytest_3 = train_test_split(df_trends_top_200[["viral_50_rank"]], df_trends_top_200["trend_top_200"], train_size=0.8)

In [27]:
# add truth value for test classifier results
add_to_classifier_df(df_classifier_approach_3, 'truth', ytest_3)

### We are using a support vector machine with a linear kernel to predict the trend of the top 200 songs on the Viral 50 chart.

In [28]:
svm_linear = SVC(kernel='linear')

svm_fit_linear = svm_linear.fit(xtrain_3, ytrain_3)
ypred_svm_linear = svm_fit_linear.predict(xtest_3)

add_to_classifier_df(df_classifier_approach_3, 'svm_linear', ypred_svm_linear)

In [29]:
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ypred_svm_linear, ytest_3)}')
print(f'r^2 score of support vector machines with linear kernel is: {svm_fit_linear.score(xtrain_3, ytrain_3)}')

accuracy of support vector machines with linear kernel is: 0.5451612903225806
r^2 score of support vector machines with linear kernel is: 0.5355987055016181


### We are using a Support Vector Machine (SVM) algorithm with a polynomial kernal to predict the trend of the top 200 songs on the Viral 50 chart.

In [30]:
svm_poly= SVC(kernel='poly', degree=3)

svm_fit_poly = svm_poly.fit(xtrain_3, ytrain_3)
ypred_svm_poly = svm_fit_poly.predict(xtest_3)

add_to_classifier_df(df_classifier_approach_3, 'svm_poly', ypred_svm_poly)

In [31]:
print(f'accuracy of support vector machines with poly kernel is: {accuracy_score(ypred_svm_poly, ytest_3)}')
print(f'r^2 score of support vector machines with poly kernel is: {svm_fit_poly.score(xtrain_3, ytrain_3)}')

accuracy of support vector machines with poly kernel is: 0.5451612903225806
r^2 score of support vector machines with poly kernel is: 0.5355987055016181


### We are using logistic regression to predict the trend of the top 200 songs on the Viral 50 chart.

In [32]:
log_reg = LogisticRegression()

log_reg_fit = log_reg.fit(xtrain_3, ytrain_3)
ypred_log_reg = log_reg_fit.predict(xtest_3)

add_to_classifier_df(df_classifier_approach_3, 'logistic_regression', ypred_log_reg)

In [33]:
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ypred_log_reg, ytest_3)}')
print(f'r^2 score of support vector machines with linear kernel is: {log_reg_fit.score(xtrain_3, ytrain_3)}')

accuracy of support vector machines with linear kernel is: 0.5451612903225806
r^2 score of support vector machines with linear kernel is: 0.5355987055016181


### We are using K nearest neighbors algorithm to predict the trend of the top 200 songs on the Viral 50 chart.

In [34]:
knn = KNeighborsClassifier()

knn_fit = knn.fit(xtrain_3, ytrain_3)
ypred_knn = knn_fit.predict(xtest_3)

add_to_classifier_df(df_classifier_approach_3, 'k_nearest_neighbour', ypred_knn)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [35]:
print(f'accuracy of knn is: {accuracy_score(ypred_knn, ytest_3)}')
print(f'r^2 score of knn is: {knn_fit.score(xtrain_3, ytrain_3)}')

accuracy of knn is: 0.46774193548387094
r^2 score of knn is: 0.47815533980582525


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [36]:
export_classifier_df_to_csv(df_classifier_approach_3, "classification_results_approach_3.csv")