In [132]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

In [170]:
def export_classifier_df_to_csv(df: pd.DataFrame, filename: str):
    df.to_csv("../data/" + filename)

In [None]:
# prepare new data for hypothesis testing
def add_to_classifier_df(df, classifier_name, prediction_results):
    df[classifier_name] = np.array(prediction_results)

def export_classifier_to_df(classifier_dataframe: pd.DataFrame, name: str):
    classifier_dataframe.to_csv('../data/' + name)

## APPROACH 1: Try to predict rank directly (from viral 50 -> top 200)

In [None]:
avg_ranks = pd.read_csv("../data/weekly_ranks.csv")
avg_ranks.drop("Unnamed: 0",axis=1,inplace=True)
avg_ranks.sample(10)

Unnamed: 0,title,date_viral,viral_50_rank,date_top200,top_200_rank
10463,love nwantiti (ah ah ah),2021-11-01,33.0,2021-11-02,32.0
1278,The Box,2020-02-29,23.0,2020-03-01,1.0
4127,Head & Heart (feat. MNEK),2020-08-22,14.0,2020-08-23,107.0
5882,Way Out (feat. Big Sean),2021-01-04,32.0,2021-01-05,23.0
7002,Please Don't Go,2021-03-21,33.0,2021-03-22,93.0
403,ROXANNE,2020-01-15,26.0,2020-01-16,3.0
7377,telepatía,2021-04-13,36.0,2021-04-14,14.0
10401,Somebody's Watching Me,2021-10-29,21.0,2021-10-30,131.0
8182,Ramen & OJ,2021-06-05,40.0,2021-06-06,89.0
2031,Skechers,2020-04-05,1.0,2020-04-06,69.0


In [None]:
df_classifier_approach_1 = pd.DataFrame()

In [72]:
xtrain, xtest, ytrain, ytest = train_test_split(avg_ranks["viral_50_rank"].values,avg_ranks["top_200_rank"].values, train_size=0.8)
xtrain = xtrain.reshape(-1,1)
xtest = xtest.reshape(-1,1)

In [None]:
# add truth values to dataframe
add_to_classifier_df(df_classifier_approach_1, 'truth', ytest)

In [73]:
# test with svm
svm_linear = SVC(kernel='linear')

svm_fit_linear = svm_linear.fit(xtrain, ytrain)
ypred = svm_fit_linear.predict(xtest)

In [74]:
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ypred, ytest)}')
print(f'r^2 score of support vector machines with linear kernel is: {svm_fit_linear.score(xtrain, ytrain)}')

accuracy of support vector machines with linear kernel is: 0.028684907325684024
r^2 score of support vector machines with linear kernel is: 0.029687672442335285


In [75]:
svm_poly = SVC(kernel='poly')

svm_fit_poly = svm_poly.fit(xtrain, ytrain)
ypred = svm_fit_poly.predict(xtest)

In [76]:
print(f'accuracy of support vector machines with polynomial kernel is: {accuracy_score(ypred, ytest)}')
print(f'r^2 score of support vector machines with polynomial kernel is: {svm_fit_poly.score(xtrain, ytrain)}')

accuracy of support vector machines with polynomial kernel is: 0.02912621359223301
r^2 score of support vector machines with polynomial kernel is: 0.029908398631497628


In [77]:
poly_degree = 3
polyn_reg = make_pipeline(PolynomialFeatures(poly_degree),LinearRegression()) # create polynomial regression model

polyn_reg = polyn_reg.fit(xtrain, ytrain)
ypred = polyn_reg.predict(xtest)

In [78]:
polyn_reg.score(xtrain,ytrain)# R² score for training data

0.002598725700015181

### APPROACH 2: Try with trend and viral 50 to predict top 200

In [79]:
avg_ranks_with_trend = pd.read_csv("../data/weekly_ranks_with_trends.csv")
avg_ranks_with_trend.drop("Unnamed: 0",axis=1,inplace=True)
avg_ranks_with_trend.sample(10)

Unnamed: 0,title,date_viral,viral_50_rank,date_top200,top_200_rank,trend
1854,Skechers,2020-03-29,1.0,2020-03-30,97.0,SAME_POSITION
1022,Sunday Best,2020-02-14,22.0,2020-02-15,65.0,MOVE_UP
9886,Happier Than Ever - Edit,2021-09-29,46.0,2021-09-30,102.0,MOVE_UP
11326,abcdefu,2021-12-30,18.0,2021-12-31,5.0,MOVE_DOWN
9313,INDUSTRY BABY (feat. Jack Harlow),2021-08-22,23.0,2021-08-23,2.0,SAME_POSITION
10629,Toxic,2021-11-10,19.0,2021-11-11,55.0,MOVE_UP
2769,Sunday Best,2020-05-24,48.0,2020-05-25,17.0,MOVE_DOWN
6071,WITHOUT YOU,2021-01-19,48.0,2021-01-20,7.0,MOVE_UP
7330,Ruff Ryders' Anthem,2021-04-10,4.0,2021-04-11,35.0,MOVE_UP
23,Suicidal,2020-01-01,11.0,2020-01-02,12.0,SAME_POSITION


In [124]:
# prepare new data for hypothesis testing
def add_to_classifier_df(df, classifier_name, prediction_results):
    df[classifier_name] = np.array(prediction_results)

def export_classifier_to_df(classifier_dataframe: pd.DataFrame, name: str):
    classifier_dataframe.to_csv('../data/' + name)

In [None]:
df_classifier_approach_2 = pd.DataFrame()

In [82]:
xtrain, xtest, ytrain, ytest = train_test_split(avg_ranks_with_trend[["viral_50_rank", "trend"]], avg_ranks_with_trend["top_200_rank"], train_size=0.8)

In [83]:
# add truth values to dataframe
add_to_classifier_df(df_classifier_approach_2, 'truth', ytest)

In [84]:
svm_linear = Pipeline([('ohe', OneHotEncoder()), ('clf', SVC(kernel='linear'))])

svm_fit_linear = svm_linear.fit(xtrain, ytrain)
ypred = svm_fit_linear.predict(xtest)

# add to classifier dataframe
add_to_classifier_df(df_classifier_approach_2, 'SVC_linear', ypred)

In [85]:
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ypred, ytest)}')
print(f'r^2 score of support vector machines with linear kernel is: {svm_fit_linear.score(xtrain, ytrain)}')

accuracy of support vector machines with linear kernel is: 0.019858781994704325
r^2 score of support vector machines with linear kernel is: 0.04602141044034875


In [86]:
svm_linear_poly = Pipeline([('ohe', OneHotEncoder()), ('clf', SVC(kernel='rbf'))])

svm_fit_poly = svm_linear_poly.fit(xtrain, ytrain)
ypred = svm_fit_poly.predict(xtest)

# add to classifier dataframe
add_to_classifier_df(df_classifier_approach_2, 'SVC_rbf', ypred)

In [87]:
print(f'accuracy of support vector machines with polynomial kernel is: {accuracy_score(ypred, ytest)}')
print(f'r^2 score of support vector machines with polynomial kernel is: {svm_fit_poly.score(xtrain, ytrain)}')

accuracy of support vector machines with polynomial kernel is: 0.02471315092674316
r^2 score of support vector machines with polynomial kernel is: 0.058050987749696505


## APPROACH 3: Try to predict trend of top 200 charts from viral chart position

In [199]:
def choose_freq_for_classification(freq):
    if freq == 1:
        return pd.read_csv('../data/daily_ranks_with_trends_top_200.csv')
    elif freq == 3:
        return pd.read_csv('../data/3_days_ranks_with_trends_top_200.csv')
    elif freq == 7:
        return pd.read_csv('../data/weekly_ranks_with_trends_top_200.csv')
    else:
        print('Wrong input, taking weekly granularity!')
        return pd.read_csv('../data/weekly_ranks_with_trends_top_200.csv')

In [200]:
df_trends_top_200 = choose_freq_for_classification(7)
df_trends_top_200

Unnamed: 0.1,Unnamed: 0,title,date_viral,viral_50_rank,date_top200,top_200_rank,trend_top_200
0,0,Adore You,2020-01-05,10,2020-01-12,18,MOVE_UP
1,1,"All I Want - From ""High School Musical: The Mu...",2020-01-05,2,2020-01-12,66,MOVE_UP
2,2,Ayy Macarena,2020-01-05,20,2020-01-12,143,MOVE_UP
3,3,Ballin' (with Roddy Ricch),2020-01-05,38,2020-01-12,13,SAME_POSITION
4,4,Blinding Lights,2020-01-05,12,2020-01-12,15,MOVE_UP
...,...,...,...,...,...,...,...
1541,9,SAD GIRLZ LUV MONEY Remix (feat. Kali Uchis an...,2021-12-19,47,2021-12-26,166,NEW_ENTRY
1542,10,Super Gremlin,2021-12-19,23,2021-12-26,6,MOVE_UP
1543,11,Surface Pressure,2021-12-19,6,2021-12-26,62,NEW_ENTRY
1544,12,We Don't Talk About Bruno,2021-12-19,1,2021-12-26,27,NEW_ENTRY


In [187]:
df_classifier_approach_3 = pd.DataFrame()

In [188]:
xtrain_3, xtest_3, ytrain_3, ytest_3 = train_test_split(df_trends_top_200[["viral_50_rank"]], df_trends_top_200["trend_top_200"], train_size=0.8)

In [189]:
# add truth value for test classifier results
add_to_classifier_df(df_classifier_approach_3, 'truth', ytest_3)

In [190]:
svm_linear = SVC(kernel='linear')

svm_fit_linear = svm_linear.fit(xtrain_3, ytrain_3)
ypred_svm_linear = svm_fit_linear.predict(xtest_3)

add_to_classifier_df(df_classifier_approach_3, 'svm_linear', ypred_svm_linear)

In [191]:
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ypred_svm_linear, ytest_3)}')
print(f'r^2 score of support vector machines with linear kernel is: {svm_fit_linear.score(xtrain_3, ytrain_3)}')

accuracy of support vector machines with linear kernel is: 0.5806451612903226
r^2 score of support vector machines with linear kernel is: 0.5266990291262136


In [192]:
svm_poly= SVC(kernel='poly', degree=3)

svm_fit_poly = svm_poly.fit(xtrain_3, ytrain_3)
ypred_svm_poly = svm_fit_poly.predict(xtest_3)

add_to_classifier_df(df_classifier_approach_3, 'svm_poly', ypred_svm_poly)

In [193]:
print(f'accuracy of support vector machines with poly kernel is: {accuracy_score(ypred_svm_poly, ytest_3)}')
print(f'r^2 score of support vector machines with poly kernel is: {svm_fit_poly.score(xtrain_3, ytrain_3)}')

accuracy of support vector machines with poly kernel is: 0.5806451612903226
r^2 score of support vector machines with poly kernel is: 0.5266990291262136


In [194]:
log_reg = LogisticRegression()

log_reg_fit = log_reg.fit(xtrain_3, ytrain_3)
ypred_log_reg = log_reg_fit.predict(xtest_3)

add_to_classifier_df(df_classifier_approach_3, 'logistic_regression', ypred_log_reg)

In [195]:
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ypred_log_reg, ytest_3)}')
print(f'r^2 score of support vector machines with linear kernel is: {log_reg_fit.score(xtrain_3, ytrain_3)}')

accuracy of support vector machines with linear kernel is: 0.5806451612903226
r^2 score of support vector machines with linear kernel is: 0.5266990291262136


In [196]:
knn = KNeighborsClassifier()

knn_fit = knn.fit(xtrain_3, ytrain_3)
ypred_knn = knn_fit.predict(xtest_3)

add_to_classifier_df(df_classifier_approach_3, 'k_nearest_neighbour', ypred_knn)

In [197]:
print(f'accuracy of knn is: {accuracy_score(ypred_knn, ytest_3)}')
print(f'r^2 score of knn is: {knn_fit.score(xtrain_3, ytrain_3)}')

accuracy of knn is: 0.45806451612903226
r^2 score of knn is: 0.45145631067961167


In [198]:
export_classifier_df_to_csv(df_classifier_approach_3, "classification_results_approach_3.csv")