In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

In [3]:
# Read the weekly rank table and discard the "Unnamed: 0" column
# (presumably an index that was written out when the CSV was saved).
avg_ranks = pd.read_csv("../data/weekly_ranks.csv").drop(columns="Unnamed: 0")
# Display ten randomly drawn rows as a quick look at the data.
avg_ranks.sample(n=10)

Unnamed: 0,title,date_viral,viral_50_rank,date_top200,top_200_rank
667,Head & Heart (feat. MNEK),2020-10-04,44.0,2020-10-11,81.0
1533,By Your Side,2021-12-19,37.0,2021-12-26,64.0
1383,Meet Me At Our Spot,2021-10-10,12.0,2021-10-17,10.0
1461,Cold Heart - PNAU Remix,2021-11-14,35.0,2021-11-21,24.0
1041,Beautiful Mistakes (feat. Megan Thee Stallion),2021-04-25,49.0,2021-05-02,32.0
650,Head & Heart (feat. MNEK),2020-09-27,34.0,2020-10-04,89.0
367,Deep End Freestyle,2020-05-17,4.0,2020-05-24,50.0
1539,Notion,2021-12-19,36.0,2021-12-26,133.0
877,ROLLIN N CONTROLLIN FREESTYLE,2021-02-07,2.0,2021-02-14,104.0
300,WHATS POPPIN,2020-04-12,31.0,2020-04-19,7.0


In [4]:
# Hold out 20% of the rows for evaluation. The split is seeded so the scores
# in the following cells are reproducible under Restart & Run All.
# scikit-learn estimators expect a 2-D feature matrix, so the single predictor
# column is reshaped to (n_samples, 1) before splitting; the target stays 1-D.
xtrain, xtest, ytrain, ytest = train_test_split(
    avg_ranks["viral_50_rank"].values.reshape(-1, 1),
    avg_ranks["top_200_rank"].values,
    train_size=0.8,
    random_state=42,
)

In [16]:
# Baseline experiment: support vector classifier with a linear kernel.
svm_linear = SVC(kernel='linear')

# fit() returns the estimator itself, so both names refer to the same model.
svm_linear.fit(xtrain, ytrain)
svm_fit_linear = svm_linear
ypred = svm_fit_linear.predict(xtest)

In [18]:
# accuracy_score takes (y_true, y_pred). SVC.score() returns the mean accuracy
# on the data it is given, not R², so the second figure is labelled as the
# training accuracy.
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ytest, ypred)}')
print(f'training accuracy of support vector machines with linear kernel is: {svm_fit_linear.score(xtrain, ytrain)}')

accuracy of support vector machines with linear kernel is: 0.02258064516129032
accuracy of support vector machines with linear kernel is: 0.025889967637540454


In [19]:
# Same experiment as the linear SVM, this time with a polynomial kernel.
svm_poly = SVC(kernel='poly')

# fit() returns the estimator itself, so both names refer to the same model.
svm_poly.fit(xtrain, ytrain)
svm_fit_poly = svm_poly
ypred = svm_fit_poly.predict(xtest)

In [20]:
# accuracy_score takes (y_true, y_pred). SVC.score() returns the mean accuracy
# on the data it is given, not R², so the second figure is labelled as the
# training accuracy.
print(f'accuracy of support vector machines with polynomial kernel is: {accuracy_score(ytest, ypred)}')
print(f'training accuracy of support vector machines with polynomial kernel is: {svm_fit_poly.score(xtrain, ytrain)}')

accuracy of support vector machines with polynomial kernel is: 0.01935483870967742
r^2 score of support vector machines with polynomial kernel is: 0.02912621359223301


In [6]:
# Polynomial regression: expand the predictor into polynomial terms up to
# `poly_degree`, then fit an ordinary least-squares model on the expansion.
poly_degree = 3
polyn_reg = make_pipeline(
    PolynomialFeatures(degree=poly_degree),
    LinearRegression(),
)
# Pipeline.fit() fits in place and returns the same pipeline object.
polyn_reg.fit(xtrain, ytrain)
ypred = polyn_reg.predict(xtest)

In [7]:
# R² of the polynomial regression, measured on the training data
polyn_reg.score(X=xtrain, y=ytrain)

0.004332477904229748

### Try with trend

In [39]:
# Read the weekly rank table that also carries the `trend` column, and discard
# the "Unnamed: 0" column (presumably an index written out with the CSV).
avg_ranks_with_trend = pd.read_csv("../data/weekly_ranks_with_trends.csv").drop(columns="Unnamed: 0")
# Display ten randomly drawn rows as a quick look at the data.
avg_ranks_with_trend.sample(n=10)

Unnamed: 0,title,date_viral,viral_50_rank,date_top200,top_200_rank,trend
360,Supalonely,2020-05-10,35.0,2020-05-17,26.0,MOVE_DOWN
419,"...And To Those I Love, Thanks For Sticking Ar...",2020-06-14,33.0,2020-06-21,80.0,MOVE_UP
728,Sofia,2020-11-01,37.0,2020-11-08,33.0,MOVE_DOWN
1455,Toxic,2021-11-07,14.0,2021-11-14,88.0,MOVE_UP
1046,Fiel,2021-04-25,14.0,2021-05-02,46.0,SAME_POSITION
1471,Notion,2021-11-14,8.0,2021-11-21,74.0,SAME_POSITION
72,Say So,2020-01-19,9.0,2020-01-26,21.0,MOVE_DOWN
1364,Jugaste y Sufrí,2021-10-03,10.0,2021-10-10,23.0,SAME_POSITION
134,CITY OF ANGELS,2020-02-16,45.0,2020-02-23,72.0,NEW_ENTRY
755,Lonely (with benny blanco),2020-11-22,44.0,2020-11-29,45.0,MOVE_DOWN


In [None]:
# Store both rank columns as int64 instead of float.
avg_ranks_with_trend = avg_ranks_with_trend.astype(
    {'viral_50_rank': 'int64', 'top_200_rank': 'int64'}
)

In [41]:
# Features: the viral-50 rank together with the trend label; target: the
# top-200 rank. 80% of the rows go to training, and the split is seeded so the
# scores in the following cells are reproducible under Restart & Run All.
xtrain, xtest, ytrain, ytest = train_test_split(
    avg_ranks_with_trend[["viral_50_rank", "trend"]],
    avg_ranks_with_trend["top_200_rank"],
    train_size=0.8,
    random_state=42,
)

In [42]:
# One-hot encode the features, then classify with a linear-kernel SVM.
# handle_unknown='ignore' keeps predict() from raising when the test split
# contains a value the encoder never saw during fit; such a value is encoded
# as an all-zero row for that feature.
# NOTE(review): the encoder is applied to every input column, so viral_50_rank
# is treated as a category as well — confirm that this is intended.
svm_linear = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('clf', SVC(kernel='linear')),
])

# fit() returns the fitted pipeline itself.
svm_fit_linear = svm_linear.fit(xtrain, ytrain)
ypred = svm_fit_linear.predict(xtest)

In [43]:
# accuracy_score takes (y_true, y_pred). Pipeline.score() delegates to the
# final SVC step, which reports mean accuracy rather than R², so the second
# figure is labelled as the training accuracy.
print(f'accuracy of support vector machines with linear kernel is: {accuracy_score(ytest, ypred)}')
print(f'training accuracy of support vector machines with linear kernel is: {svm_fit_linear.score(xtrain, ytrain)}')

accuracy of support vector machines with linear kernel is: 0.012903225806451613
r^2 score of support vector machines with linear kernel is: 0.1156957928802589


In [44]:
# One-hot encode the features, then classify with an RBF-kernel SVM.
# NOTE: despite the `poly` in the variable names, the kernel configured here
# is 'rbf', not 'poly'.
# handle_unknown='ignore' keeps predict() from raising when the test split
# contains a value the encoder never saw during fit; such a value is encoded
# as an all-zero row for that feature.
svm_linear_poly = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('clf', SVC(kernel='rbf')),
])

# fit() returns the fitted pipeline itself.
svm_fit_poly = svm_linear_poly.fit(xtrain, ytrain)
ypred = svm_fit_poly.predict(xtest)

In [45]:
# The model scored here (svm_fit_poly) is built with kernel='rbf', so the
# messages name the RBF kernel rather than a polynomial one.
# accuracy_score takes (y_true, y_pred). Pipeline.score() delegates to the
# final SVC step, which reports mean accuracy rather than R², so the second
# figure is labelled as the training accuracy.
print(f'accuracy of support vector machines with RBF kernel is: {accuracy_score(ytest, ypred)}')
print(f'training accuracy of support vector machines with RBF kernel is: {svm_fit_poly.score(xtrain, ytrain)}')

accuracy of support vector machines with polynomial kernel is: 0.016129032258064516
r^2 score of support vector machines with polynomial kernel is: 0.11893203883495146
