# Modeling 2

This notebook investigates whether fitting a separate model for each artist improves predictive performance.

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import time

In [2]:
# Read the four previously saved split files; the first CSV column is used as
# the row index of each frame.
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'Data/X_train_v2.csv',
        'Data/X_test_v2.csv',
        'Data/y_train_v2.csv',
        'Data/y_test_v2.csv',
    )
]

In [3]:
# Stack the saved train and test rows back into single feature / target frames
# so they can be re-partitioned per artist below.
# NOTE(review): later cells look targets up with y.loc[<X subset>.index], which
# presumes X and y share the same, unique row index -- confirm against the CSVs.
X = pd.concat((X_train, X_test), axis=0)
y = pd.concat((y_train, y_test), axis=0)

## Build Linear Models for Each Artist

In [4]:
def LinearRegressor(X_train,X_test,y_train,y_test):
    """Fit a LinearRegression model and report its train/test mean squared error.

    Parameters
    ----------
    X_train, X_test
        Feature matrices handed to ``fit`` / ``predict``.
    y_train, y_test
        Containers whose ``'favorites'`` entry holds the regression target.

    Side effects
    ------------
    Prints the train MSE, test MSE, fit time and prediction time, each rounded
    to three decimals.

    Returns
    -------
    tuple
        ``(train_mse, test_mse)``, unrounded.
    """
    model = LinearRegression()

    # perf_counter() is the monotonic, high-resolution clock intended for
    # measuring short durations; time.time() is a wall clock that can be too
    # coarse (or jump) for millisecond-scale fits like these.
    fit_start = time.perf_counter()
    model.fit(X_train, y_train['favorites'])
    fit_seconds = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    test_predictions = model.predict(X_test)
    predict_seconds = time.perf_counter() - predict_start

    mse_lr_test = mean_squared_error(y_test['favorites'], test_predictions)
    # The training-set prediction is deliberately outside the timed section.
    mse_lr_train = mean_squared_error(y_train['favorites'], model.predict(X_train))

    print(f'The train mse for the linear regression model is {round(mse_lr_train,3)}')
    print(f'The test mse for the linear regression model is {round(mse_lr_test,3)}')
    print(f'The model fitting time is {round(fit_seconds,3)}')
    print(f'The model prediction time is {round(predict_seconds,3)}')

    return mse_lr_train, mse_lr_test

In [5]:
# Rows flagged screen_name_awonderland == 1, with their targets looked up by index.
alison_x = X[X.screen_name_awonderland == 1]
alison_y = y.loc[alison_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
alison_X_train, alison_X_test, alison_y_train, alison_y_test = train_test_split(
    alison_x, alison_y, test_size=0.25, random_state=42
)
alison_train_mse, alison_test_mse = LinearRegressor(
    alison_X_train, alison_X_test, alison_y_train, alison_y_test
)

The train mse for the linear regression model is 6768748.22
The test mse for the linear regression model is 8230582.227
The model fitting time is 0.016
The model prediction time is 0.004


In [6]:
# Rows flagged screen_name_LouisTheChild == 1, with their targets looked up by index.
ltc_x = X[X.screen_name_LouisTheChild == 1]
ltc_y = y.loc[ltc_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
ltc_X_train, ltc_X_test, ltc_y_train, ltc_y_test = train_test_split(
    ltc_x, ltc_y, test_size=0.25, random_state=42
)
ltc_train_mse, ltc_test_mse = LinearRegressor(
    ltc_X_train, ltc_X_test, ltc_y_train, ltc_y_test
)

The train mse for the linear regression model is 873052.738
The test mse for the linear regression model is 679448.061
The model fitting time is 0.009
The model prediction time is 0.006


In [7]:
# Rows flagged screen_name_NGHTMRE == 1, with their targets looked up by index.
nght_x = X[X.screen_name_NGHTMRE == 1]
nght_y = y.loc[nght_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
nght_X_train, nght_X_test, nght_y_train, nght_y_test = train_test_split(
    nght_x, nght_y, test_size=0.25, random_state=42
)
nght_train_mse, nght_test_mse = LinearRegressor(
    nght_X_train, nght_X_test, nght_y_train, nght_y_test
)

The train mse for the linear regression model is 643872.158
The test mse for the linear regression model is 458418.765
The model fitting time is 0.008
The model prediction time is 0.004


In [8]:
# Rows flagged screen_name_WhatSoNot == 1, with their targets looked up by index.
what_x = X[X.screen_name_WhatSoNot == 1]
what_y = y.loc[what_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
what_X_train, what_X_test, what_y_train, what_y_test = train_test_split(
    what_x, what_y, test_size=0.25, random_state=42
)
what_train_mse, what_test_mse = LinearRegressor(
    what_X_train, what_X_test, what_y_train, what_y_test
)

The train mse for the linear regression model is 90979.37
The test mse for the linear regression model is 43013.378
The model fitting time is 0.01
The model prediction time is 0.003


In [9]:
# Rows flagged screen_name_pekingduk == 1, with their targets looked up by index.
duk_x = X[X.screen_name_pekingduk == 1]
duk_y = y.loc[duk_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
duk_X_train, duk_X_test, duk_y_train, duk_y_test = train_test_split(
    duk_x, duk_y, test_size=0.25, random_state=42
)
duk_train_mse, duk_test_mse = LinearRegressor(
    duk_X_train, duk_X_test, duk_y_train, duk_y_test
)

The train mse for the linear regression model is 6285.289
The test mse for the linear regression model is 3717.03
The model fitting time is 0.008
The model prediction time is 0.005


In [10]:
# Rows flagged screen_name_porterrobinson == 1, with their targets looked up by index.
porter_x = X[X.screen_name_porterrobinson == 1]
porter_y = y.loc[porter_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
porter_X_train, porter_X_test, porter_y_train, porter_y_test = train_test_split(
    porter_x, porter_y, test_size=0.25, random_state=42
)
porter_train_mse, porter_test_mse = LinearRegressor(
    porter_X_train, porter_X_test, porter_y_train, porter_y_test
)

The train mse for the linear regression model is 17530967.758
The test mse for the linear regression model is 17065591.984
The model fitting time is 0.008
The model prediction time is 0.004


In [11]:
# Rows flagged screen_name_sanholobeats == 1, with their targets looked up by index.
san_x = X[X.screen_name_sanholobeats == 1]
san_y = y.loc[san_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
san_X_train, san_X_test, san_y_train, san_y_test = train_test_split(
    san_x, san_y, test_size=0.25, random_state=42
)
san_train_mse, san_test_mse = LinearRegressor(
    san_X_train, san_X_test, san_y_train, san_y_test
)

The train mse for the linear regression model is 874473.351
The test mse for the linear regression model is 1487673.534
The model fitting time is 0.006
The model prediction time is 0.004


In [12]:
# Rows flagged screen_name_theknocks == 1, with their targets looked up by index.
knocks_x = X[X.screen_name_theknocks == 1]
knocks_y = y.loc[knocks_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
knocks_X_train, knocks_X_test, knocks_y_train, knocks_y_test = train_test_split(
    knocks_x, knocks_y, test_size=0.25, random_state=42
)
knocks_train_mse, knocks_test_mse = LinearRegressor(
    knocks_X_train, knocks_X_test, knocks_y_train, knocks_y_test
)

The train mse for the linear regression model is 7858.945
The test mse for the linear regression model is 3970.866
The model fitting time is 0.008
The model prediction time is 0.005


In [13]:
# Rows flagged screen_name_wearegalantis == 1, with their targets looked up by index.
galantis_x = X[X.screen_name_wearegalantis == 1]
galantis_y = y.loc[galantis_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
galantis_X_train, galantis_X_test, galantis_y_train, galantis_y_test = train_test_split(
    galantis_x, galantis_y, test_size=0.25, random_state=42
)
galantis_train_mse, galantis_test_mse = LinearRegressor(
    galantis_X_train, galantis_X_test, galantis_y_train, galantis_y_test
)

The train mse for the linear regression model is 2295202.064
The test mse for the linear regression model is 3260179.532
The model fitting time is 0.011
The model prediction time is 0.006


In [14]:
# The remaining artist has no indicator column of its own here: its rows are
# the ones where every other artist's screen_name_* flag equals 0.
other_artist_flags = [
    'screen_name_awonderland',
    'screen_name_wearegalantis',
    'screen_name_theknocks',
    'screen_name_sanholobeats',
    'screen_name_porterrobinson',
    'screen_name_pekingduk',
    'screen_name_WhatSoNot',
    'screen_name_NGHTMRE',
    'screen_name_LouisTheChild',
]
# Row-wise "all flags are 0" -- equivalent to AND-ing the nine == 0 comparisons.
jai_x = X[(X[other_artist_flags] == 0).all(axis=1)]
jai_y = y.loc[jai_x.index]
# Use artist-specific split names: rebinding the notebook-level
# X_train/X_test/y_train/y_test would make the earlier pd.concat cell rebuild
# X and y from this artist's subset if it were ever re-run.
jai_X_train, jai_X_test, jai_y_train, jai_y_test = train_test_split(
    jai_x, jai_y, test_size=0.25, random_state=42
)
jai_train_mse, jai_test_mse = LinearRegressor(
    jai_X_train, jai_X_test, jai_y_train, jai_y_test
)

The train mse for the linear regression model is 334925.289
The test mse for the linear regression model is 2277983.631
The model fitting time is 0.006
The model prediction time is 0.003


In [15]:
# One record per artist: (label, full per-artist feature frame, train MSE, test MSE).
# Keeping each artist's values together avoids four parallel lists that must be
# kept in the same order by hand.
per_artist = [
    ('awonderland', alison_x, alison_train_mse, alison_test_mse),
    ('wearegalantis', galantis_x, galantis_train_mse, galantis_test_mse),
    ('sanholobeats', san_x, san_train_mse, san_test_mse),
    ('theknocks', knocks_x, knocks_train_mse, knocks_test_mse),
    ('pekingduk', duk_x, duk_train_mse, duk_test_mse),
    ('porterrobinson', porter_x, porter_train_mse, porter_test_mse),
    ('WhatSoNot', what_x, what_train_mse, what_test_mse),
    ('LouisTheChild', ltc_x, ltc_train_mse, ltc_test_mse),
    ('NGHTMRE', nght_x, nght_train_mse, nght_test_mse),
    ('JaiWolfx', jai_x, jai_train_mse, jai_test_mse),
]

# len(<artist>_x) counts every row for the artist *before* the 75/25 split, so
# the column is labelled as a total sample count rather than a training size.
results = pd.DataFrame(
    [
        (artist, len(artist_frame), artist_train_mse, artist_test_mse)
        for artist, artist_frame, artist_train_mse, artist_test_mse in per_artist
    ],
    columns=['Artist', 'Total Samples', 'Train MSE', 'Test MSE'],
)

## Results

In [16]:
# Index by artist for display. Guarded and non-inplace so that re-running this
# cell does not raise KeyError once 'Artist' has already become the index.
if 'Artist' in results.columns:
    results = results.set_index('Artist')
# astype(int) truncates the fractional part of the MSE values (it does not round).
results = results.astype(int)
results

Unnamed: 0_level_0,Training Size,Train MSE,Test MSE
Artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
awonderland,694,6768748,8230582
wearegalantis,1594,2295202,3260179
sanholobeats,662,874473,1487673
theknocks,1484,7858,3970
pekingduk,817,6285,3717
porterrobinson,1219,17530967,17065591
WhatSoNot,1585,90979,43013
LouisTheChild,1054,873052,679448
NGHTMRE,1099,643872,458418
JaiWolfx,1681,334925,2277983


Some artists have much more predictable favorites, while others are quite unpredictable. Further investigation is needed to determine the cause. One hypothesis is that the lower-error artists simply have fewer viral tweets, which are much harder for a model to predict. Because MSE is not normalized, part of the gap may also reflect differences in the typical scale of each artist's favorite counts.