In [1]:
import pandas as pd
import numpy as np
import pickle
import plotly.graph_objects as go

from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

#importing the data
x = pd.read_csv('x.csv').drop('Unnamed: 0', axis=1)
y = pd.read_csv('y.csv').drop('Unnamed: 0', axis=1)

#storing the details
x_cols = list(x.columns)
titles = list(y.columns)

x_cols

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 'posted_at',
 'post_type_link',
 'post_type_music',
 'post_type_note',
 'post_type_offer',
 'post_type_photo',
 'post_type_status',
 'post_type_video',
 'status_type_added_video',
 'status_type_created_event',
 'status_type_created_note',
 'status_type_mobile_status_update',
 'status_type_published_story',
 'status_type_shared_story',
 'source_bbc',
 'source_cbs',
 'source_cnn',
 'source_lat',
 'source_nbc',
 'source_nyt',
 'source_wsj']

In [2]:
#Specific cols
num_cols = ['posted_at', '0', '1', '2', '3', '4', '5', '6']
cols = [col for col in x_cols if col not in num_cols]

#preprocessing the values
preprocess = ColumnTransformer([
    (
        'numeric',
        MinMaxScaler(),
        num_cols
    )
], remainder='passthrough')

x = preprocess.fit_transform(x)

#Getting the final order of columns right...
final_cols = []
final_cols.extend(num_cols)
final_cols.extend(cols)
# print(final_cols)

x_true = pd.DataFrame(x, columns = final_cols)

#Train,test split
x_train,x_val,y_train,y_val = train_test_split(x, np.array(y.values), test_size=0.1, shuffle=True, random_state=35)

In [3]:
def save_model(pkl_model,name):
    pkl_dir = './models/ml/'
    pkl_filename =pkl_dir+ name+'.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(pkl_model, file)

## Decision Tree

In [4]:
tree = DecisionTreeRegressor()
tree.fit(x_train,y_train)
pred = tree.predict(x_val)
save_model(tree, 'tree')
pred

array([[4.531e+03, 6.790e+02, 1.458e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [6.300e+01, 4.100e+01, 8.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.120e+02, 1.200e+01, 2.400e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.150e+02, 1.170e+02, 4.100e+01, ..., 2.000e+00, 0.000e+00,
        1.800e+01],
       [2.250e+02, 7.000e+00, 3.300e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.700e+02, 7.300e+01, 6.700e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [5]:
decision_tree_scores = []

for i in range(9):
    mse = mean_squared_error(y_true=y_val[:,i],y_pred=pred[:,i])
    mae = mean_absolute_error(y_pred=pred[:,i],y_true=y_val[:,i])
    print(f'Mean Squared Error for parameter - {i}: {mse}')
    print(f'Mean Absolute Error for parameter - {i}: {mae}')
    decision_tree_scores.append([mse,mae])
    print('\n')

decision_tree_scores.append([mean_squared_error(y_true=y_val,y_pred=pred),mean_absolute_error(y_true=y_val,y_pred=pred)])


Mean Squared Error for parameter - 0: 155427075.79255384
Mean Absolute Error for parameter - 0: 3213.723166991915


Mean Squared Error for parameter - 1: 2034161.994916539
Mean Absolute Error for parameter - 1: 407.59093601895734


Mean Squared Error for parameter - 2: 110771795.85685723
Mean Absolute Error for parameter - 2: 1225.407617786451


Mean Squared Error for parameter - 3: 955020.6008546487
Mean Absolute Error for parameter - 3: 103.14678003902984


Mean Squared Error for parameter - 4: 177643.06560147754
Mean Absolute Error for parameter - 4: 50.589489824365764


Mean Squared Error for parameter - 5: 261887.60554171313
Mean Absolute Error for parameter - 5: 51.392737663785894


Mean Squared Error for parameter - 6: 1194716.8559381098
Mean Absolute Error for parameter - 6: 109.17047672149428


Mean Squared Error for parameter - 7: 19.748222748815166
Mean Absolute Error for parameter - 7: 0.10360328965709506


Mean Squared Error for parameter - 8: 245445.88145560358
Mean Absol

In [6]:
#Scores for estimation
df = pd.DataFrame(decision_tree_scores, columns=['D-Tree: MSE', 'D-Tree: MAE'])

In [7]:
df

Unnamed: 0,D-Tree: MSE,D-Tree: MAE
0,155427100.0,3213.723167
1,2034162.0,407.590936
2,110771800.0,1225.407618
3,955020.6,103.14678
4,177643.1,50.58949
5,261887.6,51.392738
6,1194717.0,109.170477
7,19.74822,0.103603
8,245445.9,62.593968
9,30118640.0,580.413197


## Linear Regression

In [8]:
linear = LinearRegression()
linear.fit(x_train,y_train)
pred = linear.predict(x_val)
pred

array([[ 1.89821431e+03,  3.77100781e+02,  5.18675763e+02, ...,
         5.98889116e+01,  1.29839587e-02,  4.81661634e+01],
       [ 5.63820589e+02,  1.32401367e+02,  2.07415830e+02, ...,
        -1.88244357e+01, -4.03289076e-02, -5.29825708e+00],
       [ 2.24379550e+02,  4.67564166e+01,  3.95421647e+01, ...,
        -9.37006205e+00, -2.90435723e-02, -6.07405874e-01],
       ...,
       [ 1.15194958e+03,  1.52696231e+02,  2.80257042e+02, ...,
         6.69989147e+01,  5.36724263e-02,  4.93259114e+01],
       [ 1.60134605e+02,  2.42599005e+01, -4.21637426e+01, ...,
        -1.11635254e+01, -3.51447711e-02, -2.00306544e+00],
       [ 5.56812095e+02,  7.41114346e+01,  1.36785062e+02, ...,
         2.71787130e+00,  7.68578926e-02,  3.79325132e+00]])

In [9]:

save_model(linear, 'linear_reg')

linear_regression_scores = []

for i in range(9):
    mse = mean_squared_error(y_true=y_val[:,i],y_pred=pred[:,i])
    mae = mean_absolute_error(y_pred=pred[:,i],y_true=y_val[:,i])
    print(f'Mean Squared Error for parameter - {i}: {mse}')
    print(f'Mean Absolute Error for parameter - {i}: {mae}')
    linear_regression_scores.append([mse,mae])
    print('\n')

linear_regression_scores.append([mean_squared_error(y_true=y_val,y_pred=pred),mean_absolute_error(y_true=y_val,y_pred=pred)])
df = pd.concat([df,pd.DataFrame(linear_regression_scores, columns=['Linear: MSE', 'Linear: MAE'])],
          axis=1)

Mean Squared Error for parameter - 0: 53697273.6670696
Mean Absolute Error for parameter - 0: 2527.2910190351754


Mean Squared Error for parameter - 1: 526587.4806407465
Mean Absolute Error for parameter - 1: 313.42099817978465


Mean Squared Error for parameter - 2: 22787796.5495937
Mean Absolute Error for parameter - 2: 964.1215841482918


Mean Squared Error for parameter - 3: 377344.6044159607
Mean Absolute Error for parameter - 3: 116.84565302541033


Mean Squared Error for parameter - 4: 50373.31509349026
Mean Absolute Error for parameter - 4: 53.140339968859315


Mean Squared Error for parameter - 5: 142957.96963121952
Mean Absolute Error for parameter - 5: 57.18502746623867


Mean Squared Error for parameter - 6: 834264.8763750159
Mean Absolute Error for parameter - 6: 120.44196797472078


Mean Squared Error for parameter - 7: 16.48753696761974
Mean Absolute Error for parameter - 7: 0.1444522770757158


Mean Squared Error for parameter - 8: 165361.8997871118
Mean Absolute Error

## Perceptron Model

In [10]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout

In [14]:
model = Sequential(
    [
        Dense(128, input_dim=28, activation='relu', kernel_initializer='normal'),
        Dropout(0.2),
        Dense(64, activation='relu', kernel_initializer='normal'),
        Dropout(0.3),
        Dense(32, activation='relu', kernel_initializer='normal'),
        Dense(9, kernel_initializer='normal')
    ]
)

model.compile(loss='mse', optimizer='adam', metrics=[ tf.keras.metrics.MeanSquaredError()])

In [12]:
x_train.shape

(258255, 28)

In [13]:
model.fit(x_train,y_train, epochs=100, batch_size=64)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

KeyboardInterrupt: 

In [None]:
pred = model.predict(x_val)
save_model(model.get_weights(), 'mlp')
mean_squared_error(y_val,pred)



8551660.8224382

In [None]:
perceptron_scores = []

for i in range(9):
    mse = mean_squared_error(y_true=y_val[:,i],y_pred=pred[:,i])
    mae = mean_absolute_error(y_pred=pred[:,i],y_true=y_val[:,i])
    print(f'Mean Squared Error for parameter - {i}: {mse}')
    print(f'Mean Absolute Error for parameter - {i}: {mae}')
    perceptron_scores.append([mse,mae])
    print('\n')

perceptron_scores.append([mean_squared_error(y_true=y_val,y_pred=pred),mean_absolute_error(y_true=y_val,y_pred=pred)])
df = pd.concat([df,pd.DataFrame(perceptron_scores, columns=['Perceptron: MSE', 'Perceptron: MAE'])],
          axis=1)

Mean Squared Error for parameter - 0: 52234964.5916712
Mean Absolute Error for parameter - 0: 2317.5895466353854


Mean Squared Error for parameter - 1: 530076.9305467502
Mean Absolute Error for parameter - 1: 322.34879506289013


Mean Squared Error for parameter - 2: 22648936.779867854
Mean Absolute Error for parameter - 2: 902.1285160480518


Mean Squared Error for parameter - 3: 368524.59328898333
Mean Absolute Error for parameter - 3: 98.04973729545105


Mean Squared Error for parameter - 4: 48801.08206270511
Mean Absolute Error for parameter - 4: 43.85214703556868


Mean Squared Error for parameter - 5: 142112.8598613282
Mean Absolute Error for parameter - 5: 54.58217834495372


Mean Squared Error for parameter - 6: 827438.1763346178
Mean Absolute Error for parameter - 6: 99.63161903033192


Mean Squared Error for parameter - 7: 23.759674158001495
Mean Absolute Error for parameter - 7: 1.7878587650550937


Mean Squared Error for parameter - 8: 164068.6286368086
Mean Absolute Error

## Poisson Regressor

In [None]:
from sklearn.linear_model import PoissonRegressor
model_poisson = MultiOutputRegressor(PoissonRegressor(max_iter=750))
model_poisson.fit(x_train,y_train)
pred = model_poisson.predict(x_val)

save_model(model_poisson, 'poisson')

In [None]:
poisson_scores = []

for i in range(9):
    mse = mean_squared_error(y_true=y_val[:,i],y_pred=pred[:,i])
    mae = mean_absolute_error(y_pred=pred[:,i],y_true=y_val[:,i])
    print(f'Mean Squared Error for parameter - {i}: {mse}')
    print(f'Mean Absolute Error for parameter - {i}: {mae}')
    poisson_scores.append([mse,mae])
    print('\n')

poisson_scores.append([mean_squared_error(y_true=y_val,y_pred=pred),mean_absolute_error(y_true=y_val,y_pred=pred)])
df = pd.concat([df,pd.DataFrame(poisson_scores, columns=['Poisson: MSE', 'Poisson: MAE'])],
          axis=1)

Mean Squared Error for parameter - 0: 53539484.079763964
Mean Absolute Error for parameter - 0: 2476.3155666088255


Mean Squared Error for parameter - 1: 525430.7909710949
Mean Absolute Error for parameter - 1: 309.69550002198275


Mean Squared Error for parameter - 2: 22763826.167400718
Mean Absolute Error for parameter - 2: 951.3843142407246


Mean Squared Error for parameter - 3: 369868.061903982
Mean Absolute Error for parameter - 3: 92.60498207098482


Mean Squared Error for parameter - 4: 49444.73869530452
Mean Absolute Error for parameter - 4: 43.75426268666298


Mean Squared Error for parameter - 5: 142139.37166297334
Mean Absolute Error for parameter - 5: 47.15254501310381


Mean Squared Error for parameter - 6: 830292.865747621
Mean Absolute Error for parameter - 6: 100.63708311233765


Mean Squared Error for parameter - 7: 16.49299391264631
Mean Absolute Error for parameter - 7: 0.12315077717015262


Mean Squared Error for parameter - 8: 164225.3613842202
Mean Absolute Erro

# Observing the results

In [None]:
mae_cols = [col for col in df.columns if 'MAE' in col]
mse_cols = [col for col in df.columns if 'MSE' in col]

In [None]:
df[mse_cols]

Unnamed: 0,D-Tree: MSE,Linear: MSE,Perceptron: MSE,Poisson: MSE
0,153233000.0,53697270.0,52234960.0,53539480.0
1,1963522.0,526587.5,530076.9,525430.8
2,107852600.0,22787800.0,22648940.0,22763830.0
3,950505.6,377344.6,368524.6,369868.1
4,160273.7,50373.32,48801.08,49444.74
5,260043.4,142958.0,142112.9,142139.4
6,1247015.0,834264.9,827438.2,830292.9
7,19.82959,16.48754,23.75967,16.49299
8,224266.7,165361.9,164068.6,164225.4
9,29543470.0,8731331.0,8551661.0,8709414.0


In [None]:
titles

['likes_count',
 'comments_count',
 'shares_count',
 'love_count',
 'wow_count',
 'haha_count',
 'sad_count',
 'thankful_count',
 'angry_count']

In [None]:
titles.extend(["Overall"])

In [None]:
fig= go.Figure()

for i in range(10):
    fig.add_trace(go.Scatter(x=df[mae_cols].columns, y =df[mae_cols].values[i],
                             mode='lines+markers', name=titles[i]))

fig.show()

In [None]:
import plotly.graph_objects as go

fig= go.Figure()

for i in range(10):
    fig.add_trace(go.Scatter(x=df[mse_cols].columns, y =df[mse_cols].values[i],
                             mode='lines+markers', name=titles[i]))

fig.show()

In [None]:
mse_cols.remove('D-Tree: MSE')

In [None]:
mse_cols

['Linear: MSE', 'Perceptron: MSE', 'Poisson: MSE']

In [None]:
import plotly.graph_objects as go

fig= go.Figure()

for i in range(10):
    fig.add_trace(go.Scatter(x=df[mse_cols].columns, y =np.sqrt(df[mse_cols].values[i]),
                             mode='lines+markers', name=titles[i]))

fig.show()

Based on the scores that are obtained, the Perceptron model is currently the best performing model. Hence, we will be finding it's variable importance.

### Variable importance

In [15]:
import pickle

with open('./models/ml/mlp.pkl', 'rb') as file:
    model.set_weights(pickle.load(file))

In [16]:
from sklearn.inspection import permutation_importance

r = permutation_importance(model, x_val, y_val,n_repeats=50,random_state=35, scoring='neg_mean_absolute_percentage_error')



In [17]:
r

{'importances_mean': array([ 3.69748456e+16,  6.41408120e+14,  1.15203891e+15,  1.33937814e+15,
         9.19752447e+14,  6.98202741e+14,  1.07430287e+15,  1.24193055e+15,
         8.33010174e+15,  1.58397326e+12,  0.00000000e+00,  0.00000000e+00,
         4.23844275e+15,  5.34881241e+13, -4.73600024e+15, -5.18450087e+15,
         3.44820904e+13, -7.88363032e+11, -2.54309804e+14, -6.80611742e+13,
        -6.33280522e+15,  1.99762697e+14,  2.42495435e+15,  1.56581891e+14,
         8.66484639e+14,  2.34784444e+15,  1.30957934e+15,  4.49934490e+15]),
 'importances_std': array([4.95316847e+14, 1.31960213e+14, 1.35394695e+14, 1.16088879e+14,
        1.37742604e+14, 1.02625596e+14, 1.41830499e+14, 1.25853860e+14,
        1.92619538e+14, 2.01611963e+12, 0.00000000e+00, 0.00000000e+00,
        1.92348324e+14, 7.38620584e+13, 1.39894312e+14, 2.01453675e+14,
        2.44638596e+13, 2.39838873e+12, 1.01793523e+14, 4.92665833e+13,
        2.70396298e+14, 1.48851806e+14, 2.22452507e+14, 1.58663972e

In [18]:
import plotly.graph_objects as go

fig= go.Figure()
fig.add_trace(go.Bar(x=final_cols,y=r['importances_mean'], marker_color='rgb(62, 109, 156)',error_y=dict(type='data',array = r['importances_std'])))

fig.update_layout(template = 'plotly_dark', barmode='group', title_text='Feature Importance for Videos: Poisson Regression', xaxis_tickangle=-90)
fig.show()