# Facebook Metrics

## Package Import and Configuration

In [0]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

## Data Import and Preprocessing

In [0]:
data = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/dsb/master/Data%20Sets/Demos%20and%20Exercises/facebook_metrics/facebook_metrics.csv', sep=';')

### Date Preview

In [3]:
# Preview the First 5 Rows.
data.head()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,139441,Photo,2,12,4,3,0.0,2752,5091,178,109,159,3078,1640,119,4,79.0,17.0,100
1,139441,Status,2,12,3,10,0.0,10460,19057,1457,1361,1674,11710,6112,1108,5,130.0,29.0,164
2,139441,Photo,3,12,3,3,0.0,2413,4373,177,113,154,2812,1503,132,0,66.0,14.0,80
3,139441,Photo,2,12,2,10,1.0,50128,87991,2211,790,1119,61027,32048,1386,58,1572.0,147.0,1777
4,139441,Photo,2,12,2,3,0.0,7244,13594,671,410,580,6228,3200,396,19,325.0,49.0,393


In [4]:
# Display summary statistics.
data.describe()

Unnamed: 0,Page total likes,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
count,500.0,500.0,500.0,500.0,500.0,499.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,499.0,496.0,500.0
mean,123194.176,1.88,7.038,4.15,7.84,0.278557,13903.36,29585.95,920.344,798.772,1415.13,16766.38,6585.488,609.986,7.482,177.945892,27.266129,212.12
std,16272.813214,0.852675,3.307936,2.030701,4.368589,0.448739,22740.78789,76803.25,985.016636,882.505013,2000.594118,59791.02,7682.009405,612.725618,21.18091,323.398742,42.613292,380.233118
min,81370.0,1.0,1.0,1.0,1.0,0.0,238.0,570.0,9.0,9.0,9.0,567.0,236.0,9.0,0.0,0.0,0.0,0.0
25%,112676.0,1.0,4.0,2.0,3.0,0.0,3315.0,5694.75,393.75,332.5,509.25,3969.75,2181.5,291.0,1.0,56.5,10.0,71.0
50%,129600.0,2.0,7.0,4.0,9.0,0.0,5281.0,9051.0,625.5,551.5,851.0,6255.5,3417.0,412.0,3.0,101.0,19.0,123.5
75%,136393.0,3.0,10.0,6.0,11.0,1.0,13168.0,22085.5,1062.0,955.5,1463.0,14860.5,7989.0,656.25,7.0,187.5,32.25,228.5
max,139441.0,3.0,12.0,7.0,23.0,1.0,180480.0,1110282.0,11452.0,11328.0,19779.0,1107833.0,51456.0,4376.0,372.0,5172.0,790.0,6334.0


### Date Preprocessing

In [0]:
# Delete columns that contain future information (Lookahead Bias) and are not part of the target variable.
data.drop(data.columns[7:18], axis=1, inplace=True)

# Initialize label encoder for labeling categorical values.
label_encoder = LabelEncoder()
label_encoder.fit(data['Type'].values)

# Coding Categorical Values into Labeled Values
data['Type'] = label_encoder.transform(data['Type'].values)

# Drop rows that contain at least one empty value.
data.dropna(inplace=True)

# Transform dataset in to a feature matrix X and a target vector y.
X, y = data[data.columns[0:7]].values, data['Total Interactions'].values

In [6]:
# Preview the First 5 Rows after Data Preprocessing.
data.head()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Total Interactions
0,139441,1,2,12,4,3,0.0,100
1,139441,2,2,12,3,10,0.0,164
2,139441,1,3,12,3,3,0.0,80
3,139441,1,2,12,2,10,1.0,1777
4,139441,1,2,12,2,3,0.0,393


In [7]:
# Display summary statistics after Data Preprocessing.
data.describe()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Total Interactions
count,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0
mean,123277.991984,1.074148,1.87976,7.0501,4.150301,7.847695,0.278557,212.306613
std,16180.74698,0.430411,0.853513,3.30016,2.032728,4.369579,0.448739,380.591766
min,81370.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,113028.0,1.0,1.0,4.0,2.0,3.0,0.0,71.0
50%,129600.0,1.0,2.0,7.0,4.0,9.0,0.0,124.0
75%,136393.0,1.0,3.0,10.0,6.0,11.0,1.0,229.0
max,139441.0,3.0,3.0,12.0,7.0,23.0,1.0,6334.0


In [8]:
# Display encoded categories.
label_encoder.classes_

array(['Link', 'Photo', 'Status', 'Video'], dtype=object)

## Modelling

### Training with Linear Regression and Stochastic Gradient Descent

In [9]:
# Set Seed for "Deterministic Randomness".
np.random.seed(1909)

# Initialize the Linear Regression Model.
linear_model = SGDRegressor(max_iter=1000, eta0=0.0000000001, penalty='l2', alpha=0.0001, loss='squared_loss')

# Perform the Learning.
linear_model.fit(X, y)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=1e-10,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

### Sample Prediction

In [10]:
# What is the expected amount of total interactions when a facebook page with with 80000 likes posts a paid Category 2 Photo on
# a wednesday at 3pm in March?
page_total_likes, post_type, category, post_month, post_weekday, post_hour, paid = 80000, 2, 2, 3, 3, 15, 1
sample_prediction = linear_model.predict([[page_total_likes, post_type, category, post_month, post_weekday, post_hour, paid]])
print(f'Expected Total Interactions: {sample_prediction[0]:.0f}')

Expected Total Interactions: 114


### Decision Support System

In [0]:
# Iterate through all possible post types, months, weekdays, hours and paid flags
# for a fixed amount of page likes (80000) and fixed category (3).

results = []
feature_sets = []


for post_type in range(0, 4):
  for post_month in range(1, 13):
    for post_weekday in range(1, 8):
      for post_hour in range(0, 25):
        for paid in range(0, 2):
          prediction = [[80000, post_type, 3, post_month, post_weekday, post_hour, paid]]

          result = linear_model.predict(prediction)

          results.append(result[0])
          feature_sets.append([80000, post_type, 3, post_month, post_weekday, post_hour, paid])

### Result

In [12]:
idx_best_result = results.index(max(results))
best_feature_set = feature_sets[idx_best_result]
decoded_post_type = label_encoder.inverse_transform([best_feature_set[1]])[0]

print(f'Out of {len(results)} combinations, the feature set with index {idx_best_result} performed best.')
print(f'Expected Total Interactions: {results[idx_best_result]:.2f}')
print(f'Post Type: {decoded_post_type}, Hour: {best_feature_set[3]}, Hour: {best_feature_set[4]}, Hour: {best_feature_set[5]}, Paid: {best_feature_set[6]}')

Out of 16800 combinations, the feature set with index 12601 performed best.
Expected Total Interactions: 113.65
Post Type: Video, Hour: 1, Hour: 1, Hour: 0, Paid: 1


  if diff:


## Evaluation

In [0]:
# Symmetric Mean Absolute Percentage Error
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

### 10-Fold-Cross-Validation

In [0]:
# Set seed for "deterministic randomness".
np.random.seed(1909)

# Initialize 10-Fold-Cross-Validation.
k_fold = KFold(n_splits=10)

# Empty lists for persisting the performances measures calculated in each iteration. 
rmses = []
smapes = []

for train_idx, test_idx in k_fold.split(X):
  # Split dataset into a train and test set.
  X_train, X_test = X[train_idx], X[test_idx]
  y_train, y_test = y[train_idx], y[test_idx]
  
  # Run the logistic regression on the current training set.
  linear_model.fit(X_train, y_train)
  
  # Perform the Prediction on the Test-Set-Features.
  y_pred = linear_model.predict(X_test)
  
  # Calculate the performance measures on the current test set.
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  smape = symmetric_mean_absolute_percentage_error(y_test, y_pred)
  
  # Append to performances measures to lists.
  rmses.append(rmse)
  smapes.append(smape)

#### Result

In [15]:
print(f'Average RMSE: {np.mean(rmses):.2f}')
print(f'Average SMAPE: {np.mean(smapes):.2f}%')

Average RMSE: 336.52
Average SMAPE: 81.54%
