# Ablation Analysis of the Prediction Model

Using the data stored in `top_posts.csv.gz`, this script aims to build a model that can predict the number of upvotes (likes) given all other data in the file, excluding `number_of_upvotes`, `total_votes`, and `number_of_downvotes`.

This script attempts to get a better understanding of how each feature influences the model through [Ablation Analysis](https://stats.stackexchange.com/questions/380040/what-is-an-ablation-study-and-is-there-a-systematic-way-to-perform-it). This means the model will be run with specific features removed, and the performance of each variant compared on the same dataset. To reduce clutter, this script uses methods stored in `ablation.py`.

In [1]:
import gzip
import nltk
import random
import numpy as np
from ablation import *
from csv import writer
from csv import DictReader
from datetime import date
from datetime import datetime
from collections import defaultdict
from sklearn.linear_model import Ridge
from nltk.tokenize import word_tokenize

In [2]:
# Download the NLTK resources this notebook needs: 'punkt' (used by
# word_tokenize) and the averaged perceptron tagger.
# NOTE(review): the tagger is not used directly in this notebook; presumably
# the part-of-speech features in ablation.py need it -- confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/dwolfson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dwolfson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Debug variables recorded alongside each model run.
# `date` is the run date formatted like "Jan 05". It is built from `datetime`
# rather than `date.today()` because this assignment rebinds the name `date`
# from the imported class to a string; calling `date.today()` here would raise
# AttributeError the second time the cell is executed.
date = datetime.now().strftime("%b %d")
# Baseline MSE placeholder; overwritten once the baseline has been computed.
baseline = 0
# Placeholders for the model MSE and the feature description.
pred_mse = 0
feature_list = []
# Flipped to True once the data has been shuffled.
shuffle = False

In [4]:
# Size of the popular-word list: only the n most frequent title tokens are
# kept. Per the author, 600 was the most optimal size found.
n = 600

# Regularisation strength passed to Ridge regression. Per the author, 1000
# was the most optimal alpha found.
alpha = 1000

In [5]:
# Every post read from the CSV, one dict per row
data = []
# Token -> number of occurrences across all post titles (missing tokens start at 0)
word_popularity = defaultdict(int)

In [6]:
# One record per model run; the whole list is appended to the MSE log CSV at the end
performance = []

In [7]:
# Load every post as a dict (one per CSV row) and count how often each title
# token occurs across the whole file.
# encoding='utf-8' removes the dependence on the platform's default text
# encoding, and newline='' is what the csv module asks for so that newlines
# inside quoted fields reach the reader untouched.
# NOTE(review): assumes the archive is UTF-8 encoded -- confirm against
# whatever produced it.
with gzip.open('../data/top_posts.csv.gz', 'rt', encoding='utf-8', newline='') as file:
    csv_reader = DictReader(file)

    for row in csv_reader:
        data.append(row)
        for word in word_tokenize(row['title']):
            word_popularity[word] += 1

In [8]:
# Rank the title tokens from most to least frequent (ties keep their
# first-seen order) and keep only the n most common ones. From here on
# `word_popularity` is a plain list of words rather than a count mapping.
ranked_words = sorted(word_popularity, key=word_popularity.get, reverse=True)
word_popularity = ranked_words[:n]

In [9]:
# Shuffle the data since it's sorted by subreddit to give a fairer distribution
# NOTE(review): no random seed is set in the visible code, so the
# train/validation/test membership differs from run to run.
random.shuffle(data)
# Remember that the rows were shuffled; this flag is stored with every performance record
shuffle = True

In [10]:
# One list of feature vectors per model variant: X_all is filled by
# feature_all, and each X_exc_* list is filled by the matching feature_exc_*
# helper (the variant with that feature group excluded).
X_all = []
X_exc_score = []
X_exc_num_com = []
X_exc_len_char = []
X_exc_len_word = []
X_exc_oc = []
X_exc_pos = []
X_exc_ohe = []
X_exc_popular_word = []

In [11]:
# y is the list of labels (correct values): the integer upvote count of each
# post, in the same order as the feature lists
y = []

In [12]:
# Pair every feature list with the helper that fills it. The order here is
# the order in which the helpers are called for each post.
feature_builders = (
    (X_all, feature_all),
    (X_exc_score, feature_exc_score),
    (X_exc_num_com, feature_exc_num_com),
    (X_exc_len_char, feature_exc_len_char),
    (X_exc_len_word, feature_exc_len_word),
    (X_exc_oc, feature_exc_oc),
    (X_exc_pos, feature_exc_pos),
    (X_exc_ohe, feature_exc_ohe),
    (X_exc_popular_word, feature_exc_popular_word),
)

# Build one feature vector per variant for every post, plus its label
for datum in data:
    for feature_rows, build_features in feature_builders:
        feature_rows.append(build_features(datum, word_popularity, n))
    # The label is the post's upvote count, stored as text in the CSV
    y.append(int(datum['number_of_upvotes']))

In [13]:
# House cleaning: the raw posts and the popular-word list are no longer
# needed once the feature vectors exist, so drop the references.
# The word list built in this script is named `word_popularity`; the name
# `popular_words` is never defined here, so deleting it would raise NameError.
del data
del word_popularity

In [14]:
# Split boundaries, as indices into the shuffled rows: training (80%),
# validation (10%), and test (the remaining ~10%).
train = round(len(X_all) * 0.8)
valid = train + round(len(X_all) * 0.1)
# The test slice runs from `valid` to the end of the data (the labels are
# split as y[valid:]), so its end index is the row count. This used to repeat
# the `valid` expression, which would have described an empty test range.
tests = len(X_all)

In [15]:
# Slice the labels with the same boundaries used for the feature vectors
y_train = y[:train]
y_valid = y[train:valid]
# Everything after the validation slice is held out for testing
y_tests = y[valid:]

In [16]:
# Baseline predictor: always guess the average number of upvotes.
# The average comes from the training labels only, so the validation and test
# labels do not leak into the baseline it is later scored against.
average_likes = np.mean(y_train)
# One constant prediction per post, so it can be sliced like the labels
y_avg = [average_likes] * len(y)

In [17]:
# Get the MSE from the baseline averages, scored on the validation slice.
# MSE is not defined in this notebook; presumably it comes from ablation.py
# via the star import and computes the mean squared error -- confirm.
baseline = MSE(y_avg[train:valid], y_valid)

In [18]:
# Full model: fit Ridge regression on the training slice of X_all
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_all[:train], y_train)

# Score on the validation slice, then log one record:
# run date, baseline MSE, model MSE, features used, shuffle flag
validation_mse = MSE(model.predict(X_all[train:valid]), y_valid)
row = [
    date,
    baseline,
    validation_mse,
    'score|number_of_comments|title_length|title_word_length|orginal_content|parts_of_speech|ohe_hour|ohe_week|{}_popular_words'.format(n),
    shuffle,
]
performance.append(row)

# Drop the reference to this feature list now that the run is recorded
del X_all

In [19]:
# Ablation run on X_exc_score (the recorded feature list omits `score`)
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_exc_score[:train], y_train)

# Score on the validation slice, then log one record:
# run date, baseline MSE, model MSE, features used, shuffle flag
validation_mse = MSE(model.predict(X_exc_score[train:valid]), y_valid)
row = [
    date,
    baseline,
    validation_mse,
    'number_of_comments|title_length|title_word_length|orginal_content|parts_of_speech|ohe_hour|ohe_week|{}_popular_words'.format(n),
    shuffle,
]
performance.append(row)

# Drop the reference to this feature list now that the run is recorded
del X_exc_score

In [20]:
# Ablation run on X_exc_num_com (the recorded feature list omits `number_of_comments`)
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_exc_num_com[:train], y_train)

# Score on the validation slice, then log one record:
# run date, baseline MSE, model MSE, features used, shuffle flag
validation_mse = MSE(model.predict(X_exc_num_com[train:valid]), y_valid)
row = [
    date,
    baseline,
    validation_mse,
    'score|title_length|title_word_length|orginal_content|parts_of_speech|ohe_hour|ohe_week|{}_popular_words'.format(n),
    shuffle,
]
performance.append(row)

# Drop the reference to this feature list now that the run is recorded
del X_exc_num_com

In [21]:
# Ablation run on X_exc_len_char (the recorded feature list omits `title_length`)
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_exc_len_char[:train], y_train)

# Score on the validation slice, then log one record:
# run date, baseline MSE, model MSE, features used, shuffle flag
validation_mse = MSE(model.predict(X_exc_len_char[train:valid]), y_valid)
row = [
    date,
    baseline,
    validation_mse,
    'score|number_of_comments|title_word_length|orginal_content|parts_of_speech|ohe_hour|ohe_week|{}_popular_words'.format(n),
    shuffle,
]
performance.append(row)

# Drop the reference to this feature list now that the run is recorded
del X_exc_len_char

In [22]:
# Ablation run on X_exc_len_word (the recorded feature list omits `title_word_length`)
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_exc_len_word[:train], y_train)

# Score on the validation slice, then log one record:
# run date, baseline MSE, model MSE, features used, shuffle flag
validation_mse = MSE(model.predict(X_exc_len_word[train:valid]), y_valid)
row = [
    date,
    baseline,
    validation_mse,
    'score|number_of_comments|title_length|orginal_content|parts_of_speech|ohe_hour|ohe_week|{}_popular_words'.format(n),
    shuffle,
]
performance.append(row)

# Drop the reference to this feature list now that the run is recorded
del X_exc_len_word

In [23]:
# Ablation run on X_exc_oc (the recorded feature list omits `orginal_content`)
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_exc_oc[:train], y_train)

# Score on the validation slice, then log one record:
# run date, baseline MSE, model MSE, features used, shuffle flag
validation_mse = MSE(model.predict(X_exc_oc[train:valid]), y_valid)
row = [
    date,
    baseline,
    validation_mse,
    'score|number_of_comments|title_length|title_word_length|parts_of_speech|ohe_hour|ohe_week|{}_popular_words'.format(n),
    shuffle,
]
performance.append(row)

# Drop the reference to this feature list now that the run is recorded
del X_exc_oc

In [24]:
# Ablation run on X_exc_pos (the recorded feature list omits `parts_of_speech`)
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_exc_pos[:train], y_train)

# Score on the validation slice, then log one record:
# run date, baseline MSE, model MSE, features used, shuffle flag
validation_mse = MSE(model.predict(X_exc_pos[train:valid]), y_valid)
row = [
    date,
    baseline,
    validation_mse,
    'score|number_of_comments|title_length|title_word_length|orginal_content|ohe_hour|ohe_week|{}_popular_words'.format(n),
    shuffle,
]
performance.append(row)

# Drop the reference to this feature list now that the run is recorded
del X_exc_pos

In [25]:
# Ablation run on X_exc_ohe (the recorded feature list omits `ohe_hour` and `ohe_week`)
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_exc_ohe[:train], y_train)

# Score on the validation slice, then log one record:
# run date, baseline MSE, model MSE, features used, shuffle flag
validation_mse = MSE(model.predict(X_exc_ohe[train:valid]), y_valid)
row = [
    date,
    baseline,
    validation_mse,
    'score|number_of_comments|title_length|title_word_length|orginal_content|parts_of_speech|{}_popular_words'.format(n),
    shuffle,
]
performance.append(row)

# Drop the reference to this feature list now that the run is recorded
del X_exc_ohe

In [26]:
# Ablation run on X_exc_popular_word (the recorded feature list omits the popular-word entry)
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_exc_popular_word[:train], y_train)

# Score on the validation slice, then log one record:
# run date, baseline MSE, model MSE, features used, shuffle flag
validation_mse = MSE(model.predict(X_exc_popular_word[train:valid]), y_valid)
row = [
    date,
    baseline,
    validation_mse,
    'score|number_of_comments|title_length|title_word_length|orginal_content|parts_of_speech|ohe_hour|ohe_week',
    shuffle,
]
performance.append(row)

# Drop the reference to this feature list now that the run is recorded
del X_exc_popular_word

In [27]:
# Append the debug data from this script to the CSV of MSE records.
# newline='' is what the csv module asks for on files handed to a writer;
# without it, platforms that write \r\n line endings get an extra \r in
# every record.
with open('../data/prediction_model_MSE.csv', 'a', newline='') as file:
    csv_writer = writer(file)

    # One CSV line per model run, in the order the runs were recorded
    csv_writer.writerows(performance)