# The Prediction Model

Using the data stored in `top_posts.csv.gz`, this script aims to build a model that can predict the number of upvotes (likes) given all other data in the file, excluding `number_of_upvotes`, `total_votes`, and `number_of_downvotes`.

This script relies on the results of the notebooks `optimize_top_words.ipynb` and `prediction_model_ablation.ipynb` for the feature vector, the size `n` of the list of most popular words, and the `alpha` value assigned to the Ridge Regression model.

In [1]:
import gzip
import nltk
import random
import numpy as np
from csv import writer
from csv import DictReader
from datetime import date
from datetime import datetime
from collections import defaultdict
from sklearn.linear_model import Ridge
from nltk.tokenize import word_tokenize

In [2]:
# Download the NLTK packages used below
# (presumably 'punkt' backs word_tokenize and the tagger backs nltk.pos_tag)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/dwolfson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dwolfson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Debug variables (written to the MSE record CSV at the end of the script)
# The run date is taken from datetime.now() rather than date.today(): the name
# `date` is rebound to a string here, so a second run of `date.today()` would
# fail with AttributeError. The formatted value is the same.
date = datetime.now().strftime("%b %d")
baseline = 0
pred_mse = 0
feature_list = []
shuffle = False

In [4]:
# Optimal length n of the list of most popular words
# (value taken from the companion notebooks named in the introduction)
n = 600

# Optimal alpha (regularization strength) for the Ridge Regression model
# (value taken from the companion notebooks named in the introduction)
alpha = 1000

In [5]:
# Returns the MSE (mean squared error) of a list of predictions & labels
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Args:
        predictions: Iterable of predicted numeric values.
        labels: Iterable of true numeric values, one per prediction.

    Returns:
        The mean of the squared element-wise differences.

    Raises:
        ValueError: If the inputs are empty or have different lengths.
    """
    # Materialize both inputs so their lengths can be compared; zip() alone
    # would silently drop the unmatched tail of the longer input
    predictions = list(predictions)
    labels = list(labels)

    if len(predictions) != len(labels):
        raise ValueError(
            f"predictions and labels differ in length: "
            f"{len(predictions)} != {len(labels)}"
        )
    if not predictions:
        raise ValueError("cannot compute the MSE of empty inputs")

    squared_errors = [(p - t) ** 2 for p, t in zip(predictions, labels)]
    return sum(squared_errors) / len(squared_errors)

In [6]:
# Returns a list of the frequencies of a given title's Parts of Speech
def parts_of_speech(title):
    """Count the generalized Parts of Speech found in a title.

    The title is tokenized with word_tokenize and tagged with nltk.pos_tag.
    Each tag is then bucketed by its prefix into one of six categories.
    More info: https://realpython.com/nltk-nlp-python/#tagging-parts-of-speech

    Returns:
        A list of six counts, ordered as: adjectives (JJ*), nouns (NN*),
        adverbs (RB*), pronouns (PRP*), verbs (VB*), determiners (DT*).
        Tokens whose tag matches none of these prefixes are not counted.
    """
    # Tag prefixes, in the same order as the returned counts
    tag_prefixes = ("JJ", "NN", "RB", "PRP", "VB", "DT")
    counts = [0] * len(tag_prefixes)

    # Each tagged token is a (token, tag) pair; only the tag is needed
    for _, tag in nltk.pos_tag(word_tokenize(title)):
        for slot, prefix in enumerate(tag_prefixes):
            if tag.startswith(prefix):
                counts[slot] += 1
                break  # a tag is counted in at most one category

    return counts

In [7]:
# Returns a one-hot encoding (OHE) of the hour of day and of the weekday
# The first category of each is dropped, so n categories need only n - 1 slots
def one_hot_encoding_time(unixtime):
    """Encode the hour and weekday of a unix timestamp as binary features.

    Args:
        unixtime: Unix timestamp as a number or numeric string; any
            fractional part is discarded.

    Returns:
        A list of 29 zeros/ones: 23 slots for hours 1-23 followed by 6 slots
        for weekdays 1-6. Hour 0 and weekday 0 are represented by their
        slots being all zeros.
    """
    # fromtimestamp() without a tz argument yields the machine's local time
    moment = datetime.fromtimestamp(int(float(unixtime)))
    hour_of_day = moment.hour
    day_of_week = moment.weekday()

    # https://docs.python.org/3/library/datetime.html#datetime.datetime.hour
    hour_slots = [1 if slot == hour_of_day else 0 for slot in range(1, 24)]

    # https://docs.python.org/3/library/datetime.html#datetime.date.weekday
    week_slots = [1 if slot == day_of_week else 0 for slot in range(1, 7)]

    return hour_slots + week_slots
    

In [8]:
# Helper function: returns a list that marks which popular words a title has
def popular_words(title, n, n_popular_words):
    """Build a binary presence vector of popular words for a title.

    Args:
        title: The post title; it is split into tokens with word_tokenize.
        n: Length of the returned list.
        n_popular_words: Sequence of popular words; a word's position in
            this sequence is its position in the returned list.

    Returns:
        A list of n zeros/ones, with a 1 at the position of every popular
        word that occurs among the title's tokens.
    """
    presence = [0] * n

    for token in word_tokenize(title):
        try:
            position = n_popular_words.index(token)
        except ValueError:
            # The token is not one of the popular words
            continue
        presence[position] = 1

    return presence

In [9]:
# Creates a feature vector for a given row of data
def feature(datum):
    """Build the feature vector for one post.

    Args:
        datum: A row of the posts CSV as a dict; the keys read here are
            'score', 'number_of_comments', 'title' and 'unixtime'.

    Returns:
        A flat list: a constant 1, the scalar features, the Parts of Speech
        counts, the hour/weekday encoding, and the popular-word flags.
    """
    title = datum['title']

    # Only the n most popular words are used as features
    top_words = word_popularity[:n]

    vector = [
        # Constant term
        1,
        # Score (the original comment describes it as the price of awards given)
        int(datum['score']),
        # Number of comments
        int(datum['number_of_comments']),
        # Character length of the title
        len(title),
        # Token length of the title
        len(word_tokenize(title)),
        # Binary flag: is the content declared original (OC)?
        1 if "[oc]" in title.lower() else 0,
    ]

    # Frequencies of the generalized Parts of Speech
    vector += parts_of_speech(title)

    # One-hot encoding of the hour and weekday
    vector += one_hot_encoding_time(datum['unixtime'])

    # Presence flags for the n most popular words
    vector += popular_words(title, n, top_words)

    return vector

In [10]:
# Record the names of the features in use, for the debug record of this run
feature_list.extend([
    'score',
    'number_of_comments',
    'title_length',
    'title_word_length',
    'orginal_content',
    'parts_of_speech',
    'ohe_hour',
    'ohe_week',
    '{}_popular_words'.format(n),
])

In [11]:
# Rows of the posts CSV, one dict per post
data = []
# Maps each title token to the number of times it occurs across all titles
word_popularity = defaultdict(int)

In [12]:
# Open and store each post as a list of dict elements, counting how often
# each title token occurs along the way
# newline='' lets the csv module handle line endings itself, as its
# documentation requires for files passed to a reader
# NOTE(review): no encoding is given, so the platform default is used — confirm
with gzip.open('../data/top_posts.csv.gz', 'rt', newline='') as file:
    csv_reader = DictReader(file)

    for row in csv_reader:
        data.append(row)
        for word in word_tokenize(row['title']):
            word_popularity[word] += 1

In [13]:
# Replace the token -> count mapping with a list of the tokens, ordered from
# most to least frequent (the sort is stable, so ties keep their existing order)
word_popularity = sorted(word_popularity, key=word_popularity.get, reverse=True)

In [14]:
# Shuffle the data since it's sorted by subreddit to give a fairer distribution
# NOTE(review): no random seed is set in this notebook, so the
# train/validation/test split (and the resulting MSE values) differ per run
random.shuffle(data)
# Flag recorded in the debug record: the data was shuffled for this run
shuffle = True

In [15]:
# X is the list of all feature vectors (one per post)
X = []
# y is the list of labels (the correct number of upvotes for each post)
y = []

In [16]:
# Build one feature vector and one label (number of upvotes) per post
X.extend(map(feature, data))
y.extend(int(post['number_of_upvotes']) for post in data)

In [17]:
# Sanity check: number of feature vectors built
len(X)

246472

In [18]:
# Sanity check: number of labels (should match the number of feature vectors)
len(y)

246472

In [19]:
# Split the data between training (80%), validation (10%), and test (10%)
# Each name holds the END index (exclusive) of its slice of X / y
train = round(len(X) * 0.8)
valid = train + round(len(X) * 0.1)
# The test set is everything after the validation slice, so it ends at len(X)
tests = len(X)

print(f"Train Data Points:  {train}")
print(f"Valid Data Points:  {valid - train}")
print(f"Test Data Points:   {tests - valid}")

Train Data Points:  197178
Valid Data Points:  24647
Test Data Points:   24647


In [20]:
# Slice the feature vectors and the labels at the same boundaries:
# [0, train) for training, [train, valid) for validation, the rest for testing
X_train, X_valid, X_tests = X[:train], X[train:valid], X[valid:]
y_train, y_valid, y_tests = y[:train], y[train:valid], y[valid:]

In [21]:
# Get a baseline by testing the model against the average label
# NOTE(review): the mean is taken over ALL labels, including the validation
# and test labels the baseline is later scored on; np.mean(y_train) would
# avoid that leakage
average_likes = np.mean(y)
# One copy of the average per post, so it can be sliced like y
y_avg = [average_likes] * len(y)

In [22]:
# Summarize the labels: smallest, largest and average number of upvotes
print(f"Minimum Number of Upvotes: {min(y):,}")
print(f"Maximum Number of Upvotes: {max(y):,}")
print(f"Average Number of Upvotes: {average_likes:,}")

Minimum Number of Upvotes: 0
Maximum Number of Upvotes: 430,539
Average Number of Upvotes: 14,947.346384173456


In [23]:
# Initialize the model and fit it to the training data
# fit_intercept=False: presumably because feature() already puts a constant 1
# at the start of every feature vector
model = Ridge(fit_intercept=False, alpha=alpha)
model.fit(X_train, y_train)

Ridge(alpha=1000, fit_intercept=False)

In [24]:
# Run the fitted model on the validation set
y_valid_pred = model.predict(X_valid)

In [25]:
# Baseline MSE: score the average label against the validation labels
baseline = MSE(y_avg[train:valid], y_valid)
print(f"{baseline:,}")

514,176,081.0839548


In [26]:
# Model MSE: score the validation predictions against the validation labels
pred_mse = MSE(y_valid_pred, y_valid)
print(f"{pred_mse:,}")

366,683,173.115461


In [27]:
# Append the debug data from this script to the CSV of MSE records
# newline='' stops the file object from translating the line endings the csv
# writer emits, which otherwise produces blank rows on some platforms
with open('../data/prediction_model_MSE.csv', 'a', newline='') as file:
    csv_writer = writer(file)

    # Columns: run date, baseline MSE, validation MSE, features used, shuffled?
    row = [date, baseline, pred_mse, '|'.join(feature_list), shuffle]

    csv_writer.writerow(row)

In [28]:
# Finally, run the model on the held-out test set and report its MSE
# (this overwrites pred_mse, which held the validation MSE until now)
y_tests_pred = model.predict(X_tests)
pred_mse = MSE(y_tests_pred, y_tests)
print(f"{pred_mse:,}")

362,060,674.3704694
