# Optimize The Prediction Model With Ridge Regression

Using the data stored in `top_posts.csv.gz`, this script aims to build a model that predicts the number of upvotes (likes) from all other data in the file, excluding `number_of_upvotes`, `total_votes`, and `number_of_downvotes`.

This script uses Ridge Regression and tries to optimize the model's performance by tuning the model's lambda/alpha (regularization strength) value.

In [1]:
import gzip
import nltk
import numpy as np
from csv import DictReader
from datetime import date
from datetime import datetime
from sklearn import linear_model
from sklearn.linear_model import Ridge
from nltk.tokenize import word_tokenize

In [2]:
# Download a few needed packages for the nltk
# 'punkt' provides the tokenizer models used by word_tokenize
nltk.download('punkt')
# 'averaged_perceptron_tagger' provides the tagger model used by nltk.pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/dwolfson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dwolfson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Debug variables
# The run date is built from `datetime` instead of calling `date.today()`:
# assigning to `date` replaces the imported `date` class with a string, so the
# original expression failed with AttributeError when this cell was re-run.
date = datetime.now().strftime("%b %d")
# Placeholders; none of the cells shown here update these two values
baseline = 0
pred_mse = 0
# Names of the feature groups in use (filled in by a later cell)
feature_list = []
shuffle = False

In [4]:
# Returns the MSE of a list of predictions & labels
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Args:
        predictions: iterable of predicted values.
        labels: iterable of true values, paired with predictions by position.

    Returns:
        The mean of the squared element-wise differences.

    Raises:
        ValueError: if the inputs are empty or have different lengths.
    """
    # Materialize both inputs so their lengths can be compared; zip() alone
    # would silently drop the unmatched tail of the longer input.
    predictions = list(predictions)
    labels = list(labels)

    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length "
            "({} != {})".format(len(predictions), len(labels))
        )
    if not predictions:
        raise ValueError("cannot compute the MSE of empty inputs")

    squared_errors = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(squared_errors) / len(squared_errors)

In [5]:
# Returns a list of the frequencies of a given title's Parts of Speech
def parts_of_speech(title):
    """Count how often each generalized Part of Speech occurs in a title.

    The title is tokenized, every token is tagged by nltk, and each tag is
    bucketed by its prefix into one of six categories.
    More Info Here: https://realpython.com/nltk-nlp-python/#tagging-parts-of-speech

    Returns:
        A list of six counts, in this order:
        [Adjectives, Nouns, Adverbs, Pronouns, Verbs, Determiners].
        Tokens whose tag matches none of the prefixes are not counted.
    """
    # Tag prefixes, listed in the same order as the returned counts:
    # JJ* adjectives, NN* nouns, RB* adverbs, PRP* pronouns, VB* verbs,
    # DT* determiners.
    tag_prefixes = ("JJ", "NN", "RB", "PRP", "VB", "DT")
    counts = [0] * len(tag_prefixes)

    # Each tagged entry pairs a token with its Part of Speech label
    for tagged in nltk.pos_tag(word_tokenize(title)):
        label = tagged[1]
        for slot, prefix in enumerate(tag_prefixes):
            if label.startswith(prefix):
                counts[slot] += 1
                # A token contributes to at most one category
                break

    return counts

In [6]:
# Returns a one-hot encoding (OHE) of the hour of day, and weekday
# OHE allows for encoding an n-length list of binary features in n - 1 space
def one_hot_encoding_time(unixtime, tz=None):
    """One-hot encode the hour of day and the weekday of a unix timestamp.

    Args:
        unixtime: the timestamp in seconds, as a number or numeric string
            (for example "1600000000.0"); any fractional part is dropped.
        tz: optional tzinfo used to interpret the timestamp. With the default
            of None the machine's local timezone is used, so the encoding can
            differ between machines; pass e.g. datetime.timezone.utc for
            output that is reproducible everywhere.

    Returns:
        A list of 29 zeros and ones: 23 hour slots followed by 6 weekday
        slots. Hour 0 and weekday 0 (Monday) are the reference categories and
        are encoded as all zeros in their group.
    """
    hour = [0] * 23
    week = [0] * 6

    # Convert the timestamp to a datetime in the requested (or local) timezone
    moment = datetime.fromtimestamp(int(float(unixtime)), tz=tz)

    # One hot encode the hour (hour 0 is just a list of 0's)
    # https://docs.python.org/3/library/datetime.html#datetime.datetime.hour
    if moment.hour != 0:
        hour[moment.hour - 1] = 1

    # One hot encode the weekday (day 0 is just a list of 0's)
    # https://docs.python.org/3/library/datetime.html#datetime.date.weekday
    weekday = moment.weekday()
    if weekday != 0:
        week[weekday - 1] = 1

    return hour + week
    

In [7]:
# Creates a feature vector for a given row of data
def feature(datum):
    """Build the feature vector for one row of the dataset.

    Args:
        datum: a mapping with the keys 'score', 'number_of_comments',
            'title' and 'unixtime'.

    Returns:
        A flat list made of, in order: a constant 1, the integer score
        (described by the original author as the price of awards given),
        the integer number of comments, the Part of Speech counts of the
        title, and the one-hot encoding of the post's hour and weekday.
    """
    # Title-length features are currently disabled:
    #   len(datum['title'])                       (character length)
    #   len(datum['title'].strip().split(' '))    (word length)
    return [
        1,
        int(datum['score']),
        int(datum['number_of_comments']),
        *parts_of_speech(datum['title']),
        *one_hot_encoding_time(datum['unixtime']),
    ]

In [8]:
# Record the names of the feature groups that feature() currently emits
# NOTE: this extends the existing list, so re-running the cell adds the names again
feature_list.extend([
    'score',
    'number_of_comments',
    'parts_of_speech',
    'ohe_hour',
    'ohe_week',
])

In [9]:
# Container for the rows of the dataset (one dict per post)
data = list()

In [10]:
# Open and store each post as a list of dict elements
# newline='' lets the csv module handle line endings itself, which it needs in
# order to parse quoted fields containing line breaks correctly.
# The encoding is pinned so decoding does not depend on the platform default
# (assumes the file is UTF-8 encoded -- TODO confirm).
with gzip.open('../data/top_posts.csv.gz', 'rt', encoding='utf-8', newline='') as file:
    csv_reader = DictReader(file)

    # Rebuild the list from scratch so re-running this cell does not
    # load the same rows a second time
    data = list(csv_reader)

In [11]:
# X holds every feature vector; y holds the matching labels (correct values)
X, y = [], []

In [12]:
# Start from empty lists so re-running this cell rebuilds X and y instead of
# appending a second copy of every row
X, y = [], []

for datum in data:
    # Feature vector for the post, and its label: the number of upvotes
    X.append(feature(datum))
    y.append(int(datum['number_of_upvotes']))

In [13]:
# Split the data between training (80%), validation (10%), and test (10%)
# Each name is the exclusive end index of its split within X and y
train = round(len(X) * 0.8)
valid = train + round(len(X) * 0.1)
# The test split takes everything after the validation split, so it ends at
# the end of the data (it was previously computed with the same expression
# as `valid`, which described an empty range)
tests = len(X)

In [14]:
# Cut the feature vectors into contiguous train / validation / test slices
X_train, X_valid, X_tests = X[:train], X[train:valid], X[valid:]

# Cut the labels at the same boundaries so they stay aligned with X
y_train, y_valid, y_tests = y[:train], y[train:valid], y[valid:]

In [15]:
# Candidate regularization strengths (lambda/alpha) to evaluate
lambdas = [0, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Initialize the model; it is fit to the training data once per lambda later.
# No intercept is fitted because every feature vector starts with a constant 1.
model = Ridge(alpha=1.0, fit_intercept=False)

# Validation MSE of the model for each lambda, in the same order as `lambdas`
valid_perf = []

In [16]:
# Refit the model for every candidate lambda and score it on the validation set
# NOTE: results are appended to valid_perf, so re-running this cell without
# resetting valid_perf first adds a second set of scores
for alpha in lambdas:
    # set_params returns the estimator, so the refit can be chained onto it
    model.set_params(alpha=alpha).fit(X_train, y_train)

    # Predictions for the validation set at this lambda
    pred_valid = model.predict(X_valid)

    # Store the performance (MSE)
    valid_perf.append(MSE(pred_valid, y_valid))

# Pick the lambda with the lowest validation MSE (first one wins on ties)
lowest_mse = min(valid_perf)
index = valid_perf.index(lowest_mse)
best_lambda = lambdas[index]

print("Best Lambda: {}\tBest MSE: {}".format(best_lambda, lowest_mse))
print(valid_perf)

Best Lambda: 1000	Best MSE: 142242452.0761398
[143563528.69676974, 143563527.453563, 143563516.2646242, 143563404.36827308, 143562284.71015686, 143551021.21061182, 143433725.32018587, 142242452.0761398]
