# Optimize The Prediction Model With The Top n Words

Using the data stored in `top_posts.csv.gz`, this script aims to build a model that can predict the number of upvotes (likes) from all other data in the file, excluding `number_of_upvotes`, `total_votes`, and `number_of_downvotes`.

This script helps further optimize the actual Prediction Model script by finding the best value of `n`, where `n` is the number of most popular words to encode as an `n`-length binary list in the feature vector. At the same time, it finds the best `alpha` value for Ridge Regression.

In [1]:
import gzip
import nltk
from csv import DictReader
from datetime import date
from datetime import datetime
from sklearn.linear_model import Ridge
from collections import defaultdict
from nltk.tokenize import word_tokenize

In [2]:
# Download the NLTK resources this notebook relies on: the 'punkt' models
# are used by word_tokenize and 'averaged_perceptron_tagger' by nltk.pos_tag.
# Each call is a no-op when the package is already up to date.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/dwolfson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dwolfson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Debug variables
# NOTE: `date` deliberately rebinds the imported `datetime.date` class to a
# display string such as "Jan 05". The string is built from `datetime` rather
# than `date.today()` so this cell can be re-run: once `date` is a str,
# `date.today()` raises AttributeError.
date = datetime.now().strftime("%b %d")
# Placeholders that start out empty/zero
baseline = 0
pred_mse = 0
feature_list = []
shuffle = False

In [4]:
# Returns the MSE (mean squared error) of a list of predictions & labels
#
# predictions -- iterable of predicted values
# labels      -- iterable of true values, aligned element-wise with predictions
#
# Raises ValueError when the inputs differ in length (zip would otherwise drop
# the surplus values and silently report a wrong error) or when they are empty
# (the mean of nothing is undefined).
def MSE(predictions, labels):
    # Materialize both inputs so their lengths can be compared
    predictions = list(predictions)
    labels = list(labels)

    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length "
            "({} != {})".format(len(predictions), len(labels))
        )
    if not predictions:
        raise ValueError("cannot compute the MSE of empty inputs")

    squared_errors = [(p - t) ** 2 for p, t in zip(predictions, labels)]
    return sum(squared_errors) / len(squared_errors)

In [5]:
# Returns a list with the frequencies of a given title's Parts of Speech, in
# the order: adjectives, nouns, adverbs, pronouns, verbs, determiners
def parts_of_speech(title):
    # Tag prefix of each of the 6 generalized categories; the position of a
    # prefix is the position of its count in the returned list
    tag_prefixes = ("JJ", "NN", "RB", "PRP", "VB", "DT")
    counts = [0] * len(tag_prefixes)

    # Tokenize the title and pair each token with its Part of Speech label
    # More Info Here: https://realpython.com/nltk-nlp-python/#tagging-parts-of-speech
    for _, tag in nltk.pos_tag(word_tokenize(title)):
        # A tag belongs to the first category whose prefix it starts with;
        # tags that match no prefix are not counted
        for slot, prefix in enumerate(tag_prefixes):
            if tag.startswith(prefix):
                counts[slot] += 1
                break

    return counts

In [6]:
# Returns a one-hot encoding (OHE) of the hour of day followed by the weekday
# A category with n possible values only needs n - 1 binary slots, because the
# remaining value is represented by all slots being 0
def one_hot_encoding_time(unixtime):
    # Interpret the unix timestamp (number or numeric string) in local time
    moment = datetime.fromtimestamp(int(float(unixtime)))

    # 23 slots for hours 1..23 (hour 0 is just a list of 0's)
    # https://docs.python.org/3/library/datetime.html#datetime.datetime.hour
    hour_slots = [int(moment.hour == h) for h in range(1, 24)]

    # 6 slots for weekdays 1..6 (day 0 is just a list of 0's)
    # https://docs.python.org/3/library/datetime.html#datetime.date.weekday
    weekday_slots = [int(moment.weekday() == d) for d in range(1, 7)]

    return hour_slots + weekday_slots
    

In [7]:
# Helper function returns a list that represents the presence of popular words
#
# title           -- raw title text; it is tokenized with word_tokenize
# n               -- length of the returned list
# n_popular_words -- ordered list of words; the word at position i controls
#                    slot i of the result
#
# Returns a list of n flags where slot i is 1 when word i occurs in the title
# and 0 otherwise. Entries of n_popular_words beyond the first n are ignored,
# since an n-length list has no slot for them.
def popular_words(title, n, n_popular_words):
    words = [0] * n

    # Build the word -> slot lookup once so each token costs a single
    # dictionary lookup instead of two linear scans of the list
    # (`in` followed by `.index`). setdefault keeps the first position of a
    # repeated word, which is what `.index` would have returned.
    slot_of = {}
    for slot, word in enumerate(n_popular_words[:len(words)]):
        slot_of.setdefault(word, slot)

    for token in word_tokenize(title):
        slot = slot_of.get(token)
        if slot is not None:
            words[slot] = 1

    return words

In [8]:
# Builds the feature vector for a given row of data
#
# datum           -- one row of the data file, as a dict of strings
# n               -- number of popular-word slots to append
# n_popular_words -- the words those slots stand for
def feature(datum, n, n_popular_words):
    feat = [
        1,                                 # constant term
        int(datum['score']),               # score (price of awards given)
        int(datum['number_of_comments']),  # number of comments
    ]

    title = datum['title']
    feat += [
        len(title),                        # character length of title
        len(word_tokenize(title)),         # word length of title
        int("[oc]" in title.lower()),      # 1 if declared original (OC)
    ]

    # Frequencies of the generalized Parts of Speech
    feat += parts_of_speech(title)

    # One-hot encoding of the Hour and Weekday
    feat += one_hot_encoding_time(datum['unixtime'])

    # Presence of each of the n-most popular words
    feat += popular_words(title, n, n_popular_words)

    return feat

In [9]:
# Word -> number of occurrences; unseen words start at 0
word_popularity = defaultdict(int)
# Rows of the data file, filled in when the file is read
data = []

In [10]:
# Open and store each post as a list of dict elements, counting how often
# every token occurs across all titles along the way.
# The encoding is pinned so decoding does not depend on the platform default
# (assumes the file is UTF-8 -- TODO confirm), and newline='' is what the csv
# module requires so that line breaks inside quoted fields parse correctly.
with gzip.open('../data/top_posts.csv.gz', 'rt', encoding='utf-8', newline='') as file:
    csv_reader = DictReader(file)

    for row in csv_reader:
        data.append(row)
        for word in word_tokenize(row['title']):
            word_popularity[word] += 1

In [11]:
# Replace the word -> count mapping with the list of words ranked from most
# to least frequent; words with equal counts keep their original order
word_popularity = sorted(word_popularity, key=word_popularity.get, reverse=True)

In [12]:
# Ridge Regression alpha values to try for every n
lambdas = [0, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
# Collects one (validation MSE, n, alpha) entry per fitted model
n_performance = []

In [13]:
# Find the best n value (and the best Ridge alpha for it)

# The labels and the split boundaries depend only on the data, not on n, so
# they are computed once instead of on every pass of the loop.
# y is the list of labels (correct values)
y = [int(datum['number_of_upvotes']) for datum in data]

# Split the data between training (80%), validation (10%), and test (10%)
train = round(len(data) * 0.8)
valid = train + round(len(data) * 0.1)
# NOTE(review): `tests` has the same value as `valid` and nothing in this
# cell reads it; kept only in case another cell refers to it.
tests = valid

y_train = y[:train]
y_valid = y[train:valid]
y_tests = y[valid:]

for n in range(0, 1000, 100):
    # Debug to know how far the script is
    print(n)

    # List of the n-most popular words
    n_popular_words = word_popularity[:n]

    # X is the list of all feature vectors
    X = [feature(datum, n, n_popular_words) for datum in data]

    X_train = X[:train]
    X_valid = X[train:valid]
    X_tests = X[valid:]

    # Initialize the model; alpha is overwritten for every fit below.
    # The features already contain a constant term, hence no fitted intercept.
    model = Ridge(alpha=1.0, fit_intercept=False)

    # Try different alpha values for Ridge Regression
    for alpha in lambdas:
        model.set_params(alpha=alpha)
        model.fit(X_train, y_train)

        # Score the model on the validation split
        y_valid_pred = model.predict(X_valid)
        n_performance.append((MSE(y_valid_pred, y_valid), n, alpha))

0
100
200
300
400
500
600
700


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


800


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


900


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


In [14]:
# Pick the (MSE, n, alpha) entry with the lowest validation MSE and print it
best_pair = min(n_performance, key=lambda entry: entry[0])
print("Best Pair: {}".format(best_pair))

Best Pair: (139794359.08308473, 600, 1000)


In [15]:
# Display every (validation MSE, n, alpha) entry that was collected
n_performance

[(144666612.36568236, 0, 0),
 (144666611.2953804, 0, 0.001),
 (144666601.66258094, 0, 0.01),
 (144666505.32726023, 0, 0.1),
 (144665541.2440276, 0, 1),
 (144655830.0795116, 0, 10),
 (144553767.74233425, 0, 100),
 (143486990.92661342, 0, 1000),
 (145941030.4527238, 100, 0),
 (145941026.06890044, 100, 0.001),
 (145940986.6149597, 100, 0.01),
 (145940592.12177563, 100, 0.1),
 (145936651.39476955, 100, 1),
 (145897498.9800274, 100, 10),
 (145519397.08860675, 100, 100),
 (142488273.04050204, 100, 1000),
 (145444486.1367397, 200, 0),
 (145444481.1469494, 200, 0.001),
 (145444436.24238572, 200, 0.01),
 (145443987.54322547, 200, 0.1),
 (145439530.77721465, 200, 1),
 (145396269.88762012, 200, 10),
 (144989620.21423835, 200, 100),
 (141950481.80758524, 200, 1000),
 (145227573.5781384, 300, 0),
 (145227565.99832186, 300, 0.001),
 (145227497.78565672, 300, 0.01),
 (145226816.21644714, 300, 0.1),
 (145220050.82917964, 300, 1),
 (145155268.00705808, 300, 10),
 (144615886.31820154, 300, 100),
 (14148