## This repository studies the relation between upvotes and strings in the title, i.e., my goal is to predict the number of subreddit Upvotes based on the title of a post with a simple Linear regression model.  

Import Libraries

In [1]:
import pandas as pd
import numpy as np
from dask import dataframe as dd  ## pip install dask 
import time
import os
from dask.distributed import Client
import nltk
from nltk.corpus import stopwords
import re

from PIL import Image, ImageDraw, ImageFont
import textwrap

import glob
import csv
import random

import statistics
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

# Clean up RAM
import gc
gc.collect()

0

In [2]:
#### Get the size of the data
print(os.path.getsize('Eluvio_DS_Challenge.csv')/1024/1024/1024 , "GB")

0.07651892490684986 GB


In [3]:
#### dask load dataset
dask_df = dd.read_csv('Eluvio_DS_Challenge.csv')
dask_df ## check the dataframe structure

Unnamed: 0_level_0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,int64,object,int64,int64,object,bool,object,object
,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...


In [4]:
#### relation between dask and pd, by adding ".compute()"
type(dask_df.partitions[0].compute()) 

pandas.core.frame.DataFrame

In [5]:
#### Check partitions
print(len(dask_df.index))  ## total rows (not including the title) 
##print(dask_df.columns) 
##dask_df.head()  ## print out samples
dask_df.map_partitions(len).compute()

509236


0    394261
1    114975
dtype: int64

In [6]:
#### filter out all rows containing one or more missing values
##df = dask_df.dropna()  
df = dask_df

### Convert titles into images (this step may take a few hours)

In [7]:
#### Hyperparameters for image generation
fontname = "calibri.ttf" #### font family
fontsize = 11   
font = ImageFont.truetype(fontname, fontsize)
Max_W, Max_H = 160, 150

gap = 10000   ## row limits for sub csv file

#### function for saving csv file
def write_csv_feature(data): ## no space between two lines
    with open('dataset.csv', 'a', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(data)

In [8]:
for v in range(1):
    index = random.randint(0, len(df['title'])-1)
    text = df['title'].compute().iloc[index]
    text = " ".join(text.split())  ## remove unnessary blanks
    para = textwrap.wrap(text, width=30)  ## break the long line
    img = Image.new('L', (Max_W, Max_H), "white")  ## modes 'L', 'RGB'
    draw = ImageDraw.Draw(img)
    current_h, pad = 5, 2 ## text starting position
    for line in para:
        w, h = draw.textsize(line, font=font)
        draw.text(((Max_W - w) / 2, current_h), line, fill="black",font=font)
        current_h += h + pad
    img.show() 
    img = img.resize((80,75))
    img.show()

In [9]:
if not os.path.exists('Titles1'):
        os.makedirs('Titles1')
for step in range(len(df.index)//gap+1): 
    filename = './Titles1/dataset'+str(step)+'.csv'
    
    for i in range(gap*step, min(gap*(step+1), len(df.index))): #len(df.index)
        #start = time.time() 
        #if i%1000==0:
        #    print(i)
        text = df['title'].compute().iloc[i]
        text = " ".join(text.split())  ## remove unnessary blanks
        para = textwrap.wrap(text, width=30)  ## break the long line
        img = Image.new('L', (Max_W, Max_H), "white")  ## modes 'L', 'RGB'
        draw = ImageDraw.Draw(img)
        current_h, pad = 5, 2 ## starting position
        for line in para:
            w, h = draw.textsize(line, font=font)
            draw.text(((Max_W - w) / 2, current_h), line, fill="black", font=font)
            current_h += h + pad
        if not os.path.exists('Titles'):
            os.makedirs('Titles')    
        #img.save("./Titles/image_"+str(i)+".png")  ## If you want to save the images

        pixels = np.array(img.resize((80,75))).flatten()  ## reduce size
        #print(pixels)
        write_csv_feature(filename, pixels/255)  ## write to csv file continuously 

        #end = time.time()
        #print('Time: ', end-start, '\n')
    
    if step%20 == 0:    
        print('File {} is generated.'.format(filename))
    ## the dataset file is a huge dataset

File ./Titles1/dataset0.csv is generated.
File ./Titles1/dataset20.csv is generated.
File ./Titles1/dataset40.csv is generated.


### A simple linear regression model using sklearn

Incremental Learning with sklearn: partial_fit()

In [10]:
lr = SGDRegressor()
for step in range(len(df.index)//gap+1):
    #print('step: ', step)
    filename = './Titles1/dataset'+str(step)+'.csv'
    feature = pd.read_csv(filename)
    label = df["up_votes"].compute().iloc[gap*step:min(gap*(step+1), len(df.index))-1]
    X, X_t, y, y_t = train_test_split(feature, label, test_size=0.2, random_state=1)
    test_data = pd.DataFrame.from_records(X_t)
    test_data.to_csv('./Titles1/testing'+str(step)+'.csv', header=False, index=False)
    with open("./Titles1/testing_label"+str(step)+".csv","w") as f:
        wr = csv.writer(f, delimiter="\n")
        wr.writerow(y_t)
    lr.partial_fit(X, y)  ## not overwrite the model's previous parameters
    if step%20 == 0:
        print('Step {} is done!\n'.format(step))

mse_list = []
for step in range(len(df.index)//gap+1):
    X_T = pd.read_csv('./Titles1/testing'+str(step)+'.csv')
    X_T = pd.DataFrame(X_T)
    predictions = lr.predict(X_T)
    #print('predictions: ', predictions[0:10])
    y_T = pd.read_csv("./Titles1/testing_label"+str(step)+".csv")
    y_T = pd.DataFrame(y_T)
    #print('the true upvote: ', y_T[0:10])
    mse = mean_squared_error(predictions, y_T)
    print(mse)
    mse_list.append(mse)

print('Average mse: ', statistics.mean(mse_list))
print('Std mse: ', statistics.stdev(mse_list))

print('std: ', df['up_votes'].std())
print('mean: ', df['up_votes'].mean())

Step 0 is done!

Step 20 is done!

Step 40 is done!

1.3289048032230215e+24
1.3029815089016052e+24
1.313068357408645e+24
1.3168499244503483e+24
1.3182448831993055e+24
1.3345570678429843e+24
1.3261920267136884e+24
1.3087975104619275e+24
1.326595836295427e+24
1.3139430539595557e+24
1.3540806730019007e+24
1.308798087204283e+24
1.3366258287810967e+24
1.3317082306242103e+24
1.3469568669446417e+24
1.3345091519160095e+24
1.316783680819574e+24
1.331051281657633e+24
1.3199601789657164e+24
1.309571789455776e+24
1.3155857766290906e+24
1.3070318388921715e+24
1.3317881863187894e+24
1.3336799708921306e+24
1.33364831493215e+24
1.307638062059872e+24
1.306380386892128e+24
1.315401934551662e+24
1.3346162367092735e+24
1.3155148516387748e+24
1.3354299383592063e+24
1.2903560611586978e+24
1.3245894931966984e+24
1.3101738482423917e+24
1.3067266803372542e+24
1.306589239626316e+24
1.3237621684508654e+24
1.3119279098588633e+24
1.3226586119657984e+24
1.316395345773604e+24
1.3135681522341608e+24
1.30648828829033e

### Conclusion: 

The given Eluvio_DS_Challenge.csv file has a 112+/-542 upvotes. However, our model achieves a bad result. The potential reasons for this failure are:
1. The SGDRegressor Model does not fit our dataset.
2. The model may be underfitted as both training acc and testing acc are low.
3. The training sample's feature dimension is not appropriate. In our experiment, its dimension is 6000 (i.e., 80*75).
4. The title is not the only factor that determines the upvotes.

Further improvement can be made by (1) including the author and submission time, (2) introducing more advance machine learning tools, say deep neural network, to approximate the nonlinear relation between upvotes and titles, (3) convolutional neural network can be implemented to handle the image input. 

### References

1. https://github.com/shachi01/dask_in_python_ml/blob/master/efficient_read_csv.ipynb
2. https://stackoverflow.com/questions/17856242/how-to-convert-a-string-to-an-image 
3. https://towardsdatascience.com/the-art-of-the-upvote-using-nlp-to-predict-upvotes-based-on-headline-458408be3c73