In [1]:
import pandas as pd
import importlib

import sys 
import os

os.chdir("../Feature_Design")
from author_properties_transformers import author_influence
from word2vec_transformers import conceptcluster, tokenizer
import data_preprocessor
os.chdir("../Regression")

from datetime import timezone, datetime
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn import tree
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier, DummyRegressor
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.preprocessing import FunctionTransformer
import seaborn as sns




# Load and clean data:

In [2]:
def load(subreddit, nrows = None):
    """Read the raw dump for one subreddit and reduce it to the modelling columns.

    Parameters
    ----------
    subreddit : str
        Used to build the input path ``../Data/subreddit_<subreddit>/full.csv``.
    nrows : int or None
        Forwarded to ``pd.read_csv``; ``None`` reads the whole file.

    Returns
    -------
    tuple
        ``(df, award_cols, df_messy)``: ``df`` is ``df_messy`` restricted to the
        selected columns, ``award_cols`` is whatever the preprocessor returned as
        its second value, and ``df_messy`` is the full preprocessed frame.
    """
    raw_path = "../Data/subreddit_" + subreddit + "/full.csv"
    raw_posts = pd.read_csv(raw_path, nrows = nrows)

    # Per the original author's note: preprocessing removes weekly threads, deleted
    # rows and non-text posts, and creates more granular time features.
    # TODO: this may be too severe of a cleanup.
    df_messy, award_cols = data_preprocessor.preprocess(raw_posts)

    # Tokenize the text columns for the NLP methods. The call works on df_messy
    # itself; presumably it adds the tokenized_* columns selected below
    # (confirm in word2vec_transformers).
    tokenizer().in_place(X = df_messy, cols = ['title', 'selftext'])

    base_cols = [
        'id', 'author', 'score', 'ups', 'downs', 'gilded', 'upvote_ratio',
        'total_awards_received', 'num_comments',
        'weektime', 'time_of_day',
        'tokenized_title', 'tokenized_selftext',
    ]
    selected = base_cols + award_cols

    return df_messy[selected], award_cols, df_messy


# `df` is rebound on every iteration, so only the last subreddit listed stays loaded.
for subreddit in ["WallStreetBets"]: #, "TraditionalCurses", "WritingPrompts", "TwoSentenceHorror", "Jokes"]:
    # One-off regeneration of the cleaned CSV from the raw dump (kept disabled):
    #df, award_cols, df_messy = load(subreddit, nrows = None)
    #df.to_csv("../Data/cleaned_" + subreddit + ".csv")

    # Read the previously cleaned CSV instead of re-running the preprocessing.
    df = pd.read_csv(f"../Data/cleaned_{subreddit}.csv")

In [12]:
# Recover the award column names from the loaded frame by their "award_" prefix.
award_cols = [name for name in df.columns if name.startswith("award_")]

## Create feature transformers for author features, time features, and nlp features.
For details on the author and NLP transformers, see `../Feature_Design/author_properties_transformers` and `../Feature_Design/word2vec_transformers`.

Briefly, they do the following:

1. Author influence calculates statistics of an author's upvote history from posts in the train set.

2. Conceptcluster trains a word2vec model, clusters the word vectors into concepts, and then counts the number of times each concept appears in the title or the selftext. Various options for the clustering approach are possible.

In [14]:
# One author_influence transformer per aggregation kind; step names are "agg" + kind.
author_stat_features = FeatureUnion(
    [(f"agg{kind}", author_influence(kind = kind)) for kind in ("sum", "mean", "median", "count")]
    + [
        ('aggupvote_ratio', author_influence(kind = 'beta_shrinkage_upvoteratio', prior = 'empirical_bayes')),
        #('aggpowerlaw', author_influence(kind = 'power_law')) # slows things down a lot
    ]
)


In [15]:

# Pass the two time columns through unchanged.
# FunctionTransformer() with no `func` is the identity transform; unlike the previous
# `lambda x : x` it can be pickled, so a fitted pipeline containing it can be saved.
time_features = make_union(
    make_column_transformer(
        (FunctionTransformer(), ["weektime", "time_of_day"]),
    )
)


In [16]:
# Concept-count features from the word2vec clustering transformer.
nlp_features = make_union(
    conceptcluster(verbose = False, roughclustersize = 15),
)

In [17]:


# Combine the three feature groups. Keep this order: make_union names the steps
# automatically, and the inspection cell further down looks up 'featureunion-3'
# to reach the conceptcluster transformer.
all_features = make_union(
    author_stat_features,
    time_features,
    nlp_features,
)

## Run a regression experiment:

Trying to predict upvotes using those features. No luck so far. 

Comments: xgboost overfits, linear regression underfits.
 

In [18]:

def run_experiment(model, X, y):
    """Fit the feature union plus ``model`` on a train split, plot predictions, cross-validate.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline (a regressor in the experiments in this notebook).
    X : pandas.DataFrame
        Input frame handed to the feature transformers.
    y : array-like
        Regression target.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline fitted on the train split. It owns its own copy of the
        feature transformers.
    """
    # Imported here so the function does not depend on an edit to the import cell.
    from sklearn.base import clone

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=1)

    print("working on", model)

    # Clone the shared, module-level feature union. make_pipeline does not copy its
    # steps, so without the clone every returned pipeline would share (and later
    # experiments would refit) the very same transformer objects.
    pipe = make_pipeline(clone(all_features), model, verbose = True)
    # NOTE(review): the original comment here claimed that all_features drops the
    # 'ups' column. That is not visible in this notebook -- confirm, because X may
    # still contain the target.

    pipe.fit(X_train, y_train)

    # Predicted vs. actual, first on the held-out split and then on the train split.
    for split_name, X_split, y_split in (("test", X_test, y_test), ("train", X_train, y_train)):
        ax = sns.scatterplot(x = pipe.predict(X_split), y = y_split)
        ax.set(xlabel = "predicted", ylabel = "actual",
               title = "%s: %s split" % (type(model).__name__, split_name))
        plt.show()

    # cross_val_score clones the pipeline for each fold, so the fitted `pipe`
    # returned below is not modified by this call.
    scores = cross_val_score(pipe, X, y)
    print(scores)

    return pipe

# Disabled regression experiment. It is kept as a bare string literal so it does not
# run; because the string is the last expression in the cell, it is echoed verbatim
# as the cell output.
# NOTE(review): X below still contains the `ups` column that is used as y -- confirm
# the feature transformers do not pass it through to the model before re-enabling.
'''
pipes = []
for model in [LinearRegression(), xgb.XGBRegressor(random_state=2)]:
    X = df.drop(columns=['total_awards_received', 'num_comments'])
    y = df.ups
    pipes.append(run_experiment(model, X, y))
'''

"\npipes = []\nfor model in [LinearRegression(), xgb.XGBRegressor(random_state=2)]:\n    X = df.drop(columns=['total_awards_received', 'num_comments'])\n    y = df.ups\n    pipes.append(run_experiment(model, X, y))\n"

In [19]:
#cc = pipes[0]['featureunion'].get_params()['featureunion-3'].get_params()['conceptcluster']
#cc.corpus_df.prediction.value_counts().head(25)
#cc.corpus_df[ cc.corpus_df.prediction == 45]

In [None]:

def run_experiment_classification(model, X, y):
    """Fit the feature union plus a classifier and compare it with a dummy baseline.

    Parameters
    ----------
    model : estimator
        Classifier used as the final pipeline step.
    X : pandas.DataFrame
        Input frame handed to the feature transformers.
    y : array-like
        Class labels.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline fitted on the train split. It owns its own copy of the
        feature transformers.
    """
    # Imported here so the function does not depend on an edit to the import cell.
    from sklearn.base import clone

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=1)

    print("working on", model)

    # Clone the shared, module-level feature union. make_pipeline does not copy its
    # steps, so without the clone every pipeline collected by the caller would share
    # (and later experiments would refit) the very same transformer objects.
    pipe = make_pipeline(clone(all_features), model, verbose = True)
    # NOTE(review): the original comment here claimed that all_features drops the
    # 'ups' column. That is not visible in this notebook -- confirm which columns
    # of X actually reach the model.

    pipe.fit(X_train, y_train)

    # The held-out split was previously predicted and the result thrown away;
    # report the score on it instead.
    print("Held-out score: %0.3f" % pipe.score(X_test, y_test))

    # Baseline: a classifier that ignores the features.
    clf = DummyClassifier()
    scores = cross_val_score(clf, X, y)
    print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # cross_val_score clones the pipeline per fold; the fitted `pipe` is untouched.
    scores = cross_val_score(pipe, X, y)
    print("Model classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    return pipe

# Predict whether a post received at least one Silver award, once per candidate model.
pipes = []
for model in (tree.DecisionTreeClassifier(), LogisticRegression()): #[LinearRegression(), xgb.XGBRegressor(random_state=2)]:
    # Binary target: True when the post has one or more Silver awards.
    y = df["award_Silver"].gt(0)
    # Remove the award counts (which include the target's source column) and the
    # other two dropped outcome columns from the inputs.
    X = df.drop(columns=['total_awards_received', 'num_comments', *award_cols])
    pipes.append(run_experiment_classification(model, X, y))


working on DecisionTreeClassifier()
