In [1]:
import numpy as np
import pandas as pd
from os import getcwd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pickle
import warnings
import multiprocessing 


In [2]:
def set_path():
    """Switch the working directory to the parent of the current one.

    Note: every call moves one level further up, so the function is not
    idempotent; calling it twice ends up two directories above the start.
    """
    os.chdir(os.path.dirname(getcwd()))



In [3]:
# Vectorize content (abstract + title) by *authorId*
def model_nlp(df, authId):
    """Fit a TF-IDF vectorizer on the texts of a single author.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'authId_enc' and 'content'.
    authId :
        Value of 'authId_enc' selecting the author's rows.

    Returns
    -------
    tuple
        ``(matrix, vectorizer)``: the result of ``fit_transform`` on the
        author's 'content' values, and the fitted ``TfidfVectorizer``.
    """
    # Keep only the rows written by the requested author.
    is_author = df['authId_enc'] == authId
    documents = df.loc[is_author, 'content'].tolist()

    # Learn the vocabulary/IDF weights from this author's documents only.
    tfidf = TfidfVectorizer(input='content')
    doc_term_matrix = tfidf.fit_transform(documents)
    return doc_term_matrix, tfidf

# get estimates from categorical variables (year and venue) for each author by gradient boosting


def model_place(df, authId, max_depth = 30, random_state = None):
    """Fit a per-author gradient-boosting classifier on year and venue.

    The features are the 'year' and 'venues_le' columns of ``df``; the
    binary target is True for rows whose 'authId_enc' equals ``authId``
    and False otherwise (one-vs-rest for this author).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'year', 'venues_le' and 'authId_enc'.
        ('venues_le' is presumably a label-encoded venue -- confirm
        against the preprocessing step.)
    authId :
        Value of 'authId_enc' identifying the positive class.
    max_depth : int, default 30
        Maximum depth of each individual tree in the ensemble.
    random_state : int or None, default None
        Seed forwarded to ``GradientBoostingClassifier`` so that fits can
        be reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    GradientBoostingClassifier
        The fitted classifier (not probabilities; call ``predict_proba``
        on it to obtain the per-(year, venue) estimates).
    """
    cats = df[['year', 'venues_le']]
    target = df['authId_enc'] == authId
    clf = GradientBoostingClassifier(max_depth=max_depth,
                                     random_state=random_state)
    # fit() returns the estimator itself.
    return clf.fit(cats, target)





In [4]:
def author_enc(df, authId):
    """Build both models for one author and pickle them to disk.

    Fits the TF-IDF model (``model_nlp``) and the year/venue classifier
    (``model_place``) for ``authId`` and writes them to
    ``models_TF/<authId>.pkl`` (relative to the current working
    directory) as ``[(tfidf_matrix, vectorizer), classifier]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data passed through to ``model_nlp`` / ``model_place``.
    authId :
        Value of 'authId_enc' identifying the author; also used as the
        output file name.

    Returns
    -------
    None
    """
    nlp, vectorizer = model_nlp(df, authId)
    place = model_place(df, authId)

    merged = [(nlp, vectorizer), place]

    # Make sure the output directory exists; otherwise open() fails with
    # FileNotFoundError on a fresh checkout.
    os.makedirs("models_TF", exist_ok=True)

    # write authorId encoded into file:
    with open(f"models_TF/{authId}.pkl", 'wb') as f:
        pickle.dump(merged, f)


def encoding_all():
    """Fit and persist the per-author models for every author in the training set.

    Side effects: changes the working directory via ``set_path()``, reads
    'data/processed/train_clean_df.pkl' relative to the new directory, and
    calls ``author_enc`` once per distinct 'authId_enc' value.
    """
    set_path()
    train_df = pd.read_pickle('data/processed/train_clean_df.pkl')
    # One model bundle per distinct author id, in order of first appearance.
    for author_id in train_df['authId_enc'].unique():
        author_enc(train_df, author_id)

# Run the whole encoding pipeline when this cell executes.
# NOTE(review): encoding_all() calls set_path(), which moves the working
# directory one level up on every call, so re-running this cell in the same
# kernel resolves the relative data/model paths from a different directory.
encoding_all()
