In [None]:
import numpy as np
import pandas as pd

from utils import load_dataset, home

# Directory holding the interim data files, resolved relative to `home`.
interim_dir = home / 'data' / 'interim'

# The loaded dataset is indexed by name; its 'combined' entry is the frame the
# following cells work on.
raw = load_dataset(interim_dir)
comb = raw['combined']

comb.head(2)

In [None]:
from sklearn.model_selection import TimeSeriesSplit

split = TimeSeriesSplit(n_splits=2)

# Slice from the loaded dataset rather than from `comb` itself, so re-running
# this cell does not trim additional rows each time.
# NOTE(review): -1 drops the final row; use None to keep every row - confirm intent.
subsample = -1
comb = raw['combined'].iloc[:subsample, :]

# The splitter yields its folds in order; only the last one is kept as the
# train/test partition (train is everything before the final test window).
*_, (tr_idx, te_idx) = split.split(comb)
tr = comb.iloc[tr_idx, :]
te = comb.iloc[te_idx, :]

# Same columns in both partitions, and together they cover every row of `comb`.
assert tr.shape[1] == te.shape[1]
assert tr.shape[0] + te.shape[0] == comb.shape[0]

In [None]:
import spacy

# Load the spaCy 'en_core_web_sm' pipeline once; clean_sample and find_entities
# both call this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')

def clean_sample(sample):
    """Normalise one text sample into a space-joined string of lemmas.

    The text is lowercased, every '/n' and apostrophe is removed, and the
    result is parsed with the shared ``nlp`` pipeline. Lemmas of tokens that
    are stop words, punctuation or whitespace are left out.
    """
    # NOTE(review): '/n' is removed literally; if a newline escape was meant,
    # confirm against the raw data before changing it.
    text = sample.lower().replace('/n', '').replace("\'", '')

    kept = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        kept.append(tok.lemma_)
    return ' '.join(kept)

In [None]:
def train_pipe(combined, target_col='final-label'):
    """Build the text corpus and target frames from a combined dataset.

    Every column other than ``target_col`` is treated as text: the values of a
    row are joined with single spaces and the result is passed through
    ``clean_sample``.

    Parameters
    ----------
    combined : pandas.DataFrame
        Frame holding the text columns plus the label column.
        (Assumes every non-target column contains strings - TODO confirm.)
    target_col : str, default 'final-label'
        Name of the label column.

    Returns
    -------
    corpus : pandas.DataFrame
        Single column ``'news'`` with the cleaned text, indexed like ``combined``.
    target : pandas.DataFrame
        Single column ``'target'`` with the labels, indexed like ``combined``.
    """
    # Select the text columns by name rather than by position, so the label
    # cannot end up inside the corpus if the column order ever changes.
    text_cols = combined.drop(columns=target_col)
    corpus = text_cols.agg(' '.join, axis=1).apply(clean_sample).to_frame(name='news')
    target = combined.loc[:, target_col].to_frame(name='target')
    return corpus, target

# Run the same text/target preparation on both partitions (train first).
(x_tr, y_tr), (x_te, y_te) = train_pipe(tr), train_pipe(te)

In [None]:
# Mean of the training targets (presumably the positive-class share if the
# labels are 0/1 - confirm).
np.mean(y_tr)

In [None]:
# Mean of the test targets, for comparison with the training split
# (presumably the positive-class share if the labels are 0/1 - confirm).
np.mean(y_te)

## tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): this instance is not passed to any fit(...) call visible in
# this notebook - each call builds its own TfidfVectorizer().
vec = TfidfVectorizer()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

def fit(mdl, x_tr, y_tr, x_te, y_te, vec=None):
    """Fit ``mdl`` on the training split and report train/test scores.

    Parameters
    ----------
    mdl : estimator
        Object exposing ``fit``, ``score`` and ``predict``.
    x_tr, x_te : feature containers
        Passed to ``mdl`` as-is unless ``vec`` is given, in which case they
        must be frames with a ``'news'`` text column.
    y_tr, y_te : array-like
        Targets; DataFrames, Series, arrays and lists are flattened to 1-D.
    vec : vectorizer or None, default None
        Optional object exposing ``fit_transform`` / ``transform``. It is
        fitted on the training text only and then applied to the test text.

    Returns
    -------
    tuple
        ``(mdl, res)``: the fitted model and a dict with the keys
        ``'tr-score'``, ``'te-score'`` and ``'avg-te-pred'``. Each entry is
        also printed.
    """
    # Compare against None explicitly: an object that defines __len__ or
    # __bool__ could be falsy and would otherwise be silently skipped.
    if vec is not None:
        x_tr = vec.fit_transform(x_tr.loc[:, 'news'])
        x_te = vec.transform(x_te.loc[:, 'news'])

    # np.asarray accepts pandas objects as well as plain arrays and lists.
    y_tr = np.asarray(y_tr).ravel()
    y_te = np.asarray(y_te).ravel()

    mdl.fit(x_tr, y_tr)

    res = {
        'tr-score': mdl.score(x_tr, y_tr),
        'te-score': mdl.score(x_te, y_te),
        'avg-te-pred': np.mean(mdl.predict(x_te))
    }

    for k, v in res.items():
        print(k, v)

    return mdl, res

# Random forest on TF-IDF features. `fit` returns a (model, results) pair, so
# `rf` holds that tuple.
tfidf_forest = RandomForestClassifier(n_estimators=500, max_features=5)
rf = fit(tfidf_forest, x_tr, y_tr, x_te, y_te, vec=TfidfVectorizer())

In [None]:
# Logistic regression on TF-IDF features.
# NOTE(review): the result overwrites `rf` even though the model here is a
# logistic regression.
tfidf_logreg = LogisticRegression(C=5)
rf = fit(tfidf_logreg, x_tr, y_tr, x_te, y_te, vec=TfidfVectorizer())

## doc2vec

In [None]:
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords

def clean_strings(docs):
    """Tokenise each document after removing gensim's stop words.

    Returns one list of lowercased tokens per input document, in input order.
    """
    return [list(tokenize(remove_stopwords(text), lower=True)) for text in docs]
        
# Token lists for the train and test text, in that order.
tr_tokens, te_tokens = (
    clean_strings(frame.loc[:, 'news'].values) for frame in (x_tr, x_te)
)

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tag every training document with its position in `tr_tokens`.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tr_tokens)]
# `verbose` is not a documented Doc2Vec parameter and newer gensim releases
# reject unknown keyword arguments, so it is not passed here. Configure the
# standard `logging` module if training progress output is wanted.
model = Doc2Vec(documents, vector_size=32, window=3, min_count=1, workers=4)

def get_doc_vecs(docs, model):
    """Infer one vector per tokenised document and stack them into an array.

    ``model`` must expose ``infer_vector``; it is called once per entry of
    ``docs``, in order.
    """
    return np.array([model.infer_vector(tokens) for tokens in docs])

In [None]:
# Inferred document vectors for the train and test tokens, in that order.
tr_vecs, te_vecs = (
    get_doc_vecs(tokens, model) for tokens in (tr_tokens, te_tokens)
)

In [None]:
# Depth-limited random forest on the inferred document vectors.
doc2vec_forest = RandomForestClassifier(n_estimators=1000, max_features=5, max_depth=5)
rf = fit(doc2vec_forest, tr_vecs, y_tr, te_vecs, y_te)

## time series

In [None]:
# One-hot encode the weekday (0 = Monday ... 6 = Sunday) of each sample's index.
# NOTE(review): assumes the frame index holds values pd.to_datetime can parse
# as dates - confirm.
tr_dates = pd.get_dummies(pd.to_datetime(x_tr.index).dayofweek)
te_dates = pd.get_dummies(pd.to_datetime(x_te.index).dayofweek)

# get_dummies only creates columns for the weekdays present in its input, so
# the two frames can disagree. Align the test columns to the training columns:
# weekdays unseen in training are dropped, weekdays missing from test become 0.
te_dates = te_dates.reindex(columns=tr_dates.columns, fill_value=0)

weekday_forest = RandomForestClassifier(n_estimators=1000, max_features=5)
rf = fit(weekday_forest, tr_dates, y_tr, te_dates, y_te)

## Sentiment & subjectivity

In [None]:
from textblob import TextBlob

def sentiment(row):
    """Return the TextBlob polarity score of the text ``row``."""
    return TextBlob(row).sentiment.polarity

def subjectivity(row):
    """Return the TextBlob subjectivity score of the text ``row``."""
    return TextBlob(row).sentiment.subjectivity

def add_sum(df):
    """Return a copy of ``df`` with ``'sent'`` and ``'subj'`` columns added.

    ``'sent'`` holds ``sentiment`` and ``'subj'`` holds ``subjectivity`` of the
    ``'news'`` column. The input frame is left untouched, so calling this on
    the shared feature frames does not change what other cells see.
    """
    news = df.loc[:, 'news']
    return df.assign(sent=news.apply(sentiment), subj=news.apply(subjectivity))

# Model inputs are the added score columns only; the raw text is dropped.
x_s_tr = add_sum(x_tr).drop(columns='news')
x_s_te = add_sum(x_te).drop(columns='news')

sentiment_forest = RandomForestClassifier(n_estimators=500, max_features=None, max_depth=3)
rf = fit(sentiment_forest, x_s_tr, y_tr, x_s_te, y_te)

## entity extraction 

In [None]:
def find_entities(sample):
    """Collect the proper-noun tokens of ``sample`` as a 1 x N array.

    A token is kept when the shared ``nlp`` pipeline gives it the coarse tag
    ``PROPN`` and the fine-grained tag ``NNP``.
    """
    proper_nouns = [
        tok.text
        for tok in nlp(sample)
        if tok.pos_ == 'PROPN' and tok.tag_ == 'NNP'
    ]
    return np.array(proper_nouns).reshape(1, -1)

def generate_ents(df):
    """Run ``find_entities`` on the ``'news'`` value of every row of ``df``.

    Returns a list with one entry per row, in row order.
    """
    n_rows = df.shape[0]
    tokens = [find_entities(df.iloc[pos, :].loc['news']) for pos in range(n_rows)]

    assert len(tokens) == n_rows
    return tokens

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def extract_ents_as_str(ents):
    """Turn each entity array into one space-separated string.

    Every entry of ``ents`` is flattened and its items joined with single
    spaces; an empty array yields an empty string.
    """
    return [" ".join(arr.flatten().tolist()) for arr in ents]

# One space-separated string of proper nouns per document.
tr_ent_text = extract_ents_as_str(generate_ents(x_tr))
te_ent_text = extract_ents_as_str(generate_ents(x_te))

# Token counts; the vocabulary is learned from the training split only and
# then applied to the test split.
enc = CountVectorizer()
tr_ents = enc.fit_transform(tr_ent_text)
te_ents = enc.transform(te_ent_text)

In [None]:
# Depth-limited random forest on the entity count features.
entity_forest = RandomForestClassifier(n_estimators=500, max_features=None, max_depth=4)
rf = fit(entity_forest, tr_ents, y_tr, te_ents, y_te)