In [None]:
import os
import sys
# Put the parent directory on the import path, skipping the append when it is already
# listed (presumably so the `fantasy_nlg` package imported further down resolves).
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

In [None]:
from collections import Counter, namedtuple
from itertools import chain
import pickle
import re
from string import Template
import ftfy

import numpy as np
import pandas as pd
import spacy

In [None]:
from fantasy_nlg.data_utils import create_news_stats_dataset, create_inverted_news_dict, get_teams
from fantasy_nlg.spacy_utils import load_spacy_model
from fantasy_nlg.generate_templates import GenerateTemplates, get_context, get_context_tags, text_normalization

In [None]:
# Build the NLP object through the project helper, handing it the team-alias file and the
# player-news CSV. NOTE(review): presumably a spaCy pipeline with custom entity handling;
# how each file is used is defined in fantasy_nlg.spacy_utils, not visible here.
nlp = load_spacy_model('../../data/teams_aliases.txt', '../../data/player_news.csv')

In [None]:
# The combined news + stats table is read from a pre-built CSV. The commented-out call
# below presumably regenerates that CSV from the raw news and stats files (TODO confirm).
#news_stats_df = create_news_stats_dataset('../../data/player_news.csv', '../../data/football_db_player_stats.csv',
#                                          '../../data/news_and_stats.csv')
news_stats_df = pd.read_csv('../../data/news_and_stats.csv')

# Template generator created without a vectorizer or classifier attached.
template_generator = GenerateTemplates(nlp, '../../data/teams_aliases.txt', vectorizer=None, clf=None)

# Create template disambiguation test data
Perform exact matching on week 13 news. Manually replace temp_var_* tags with the correct tag.  

Remove all news entries that are not game performance summaries or that contain information we couldn't know.  
* ex: injury updates  

If a token should be tagged but isn't, tag it manually.  
* ex: NER incorrectly parses team name  
* ex: data says rec_yards=4, but text says "3 yard reception"  

In [None]:
# Reload the full news + stats table and keep only week 13 as the hand-labelled test set.
news_stats_df = pd.read_csv('../../data/news_and_stats.csv')
is_test_week = news_stats_df['week'] == 13
test_data = news_stats_df[is_test_week]

In [None]:
# Run the generator over the week-13 rows with an intermediate-templates CSV path; the
# return value is discarded (`_`). NOTE(review): presumably the CSV is written by this
# call and then hand-edited into intermediate_templates_test_john.csv. Confirm.
_ = template_generator.create_training_data(test_data, '../../data/intermediate_templates.csv')

In [None]:
# Sanity check on the hand-edited templates: has every temp_var_* tag been replaced,
# and are there any misspelled tags? Every offending tag is printed with its row.
test_df = pd.read_csv('../../data/intermediate_templates_test_john.csv')

for row, temp_str in enumerate(test_df['templates']):
    template = Template(temp_str)
    # Group index 2 of string.Template's pattern is the braced form, i.e. the name in ${name}.
    braced_names = (match[2] for match in template.pattern.findall(template.template))
    unknown_tags = [name for name in braced_names if name not in template_generator.data_cols]
    for tag in unknown_tags:
        print('Row {}, tag {}, full text "{}"'.format(row, tag, template.template))

In [None]:
# Attach the stats columns to each hand-labelled template by joining on the report text.
# A right join keeps every row of the labelled file.
test_df = pd.read_csv('../../data/intermediate_templates_test_john.csv')
test_news_stats_df = news_stats_df.merge(test_df, how='right', on='report')
test_news_stats_df.to_csv('../../data/template_test_data.csv', index=False)

## Performance measure
Levenshtein distance between templates split on slot/non-slot boundaries

In [None]:
def levenshtein_dist(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    Works on any pair of sized, iterable sequences whose items support ``==``
    (strings, lists of tokens, ...). Insertions, deletions and substitutions
    each cost 1.

    Args:
        s1: First sequence.
        s2: Second sequence.

    Returns:
        int: Minimum number of single-item edits turning one sequence into the other.
    """
    # Use the shorter sequence for the row dimension so each DP row stays small.
    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)

    # previous_row[k] = distance between the first k items of `shorter` and the
    # prefix of `longer` processed so far (initially the empty prefix).
    previous_row = list(range(len(shorter) + 1))
    for row_idx, long_item in enumerate(longer, start=1):
        current_row = [row_idx]
        for col_idx, short_item in enumerate(shorter):
            if short_item == long_item:
                # Items match: carry the diagonal value over at no cost.
                cost = previous_row[col_idx]
            else:
                # Cheapest of substitution, deletion and insertion, plus one edit.
                cost = 1 + min(previous_row[col_idx], previous_row[col_idx + 1], current_row[-1])
            current_row.append(cost)
        previous_row = current_row
    return previous_row[-1]

In [None]:
def performance_measure(predictions, actuals):
    """Print and return the mean template edit distance between predictions and actuals.

    Each template string is split on its ``${slot}`` placeholders (the slot names are
    kept as separate items because the pattern has a capture group), and the two
    resulting item lists are compared with ``levenshtein_dist``.

    Pairs are formed with ``zip``, so if the two inputs differ in length the extra
    items of the longer one are ignored.

    Args:
        predictions: Iterable of predicted template strings.
        actuals: Iterable of reference template strings, aligned with ``predictions``.

    Returns:
        float: Average distance per pair, or NaN when there are no pairs to compare
        (previously this case raised ZeroDivisionError).
    """
    template_split = re.compile(r'\$\{([_a-z][_a-z0-9]*)\}')
    dist_sum = 0
    count = 0
    for predicted, actual in zip(predictions, actuals):
        count += 1
        dist_sum += levenshtein_dist(
            template_split.split(predicted),
            template_split.split(actual)
        )

    # The mean of zero pairs is undefined; report NaN rather than dividing by zero.
    avg_dist = dist_sum / count if count else float('nan')

    print('Total:', dist_sum)
    print('Avg:', avg_dist)
    return avg_dist

# Example:    performance_measure(other_df['templates'], test_news_stats_df['templates'])

## Baseline
Exact matching; when a token matches more than one tag, choose one at random.

In [None]:
import random

In [None]:
def baseline_predictor(token, tag_choice):
    """Pick one of the candidate tags uniformly at random.

    Args:
        token: The token being disambiguated (ignored by this baseline).
        tag_choice: Non-empty sequence of candidate tags.

    Returns:
        One element of ``tag_choice`` drawn with ``random.choice``.
    """
    picked_tag = random.choice(tag_choice)
    return picked_tag

# A separate generator for the baseline, created without a vectorizer or classifier;
# its prediction function is replaced directly with the random picker defined above.
baseline_generator = GenerateTemplates(nlp, '../../data/teams_aliases.txt', vectorizer=None, clf=None)
baseline_generator.prediction_func = baseline_predictor

In [None]:
# Split the hand-labelled test set into the gold templates and the remaining input
# columns that are passed to the generator.
test_news_stats_df = pd.read_csv('../../data/template_test_data.csv')
actual_templates = test_news_stats_df['templates']
test_news_stats_df = test_news_stats_df.drop(columns='templates')

In [None]:
# Seed the global `random` generator that baseline_predictor draws from, so the random
# baseline score is reproducible when this cell (or the whole notebook) is re-run.
BASELINE_SEED = 0
random.seed(BASELINE_SEED)

# Generate baseline templates for the test rows, then score the saved templates
# against the hand-labelled ones. The returned average is the cell's displayed output.
baseline_generator.template_transformer(test_news_stats_df, '../../data/baseline_templates.csv')
performance_measure(
    pd.read_csv('../../data/baseline_templates.csv')['templates'],
    actual_templates
)

# Create training data

In [None]:
# Reload the full table and keep only the weeks before the held-out week 13 for training.
news_stats_df = pd.read_csv('../../data/news_and_stats.csv')
is_training_week = news_stats_df['week'] < 13
news_stats_df = news_stats_df[is_training_week]

In [None]:
# Print the (rows, columns) shape of the training frame, then display its first two rows.
print(news_stats_df.shape)
news_stats_df.head(2)

In [None]:
# Build the per-token training samples from the training weeks.
# NOTE(review): this passes the same CSV path as the earlier week-13 run, so it
# presumably overwrites that file. Confirm this is intended.
token_training_set = template_generator.create_training_data(news_stats_df, '../../data/intermediate_templates.csv')

In [None]:
# Three parallel lists, one entry per training sample:
#   context_ngrams: the sample's non-empty context n-grams, each joined into one string
#   context_tags:   the sample's context tags with None entries dropped
#   true_tags:      the sample's last field, used as the label
context_ngrams = []
context_tags = []
true_tags = []

# A single pass over the samples, in case token_training_set cannot be iterated twice.
for sample in token_training_set:
    context_ngrams.append([' '.join(bigram) for bigram in sample[1] if len(bigram) != 0])
    context_tags.append([tag for tag in sample[2] if tag is not None])
    true_tags.append(sample[-1])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def noop(d):
    """Return the argument unchanged.

    Passed to the vectorizers below as both tokenizer and analyzer, since the
    documents handed to them are already lists of features.
    """
    return d

# Both vectorizers share one configuration: no lowercasing, and noop() as tokenizer and
# analyzer so each pre-built feature list is counted as-is.
vectorizer_kwargs = dict(lowercase=False, tokenizer=noop, analyzer=noop)
ngram_vectorizer = CountVectorizer(**vectorizer_kwargs)
tag_vectorizer = CountVectorizer(**vectorizer_kwargs)

In [None]:
# Fit one vocabulary per feature family and turn the feature lists into count matrices.
X_ngrams = ngram_vectorizer.fit_transform(context_ngrams)
X_tags = tag_vectorizer.fit_transform(context_tags)

# Encode each gold tag through the generator's tag -> index lookup.
tag_to_idx = template_generator.data_col_idx
y_tags = np.array([tag_to_idx[tag] for tag in true_tags])
print(X_ngrams.shape, X_tags.shape, y_tags.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train one Naive Bayes model per feature family and print its accuracy on the same
# data it was fitted on (training accuracy, not a held-out score).
ngram_clf = MultinomialNB(alpha=0.5)
tag_clf = MultinomialNB(alpha=0.5)

for clf, features in ((ngram_clf, X_ngrams), (tag_clf, X_tags)):
    clf.fit(features, y_tags)
    # After the loop, `predictions` holds the output of the last model (tag_clf).
    predictions = clf.predict(features)
    print('Acc:', np.sum(predictions == y_tags) / len(y_tags))

In [None]:
# Confusion table of actual vs. predicted tag names on the training data.
# NOTE(review): `predictions` is whatever was assigned last in the training cell, i.e.
# the context-tag classifier's output rather than the n-gram classifier's. Confirm that
# this is the model meant to be inspected here.
idx_to_tag = template_generator.idx_data_col
y_true = pd.Series([idx_to_tag[label] for label in y_tags], name='Actual')
y_pred = pd.Series([idx_to_tag[label] for label in predictions], name='Predicted')
pd.crosstab(y_true, y_pred)

In [None]:
# Hand the fitted n-gram vectorizer and classifier to the generator.
# NOTE(review): presumably this installs a prediction function used to resolve
# ambiguous tags. Confirm in fantasy_nlg.generate_templates.
template_generator.create_prediction_func(ngram_vectorizer, ngram_clf)

In [None]:
# Generate templates for the training weeks with the model-backed generator, passing an
# output CSV path; the return value is discarded (`_`) to keep the cell output empty.
_ = template_generator.template_transformer(news_stats_df, '../../data/nbmodel_templates.csv')

In [None]:
# Persist the n-gram vectorizer and classifier together as one pickled namedtuple.
TemplateModel = namedtuple('TemplateModel', ['vectorizer', 'classifier'])
model_bundle = TemplateModel(vectorizer=ngram_vectorizer, classifier=ngram_clf)

# The vectorizer references noop() (its tokenizer/analyzer arguments), so a noop()
# function must also be defined wherever this file is unpickled.
with open('../../models/ngram_nb.pkl', 'wb') as model_file:
    pickle.dump(model_bundle, model_file)

### Test data

In [None]:
# Re-declare the names the pickle refers to, so this cell also works without running the
# training cells: the namedtuple class and the noop() used by the pickled vectorizer.
TemplateModel = namedtuple('TemplateModel', ['vectorizer', 'classifier'])
def noop(d):
    """Return the argument unchanged; must exist under this name for the vectorizer to unpickle."""
    return d

# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
with open('../../models/ngram_nb.pkl', 'rb') as f:
    # Bind the loaded instance to its own name. Assigning it to `TemplateModel` would
    # replace the namedtuple class with an instance for the rest of the session.
    loaded_model = pickle.load(f)

template_generator.create_prediction_func(loaded_model.vectorizer, loaded_model.classifier)

In [None]:
# Reload the hand-labelled test set and split it into the gold templates and the
# remaining input columns that are passed to the generator.
test_news_stats_df = pd.read_csv('../../data/template_test_data.csv')
actual_templates = test_news_stats_df['templates']
test_news_stats_df = test_news_stats_df.drop(columns='templates')

In [None]:
# Generate templates for the test rows with the loaded model; no output path is passed
# here, and the returned templates are kept for scoring.
out_templates = template_generator.template_transformer(test_news_stats_df)

In [None]:
# Score the model's templates against the hand-labelled ones. The commented-out line is
# an alternative source for the predictions: a templates CSV read back from disk.
performance_measure(
    #pd.read_csv('../../data/output_templates.csv')['templates'],
    out_templates,
    actual_templates
)

# Analysis

In [None]:
# For each position, count how often each distinct set of ${...} placeholders occurs
# across the generated templates.
temp_df = pd.read_csv('../../data/output_templates.csv')
placeholders_qb = Counter()
placeholders_rb = Counter()
placeholders_wr = Counter()
placeholders_te = Counter()

# Rows whose position is not one of these four are ignored.
counters_by_position = {
    'QB': placeholders_qb,
    'RB': placeholders_rb,
    'WR': placeholders_wr,
    'TE': placeholders_te,
}

for row in temp_df.itertuples():
    # Group index 2 of string.Template's pattern is the braced form, i.e. the name in ${name}.
    matches = Template(row.templates).pattern.findall(row.templates)
    # De-duplicated, sorted and stringified so equal sets share one Counter key.
    placeholders_used = str(sorted({match[2] for match in matches}))

    position_counter = counters_by_position.get(row.player_position)
    if position_counter is not None:
        position_counter[placeholders_used] += 1

### Random baseline (at tag level)

In [None]:
# Tag-level statistics over the documents in news_stats_df: how many matched tokens are
# ambiguous, and the expected accuracy of picking a tag uniformly at random.
random_correct = 0
num_tag_ambig = 0
num_tag_unambig = 0
num_docs = 0
num_doc_ambig = 0
num_doc_unambig = 0
tag_conflicts = Counter()
unmatched_num = 0

for idx, doc, news_dict, inverted_news_dict in template_generator.doc_preprocessing(news_stats_df):
    num_docs += 1
    ambig_doc = False
    unambig_doc = False

    for token in doc:
        if token.text not in inverted_news_dict:
            # Token has no entry in the lookup; only count it if it is tagged as a number.
            if token.pos_ == 'NUM':
                unmatched_num += 1
            continue

        candidates = inverted_news_dict[token.text]
        if type(candidates) is list:
            # A list entry means several candidates: credit 1/len(candidates), the chance
            # of a uniform random pick hitting one particular candidate.
            random_correct += 1 / len(candidates)
            num_tag_ambig += 1
            ambig_doc = True
            tag_conflicts[token.text] += 1
        else:
            # Any non-list entry is treated as a single, unambiguous match.
            random_correct += 1
            num_tag_unambig += 1
            unambig_doc = True

    # A document can count as both ambiguous and unambiguous if it has both kinds of token.
    num_doc_ambig += int(ambig_doc)
    num_doc_unambig += int(unambig_doc)

total_tags = num_tag_ambig + num_tag_unambig
print('Total tags: {}, Ambiguous tags: {}, Random accuracy: {}'.format(
    total_tags, num_tag_ambig, random_correct / total_tags
))
print('Total docs: {}, Docs with ambiguous tags: {}, Percent with ambiguous tags: {}'.format(
    num_docs, num_doc_ambig, num_doc_ambig / num_docs
))