In [None]:
import os
import sys
# Put the directory two levels above the working directory on the import
# path (appended once only); the notebook later imports from `src`.
module_path = os.path.abspath('../..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [None]:
from collections import Counter
from itertools import chain
import re
from string import Template
import ftfy

import numpy as np
import pandas as pd
import spacy

In [None]:
from src.data_utils import create_news_stats_dataset, create_inverted_news_dict, get_teams
from src.spacy_utils import load_spacy_model
from src.generate_templates import GenerateTemplates, get_context, get_context_tags

In [None]:
# Create the shared `nlp` object through the project helper, handing it the
# team-alias file and the raw player-news CSV.
# NOTE(review): presumably the helper uses both files to teach the pipeline
# team and player names -- confirm in src.spacy_utils.
nlp = load_spacy_model('../../data/teams_aliases.txt', '../../data/player_news.csv')

In [None]:
# The merged news/stats table is read from a CSV that already exists on disk.
# The commented-out call below is kept for reference; its third argument is
# the same path that is read here, so it presumably regenerates that file
# from the two raw inputs -- TODO confirm before re-enabling.
#news_stats_df = create_news_stats_dataset('../../data/player_news.csv', '../../data/football_db_player_stats.csv',
#                                          '../../data/news_and_stats.csv')
news_stats_df = pd.read_csv('../../data/news_and_stats.csv')

# No vectorizer/classifier is available yet, so both are passed as None; a
# fitted pair is handed to the generator later via create_prediction_func.
template_generator = GenerateTemplates(nlp, '../../data/teams_aliases.txt', vectorizer=None, clf=None)

In [None]:
# Show the first two rows as a quick sanity check of the loaded columns.
news_stats_df.head(2)

In [None]:
# Build the per-token training samples from the news/stats table.
# Downstream code indexes each sample as: [1] context bigrams,
# [2] context tags, [-1] the true tag.
# NOTE(review): the CSV path is presumably where intermediate templates are
# written -- confirm in GenerateTemplates.create_training_data.
token_training_set = template_generator.create_training_data(news_stats_df, '../../data/intermediate_templates.csv')

In [None]:
# Three parallel lists, one entry per training sample.
context_ngrams, context_tags, true_tags = [], [], []

for sample in token_training_set:
    # Context bigrams: drop empty ones and join the rest into a single
    # space-separated string each.
    context_ngrams.append(
        [' '.join(gram) for gram in sample[1] if len(gram) != 0]
    )

    # Context tags: keep only the positions that actually carry a tag.
    context_tags.append(
        [ctx_tag for ctx_tag in sample[2] if ctx_tag is not None]
    )

    # The label to predict is stored as the last element of the sample.
    true_tags.append(sample[-1])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


def identity_analyzer(tokens):
    """Return ``tokens`` unchanged.

    Every document handed to the vectorizers is already a list of features
    (bigram strings or tag names), so no further analysis is wanted.
    """
    return tokens


# A callable `analyzer` replaces the whole preprocessing/tokenizing pipeline,
# so `tokenizer` and `lowercase` have no effect alongside it (recent
# scikit-learn versions warn about the unused `tokenizer`). Only `analyzer`
# is passed. Unlike a lambda, the named function does not get in the way of
# pickling the fitted vectorizers.
ngram_vectorizer = CountVectorizer(analyzer=identity_analyzer)
tag_vectorizer = CountVectorizer(analyzer=identity_analyzer)

In [None]:
# Learn each vocabulary and build the two count matrices in one step.
X_ngrams = ngram_vectorizer.fit_transform(context_ngrams)
X_tags = tag_vectorizer.fit_transform(context_tags)

# Encode the target column names as the integer ids used by the generator.
col_to_idx = template_generator.data_col_idx
y_tags = np.array([col_to_idx[label] for label in true_tags])

print(X_ngrams.shape, X_tags.shape, y_tags.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Both accuracies below are computed on the same rows the models were
# fitted on, i.e. they are training accuracies.

# Model 1: Naive Bayes over the context-bigram counts.
ngram_clf = MultinomialNB(alpha=0.5)
ngram_clf.fit(X_ngrams, y_tags)
predictions = ngram_clf.predict(X_ngrams)
n_correct = np.sum(predictions == y_tags)
print('Acc:', n_correct / len(y_tags))

# Model 2: Naive Bayes over the context-tag counts. `predictions` is
# rebound here, so afterwards it holds this model's output.
tag_clf = MultinomialNB(alpha=0.5)
tag_clf.fit(X_tags, y_tags)
predictions = tag_clf.predict(X_tags)
n_correct = np.sum(predictions == y_tags)
print('Acc:', n_correct / len(y_tags))

In [None]:
# Confusion table with readable column names instead of integer ids.
# Note: `predictions` was last assigned from tag_clf, so this table
# describes the tag-based classifier.
idx_to_col = template_generator.idx_data_col
actual_names = [idx_to_col[label_id] for label_id in y_tags]
predicted_names = [idx_to_col[label_id] for label_id in predictions]

y_true = pd.Series(actual_names, name='Actual')
y_pred = pd.Series(predicted_names, name='Predicted')
pd.crosstab(y_true, y_pred)

In [None]:
# Hand the fitted bigram vectorizer and its classifier to the generator.
# Note that it is the n-gram model, not the tag model, that is registered.
template_generator.create_prediction_func(ngram_vectorizer, ngram_clf)

In [None]:
# Run the template transformation over the full news/stats table.
# NOTE(review): presumably writes its result to the given CSV path, which is
# read back in the next step -- confirm in GenerateTemplates.
template_generator.template_transformer(news_stats_df, '../../data/output_templates.csv')

In [None]:
temp_df = pd.read_csv('../../data/output_templates.csv')

# One Counter per tracked position. Each key is the string form of the
# sorted list of distinct `${...}` placeholders used by a template, and the
# value is the number of templates using exactly that combination.
placeholders_qb = Counter()
placeholders_rb = Counter()
placeholders_wr = Counter()
placeholders_te = Counter()

# Dispatch table; rows for any other position are not counted.
placeholders_by_position = {
    'QB': placeholders_qb,
    'RB': placeholders_rb,
    'WR': placeholders_wr,
    'TE': placeholders_te,
}

for row in temp_df.itertuples():
    position_counter = placeholders_by_position.get(row.player_position)
    # Skip untracked positions, and rows whose template cell is not text
    # (an empty CSV cell is read back as a float NaN, which the regex
    # cannot scan).
    if position_counter is None or not isinstance(row.templates, str):
        continue

    # `Template.pattern` is a class-level compiled regex, so no Template
    # instance is needed. Only the `braced` group (`${name}`) is collected.
    # Matches of the other alternatives (escaped `$$`, unbraced `$name`, or
    # an invalid lone `$` as in "$5") leave that group unset and are
    # filtered out, so they no longer add a spurious '' placeholder.
    braced_names = {
        match.group('braced')
        for match in Template.pattern.finditer(row.templates)
        if match.group('braced')
    }
    position_counter[str(sorted(braced_names))] += 1