In [None]:
import os
import sys
# Add the parent directory to sys.path (once), presumably so the local
# `fantasy_nlg` package imported further down can be found.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import ast
import pickle
import re
from collections import Counter, namedtuple
from itertools import chain
from string import Template

import ftfy
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
from fantasy_nlg.data_utils import create_news_stats_dataset, create_inverted_news_dict, get_teams
from fantasy_nlg.spacy_utils import load_spacy_model
from fantasy_nlg.generate_templates import GenerateTemplates, get_context, get_context_tags, text_normalization, record_features
from fantasy_nlg.news_nlg import NewsGenerator

In [None]:
# Build the `nlp` callable with the project helper, passing the team-alias file
# and the raw player-news CSV. It is applied to normalized report text later on.
nlp = load_spacy_model('../../data/teams_aliases.txt', '../../data/player_news.csv')

In [None]:
# The commented-out call presumably regenerates news_and_stats.csv from the raw
# inputs — TODO confirm before re-enabling it.
#news_stats_df = create_news_stats_dataset('../../data/player_news.csv', '../../data/football_db_player_stats.csv',
#                                          '../../data/news_and_stats.csv')
# Load the combined news/stats table from disk.
news_stats_df = pd.read_csv('../../data/news_and_stats.csv')
# Keep only the rows whose `week` value is below 13.
news_stats_df = news_stats_df[lambda df: df['week'] < 13]

In [None]:
# Container type with the two fields read from the loaded model below.
TemplateModel = namedtuple('TemplateModel', ['vectorizer', 'classifier'])
def noop(d):
    """Return the argument unchanged."""
    return d

# NOTE(review): `TemplateModel` and `noop` are presumably declared here so that
# pickle can resolve them by name while loading — confirm against the code that
# wrote ngram_nb.pkl.
# NOTE: the load rebinds `TemplateModel` from the namedtuple class to the loaded
# object; later cells read `.vectorizer` / `.classifier` from that object.
# pickle.load can execute arbitrary code, so only load files from a trusted source.
with open('../../models/ngram_nb.pkl', 'rb') as f:
    TemplateModel = pickle.load(f)

In [None]:
# Template generator wired to the `nlp` object, the team-alias file and the
# vectorizer/classifier pair loaded from the pickle above.
template_generator = GenerateTemplates(nlp, '../../data/teams_aliases.txt', vectorizer=TemplateModel.vectorizer, clf=TemplateModel.classifier)

In [None]:
#_ = template_generator.template_transformer(news_stats_df, '../../data/temp_templates.csv')

In [None]:
# Run the template transformer with chunking enabled; only the second returned
# value is kept. It is indexed below as chunk_training_dict[key]['X'] / ['y'].
_, chunk_training_dict = template_generator.template_transformer(news_stats_df, '../../data/temp_templates.csv', chunking=True)

In [None]:
# Remove infrequent template chunks
#
# For the template-bearing keys, keep only samples whose template occurs more
# than twice. For 'game', additionally drop templates that mention a team name.
# All other keys are only stacked into 2-D arrays.
#
# NOTE: this cell replaces the list values of `chunk_training_dict` with numpy
# arrays, so it is meant to run once per `template_transformer` output
# (re-running it would feed unhashable array rows to Counter).
teams = get_teams('../../data/teams_aliases.txt')

for k, v in chunk_training_dict.items():
    if k in ('passing', 'rushing', 'receptions', 'game'):
        # A set gives O(1) membership tests when building the mask below.
        frequent_templates = {temp for temp, cnt in Counter(v['y']).items() if cnt > 2}

        if k == 'game':
            # This team-name filter used to sit in an `elif k == 'game'` branch
            # that could never run, because 'game' was already matched by the
            # first condition.
            # NOTE(review): confirm that excluding team-specific game templates
            # is the intended behaviour.
            large_filter = [
                y in frequent_templates and not any(team in y for team in teams)
                for y in v['y']
            ]
        else:
            large_filter = [y in frequent_templates for y in v['y']]

        # np.vstack is the non-deprecated spelling of np.row_stack.
        y_temp = np.vstack(v['y'])[large_filter]
        X_temp = np.vstack(v['X'])[large_filter]
        chunk_training_dict[k] = {'X': X_temp, 'y': y_temp}
    else:
        chunk_training_dict[k] = {'X': np.vstack(v['X']), 'y': np.vstack(v['y'])}

In [None]:
# Sanity check: print the X and y array shapes for every key.
for k, v in chunk_training_dict.items():
    print(k, v['X'].shape, v['y'].shape)

# Record generation model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import OneHotEncoder

#record_type_enc = OneHotEncoder()
#record_type_enc.fit_transform(record_y)

In [None]:
# Evaluation rows used for the gold chunk sequences and the predicted record sequences.
test_data = pd.read_csv('../../data/template_test_data.csv')

In [None]:
def chunk_gold_standard(df):
    """Build the reference chunk-label sequence for every row of ``df``.

    Each row's ``report`` text is normalized, passed through ``nlp`` and tagged
    by ``template_generator``. The labels of the chunks produced by
    ``template_generator.chunker`` are then wrapped in 'START' / 'END' markers.

    Args:
        df: DataFrame whose rows contain a ``report`` entry plus whatever
            ``create_inverted_news_dict`` reads from the row.

    Returns:
        A list with one label list per row, in row order.
    """
    label_sequences = []
    for _, record in df.iterrows():
        news_dict = record.to_dict()
        inverted_news_dict = create_inverted_news_dict(
            news_dict,
            template_generator.data_cols,
            template_generator.team_id_dict,
            template_generator.id_team_dict,
        )

        doc = nlp(text_normalization(news_dict['report']))
        doc = template_generator.template_tagging(doc, inverted_news_dict, training=False)
        # The returned template is not used here; the call is retained in case
        # it has side effects on `doc`.
        template_generator.doc_to_template(doc)

        labels = [chunk.label_ for chunk in template_generator.chunker(doc)]
        label_sequences.append(['START', *labels, 'END'])

    return label_sequences

# Reference label sequences for the test rows; compared against the predictions below.
gold_chunks = chunk_gold_standard(test_data)

In [None]:
def levenshtein_dist(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    Works on any pair of indexable sequences whose items support ``==``
    (strings, lists of labels, ...). Insertions, deletions and substitutions
    each cost 1.

    Args:
        s1: First sequence.
        s2: Second sequence.

    Returns:
        The minimum number of single-item edits turning one sequence into the other.
    """
    # Keep the shorter sequence on the inner axis so each row is as small as possible.
    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)

    previous = list(range(len(shorter) + 1))
    for row, item_long in enumerate(longer, start=1):
        current = [row]
        for col, item_short in enumerate(shorter):
            if item_short == item_long:
                cost = previous[col]
            else:
                cost = 1 + min(previous[col], previous[col + 1], current[-1])
            current.append(cost)
        previous = current
    return previous[-1]

def performance_measure(predictions, actuals):
    """Print and return the mean edit distance between paired sequences.

    Pairs are formed with ``zip``, so any surplus items in the longer input are
    ignored. The total and the average distance are printed before returning.

    Args:
        predictions: Iterable of predicted sequences.
        actuals: Iterable of reference sequences, aligned with ``predictions``.

    Returns:
        Total distance divided by the number of pairs.

    Raises:
        ZeroDivisionError: If there are no pairs to compare.
    """
    distances = [levenshtein_dist(predicted, actual)
                 for predicted, actual in zip(predictions, actuals)]
    total = sum(distances)

    print('Total:', total)
    average = total / len(distances)
    print('Avg:', average)
    return average

In [None]:
# Multinomial logistic regression whose predictions are appended to the record
# sequence in the roll-out cell below.
# NOTE(review): `multi_class` is reportedly deprecated in recent scikit-learn
# releases — confirm against the installed version.
record_clf = LogisticRegression(C=10.0, max_iter=1000, multi_class='multinomial', solver='lbfgs', random_state=42)

In [None]:
# Fit on the 'record' data and score on the very same data: the two numbers
# printed below are training-set metrics, not held-out performance.
record_clf.fit(chunk_training_dict['record']['X'], chunk_training_dict['record']['y'].ravel())
preds = record_clf.predict(chunk_training_dict['record']['X'])

print(accuracy_score(chunk_training_dict['record']['y'].ravel(), preds))
print(f1_score(chunk_training_dict['record']['y'].ravel(), preds, average='macro'))

In [None]:
# Predict record sequences
# Greedy roll-out per test row: start from 'START', repeatedly predict the next
# label from features of the sequence so far, and stop at 'END' or after 10
# predictions (the `count` guard prevents an endless loop if 'END' never comes).
pred_records = []

for row in test_data[template_generator.data_cols].iterrows():
    news_dict = row[1].to_dict()  # row is an (index, Series) pair
    record_list = ['START']
    count = 0
    while record_list[-1] != 'END' and count < 10:
        count += 1
        features = record_features(record_list, news_dict, template_generator.record_types, template_generator.data_cols)
        record_list.append(record_clf.predict(features.reshape(1, -1))[0])
    pred_records.append(record_list)

In [None]:
# Mean edit distance between predicted and gold label sequences (lower is better).
performance_measure(pred_records, gold_chunks)

In [None]:
# Save best performing model
# Writes the fitted `record_clf`; the final-generation section reloads this file.
with open('../../models/record_selection_lr.pkl', 'wb') as f:
    pickle.dump(record_clf, f)

# Template Choice

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Template data
# Stack the four template-bearing chunk types into one training set, in a fixed order.
template_keys = ('game', 'passing', 'receptions', 'rushing')
template_X = np.concatenate([chunk_training_dict[key]['X'] for key in template_keys])
template_y = np.concatenate([chunk_training_dict[key]['y'] for key in template_keys])

# Shuffle with a fixed seed (matching the random_state=42 used by the models) so
# the row order, and therefore the cross-validation folds built from it, is
# identical on every Restart & Run All.
shuffle = np.random.RandomState(42).permutation(template_y.shape[0])
template_X = template_X[shuffle]
template_y = template_y[shuffle]
print(template_X.shape, template_y.shape)

In [None]:
# KNN
# 5-fold grid search over the neighbour count and the weighting scheme; the
# last expression displays the best parameters and their mean CV score.
parameters = {'n_neighbors':[1, 3, 5, 10, 15], 'weights':('uniform', 'distance')}
knn_clf = KNeighborsClassifier()
clf = GridSearchCV(knn_clf, parameters, cv=5)
clf.fit(template_X, template_y.ravel())
clf.best_params_, clf.best_score_

In [None]:
# Refit KNN with hard-coded settings — presumably the grid-search winner; re-check
# against `clf.best_params_` if the data changes. Scores below are computed on
# the training data itself.
knn_clf = KNeighborsClassifier(n_neighbors=15, weights='distance')
knn_clf.fit(template_X, template_y.ravel())
preds = knn_clf.predict(template_X)

print(accuracy_score(template_y.ravel(), preds))
print(f1_score(template_y.ravel(), preds, average='macro'))

In [None]:
# Random Forest
# 5-fold grid search over forest size and minimum leaf size; the last expression
# displays the best parameters and their mean CV score.
parameters = {'n_estimators':[100, 300, 1000, 3000], 'min_samples_leaf':[1, 3, 5]}
rf_clf = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rf_clf, parameters, cv=5)
clf.fit(template_X, template_y.ravel())
clf.best_params_, clf.best_score_

In [None]:
# Random forest
# Refit with hard-coded settings — presumably the grid-search winner; re-check
# against `clf.best_params_` if the data changes. Scores below are computed on
# the training data itself.
rf_clf = RandomForestClassifier(n_estimators=3000, min_samples_leaf=3, random_state=42)
rf_clf.fit(template_X, template_y.ravel())
preds = rf_clf.predict(template_X)

print(accuracy_score(template_y.ravel(), preds))
print(f1_score(template_y.ravel(), preds, average='macro'))

In [None]:
def get_data_col_types(text):
    """Return the record type of the first ``${...}`` placeholder in ``text``.

    Placeholders are matched as ``${name}`` where ``name`` starts with a
    lowercase letter or underscore and continues with lowercase letters, digits
    or underscores. The first placeholder name is looked up in
    ``template_generator.data_col_to_type``.

    Args:
        text: Template string to inspect.

    Returns:
        The mapped type for the first placeholder, or ``None`` when the text
        contains no placeholder or the placeholder name is not in the mapping.
    """
    tags = re.findall(r'\$\{([_a-z][_a-z0-9]*)\}', text)
    if not tags:
        # Without this guard, tags[0] raised an uncaught IndexError for
        # placeholder-free text.
        return None
    try:
        return template_generator.data_col_to_type[tags[0]]
    except KeyError:
        return None
    
def record_type_mask(classes):
    """Map each template string in ``classes`` to its record type.

    Args:
        classes: Iterable of template strings.

    Returns:
        A list, aligned with ``classes``, holding the result of
        ``get_data_col_types`` for each entry.
    """
    return [get_data_col_types(template) for template in classes]

# Record type of every class known to the random forest, aligned with
# rf_clf.classes_; stored as an array so it can be compared element-wise.
record_types = record_type_mask(rf_clf.classes_)
record_types = np.array(record_types)

In [None]:
preds = rf_clf.predict_proba(template_X)

# `record_type` was never defined anywhere in the notebook, so this cell raised
# NameError on a fresh kernel. Derive one record type per training sample from
# its gold template instead.
# NOTE(review): this reconstructs the apparent intent — confirm it is the
# per-sample record type the masking was meant to use.
record_type = record_type_mask(template_y.ravel())

# For each sample, zero out the probabilities of classes belonging to a
# different record type, then pick the most probable remaining template.
tmplt_out = []
for pred, rt in zip(preds, record_type):
    record_mask = np.where(record_types == rt, 1.0, 0.0)
    tmplt_out.append(rf_clf.classes_[np.argmax(pred * record_mask)])

In [None]:
# NOTE: this estimator is never fitted in the notebook; `template_clf` is rebound
# later when the pickled KNN model is loaded for final generation.
template_clf = LogisticRegression(C=10.0, max_iter=1000, multi_class='multinomial', solver='lbfgs', random_state=42)

In [None]:
# NOTE(review): same estimator settings and data as the earlier `knn_clf` cell —
# looks like a scratch duplicate; `asdf_clf` is not used afterwards.
# Scores below are computed on the training data itself.
asdf_clf = KNeighborsClassifier(n_neighbors=15, weights='distance')
asdf_clf.fit(template_X, template_y.ravel())
preds = asdf_clf.predict(template_X)

print(accuracy_score(template_y.ravel(), preds))
print(f1_score(template_y.ravel(), preds, average='macro'))

In [None]:
# Save best performing model
# Writes the fitted `knn_clf`; the final-generation section reloads this file.
with open('../../models/template_selection_knn.pkl', 'wb') as f:
    pickle.dump(knn_clf, f)

# Final generation

In [None]:
# Re-read the test rows so this section can run from this point onwards
# (same file as the earlier `test_data` load).
test_data = pd.read_csv('../../data/template_test_data.csv')

In [None]:
# Unpickle models
# Rebinds `record_clf` and `template_clf` to the objects saved earlier.
# pickle.load can execute arbitrary code, so only load files from a trusted source.
with open('../../models/record_selection_lr.pkl', 'rb') as f:
    record_clf = pickle.load(f)

with open('../../models/template_selection_knn.pkl', 'rb') as f:
    template_clf = pickle.load(f)

In [None]:
# Generator built from the team-alias file and the two reloaded classifiers.
news_generator = NewsGenerator('../../data/teams_aliases.txt', record_clf, template_clf)

In [None]:
# Run generation over the test rows; the three results become the columns of
# the output table written in the next cell.
record_output, template_output, news_output = news_generator.doc_processing(test_data)

In [None]:
# Collect the three generation outputs side by side and write them to CSV
# without the index column.
output_df = pd.DataFrame({'record': record_output, 'template': template_output, 'news_update': news_output})
output_df.to_csv('../../data/test_news_updates.csv', index=False)

# WIP

In [None]:
# Scratch: push the single row with index label 550 through the same steps used
# inside chunk_gold_standard, so the intermediate objects can be inspected below.
news_dict = news_stats_df.loc[550].to_dict()
inverted_news_dict = create_inverted_news_dict(news_dict, 
                                               template_generator.data_cols, 
                                               template_generator.team_id_dict,
                                               template_generator.id_team_dict)

normalized_text = text_normalization(news_dict['report'])

doc = nlp(normalized_text)

doc = template_generator.template_tagging(doc, inverted_news_dict, training=False)
news_template = template_generator.doc_to_template(doc)

In [None]:
# Display the tagged document.
doc

In [None]:
# Print each token with its part-of-speech tag and entity type.
for token in doc:
    print(token, token.pos_, token.ent_type_)

In [None]:
# Display the chunker's output for the tagged document.
template_generator.chunker(doc)

In [None]:
# Number of entries in the generator's data_cols.
len(template_generator.data_cols)

In [None]:
# Load the saved template output and align its column name with news_stats_df
# ('reports' -> 'report'); index=str also converts the index labels to strings.
output_templates = pd.read_csv('../../data/nbmodel_templates.csv')
output_templates.rename(index=str, columns={'reports': 'report'}, inplace=True)
# Right join on the report text keeps every row of output_templates.
# NOTE(review): `asdf` is a scratch name and is not used afterwards.
asdf = pd.merge(news_stats_df, output_templates, how='right', on='report')

In [None]:
# Unique chunks
# Quick probe: display the record type mapped to the `player_name` placeholder.
get_data_col_types('${player_name}')

In [None]:
nb_output = pd.read_csv('../../data/nbmodel_templates.csv')

# Group every template chunk under the record type of its first placeholder
# (None when get_data_col_types finds no mapped placeholder).
chunk_dict = dict()
for chunks in nb_output['template_chunks']:
    # Each cell presumably holds the text repr of a list of chunk strings.
    # It is parsed with ast.literal_eval instead of eval so that file content
    # can only ever be read as a Python literal, never executed as code.
    for chunk in ast.literal_eval(chunks):
        chunk_type = get_data_col_types(chunk)
        chunk_dict.setdefault(chunk_type, []).append(chunk)

In [None]:
# Tally how often each distinct chunk occurs within its record type.
chunk_counters = {
    record_kind: Counter(kind_chunks)
    for record_kind, kind_chunks in chunk_dict.items()
}

In [None]:
# Display the record types that have at least one chunk.
list(chunk_counters.keys())

In [None]:
# Number of distinct 'game' chunks.
print(len(chunk_counters['game'].keys()))
# Count the 'game' chunks seen more than twice (the same cut-off used when
# filtering the training data) and the total number of samples they cover.
count = 0
num_samples = 0
for k, v in chunk_counters['game'].items():
    if v > 2:
        count += 1
        num_samples += v
        print(k, ':', v)
print(count, num_samples)
#chunk_counters['game']