In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the three raw CSV inputs. The merge cell further down joins them on
# `player_id` (and, for the stats, on the calendar date as well).
players = pd.read_csv('../../data/player_list.csv')
weekly_stats = pd.read_csv('../../data/player_stats.csv')
news = pd.read_csv('../../data/player_news.csv')

In [None]:
# Put all player stats and the corresponding news article in one row.
#
# Step 1: news items. The timestamp is reduced to a calendar date so it can be
# matched against the stats date; columns that duplicate player info are dropped.
temp_df = (
    news
    .rename(index=str, columns={'date': 'news_date'})
    .assign(news_date=lambda frame: pd.to_datetime(frame['news_date']).dt.date)
    .drop(columns=['player', 'position', 'url'])
)
# Inner join (pandas default): players without any news item drop out here.
full_df = players.drop(columns='player_link').merge(temp_df, on='player_id')

# Step 2: stat lines, prepared the same way.
temp_df = (
    weekly_stats
    .rename(index=str, columns={'date': 'stats_date'})
    .assign(stats_date=lambda frame: pd.to_datetime(frame['stats_date']).dt.date)
    .drop(columns='player')
)
# Keep only news items that have a stat line for the same player on the same
# day; `stats_date` then duplicates `news_date` and is removed.
full_df = (
    full_df
    .merge(temp_df, left_on=['player_id', 'news_date'], right_on=['player_id', 'stats_date'])
    .drop(columns='stats_date')
)

In [None]:
# (rows, columns) of the merged frame -- a quick check of how many
# news/stat pairs survived the two inner merges.
print(full_df.shape)

In [None]:
# Persist the merged frame; the positional index is not written.
full_df.to_csv('../../data/news_and_stats.csv', index=False)

In [None]:
# Reload the merged frame from disk. After this round trip `news_date` is a
# plain string column (no `parse_dates` is passed), not `datetime.date` objects.
full_df = pd.read_csv('../../data/news_and_stats.csv')

# Processing

### String preprocessing

In [None]:
from itertools import chain
import re
from string import Template
import ftfy

In [None]:
# Spelled-out number word -> digit string. Keys are lowercase; lookups go
# through `number_words_repl`, which lowercases the matched text first.
number_word_dict = {'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
                    'ten': '10', 'eleven': '11', 'twelve': '12', 'thirteen': '13', 'fourteen': '14', 'fifteen': '15', 'sixteen': '16',
                    'seventeen': '17', 'eighteen': '18', 'nineteen': '19', 'twenty': '20', 'thirty': '30', 'forty': '40', 'fifty': '50',
                    'sixty': '60', 'seventy': '70', 'eighty': '80', 'ninety': '90',
                    # Misspellings that used to be the only keys for 14 and 40;
                    # kept as aliases so text containing them is still converted.
                    'fouteen': '14', 'fourty': '40'}


def number_words_repl(match):
    """Replacement callback for ``re.sub``: turn a matched number word into digits.

    Args:
        match: regex match object whose full match is a key of
            ``number_word_dict`` in any letter case.

    Returns:
        The digit string for the matched word (e.g. ``'2'`` for ``'Two'``).

    Raises:
        KeyError: if the matched text is not a known number word.
    """
    # The caller matches with re.IGNORECASE, so the matched text may be
    # capitalised ("Two"); the dictionary keys are all lowercase.
    return number_word_dict[match.group(0).lower()]

In [None]:
# Columns of the merged frame whose values are looked for in the report text.
# `create_inverted_news_dict` skips every key that is not listed here.
data_cols = ['player_name', 'player_position', 'team', 'week', 'opp', 'away_game', 
             'pass_attempts', 'pass_completions', 'pass_percent', 'pass_yards', 'pass_ya', 'pass_td', 'pass_int', 
             'rush_attempts', 'rush_yards', 'rush_avg', 'rush_td', 
             'reception', 'rec_yards', 'rec_avg', 'rec_td', 
             'fumb_lost', 'ko_ret_td', 'ko_ret_yards', 'punt_ret_td', 'punt_ret_yards']

### spaCy processing

In [None]:
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


class NFLTeamRecognizer(object):
    """Pipeline component that marks NFL team mentions in a ``Doc``.

    Every phrase in ``teams`` becomes a ``PhraseMatcher`` pattern. Calling the
    component on a document adds one entity per match, flags the matched
    tokens via ``token._.is_nfl_team`` and finally calls ``merge()`` on each
    matched span.
    """
    name = 'nfl_teams'

    def __init__(self, nlp, teams=tuple(), label='NFL_TEAM'):
        """Build the matcher and register the custom extensions.

        Args:
            nlp: the language object; used to tokenise the patterns and to
                look up / store the entity label in its string store.
            teams: iterable of team surface forms to match.
            label: entity label assigned to matched spans.
        """
        string_store = nlp.vocab.strings
        if label not in string_store:
            string_store.add(label)
        # Keep whatever the string store returns for the label (not the text).
        self.label = string_store[label]

        team_patterns = [nlp(team_name) for team_name in teams]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('NFL_TEAMS', None, *team_patterns)

        # Token-level flag, plus Doc/Span getters derived from that flag.
        Token.set_extension('is_nfl_team', default=False)
        for container in (Doc, Span):
            container.set_extension('has_nfl_team', getter=self.has_nfl_team)

    def __call__(self, doc):
        """Annotate ``doc`` with team entities and return the same ``doc``."""
        team_spans = []
        for _match_id, first, last in self.matcher(doc):
            team_span = Span(doc, first, last, label=self.label)
            team_spans.append(team_span)
            for tok in team_span:
                tok._.set('is_nfl_team', True)
            # Append to the existing entities instead of replacing them.
            doc.ents = list(doc.ents) + [team_span]
        # Merging happens only after every match has been recorded.
        for team_span in team_spans:
            team_span.merge()
        return doc

    def has_nfl_team(self, tokens):
        """Return True if any token in ``tokens`` carries the team flag."""
        return any(tok._.get('is_nfl_team') for tok in tokens)
    

class NFLPlayerRecognizer(object):
    """Pipeline component that marks NFL player mentions in a ``Doc``.

    Every string in ``players`` becomes a ``PhraseMatcher`` pattern. Calling
    the component on a document adds one entity per match, flags the matched
    tokens via ``token._.is_nfl_player`` and finally calls ``merge()`` on each
    matched span.
    """
    name = 'nfl_players'

    def __init__(self, nlp, players=tuple(), label='NFL_PLAYER'):
        """Build the matcher and register the custom extensions.

        Args:
            nlp: the language object; used to tokenise the patterns and to
                look up / store the entity label in its string store.
            players: iterable of player names (or name parts) to match.
            label: entity label assigned to matched spans.
        """
        string_store = nlp.vocab.strings
        if label not in string_store:
            string_store.add(label)
        # Keep whatever the string store returns for the label (not the text).
        self.label = string_store[label]

        player_patterns = [nlp(player_name) for player_name in players]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('NFL_PLAYERS', None, *player_patterns)

        # Token-level flag, plus Doc/Span getters derived from that flag.
        Token.set_extension('is_nfl_player', default=False)
        for container in (Doc, Span):
            container.set_extension('has_nfl_player', getter=self.has_nfl_player)

    def __call__(self, doc):
        """Annotate ``doc`` with player entities and return the same ``doc``."""
        player_spans = []
        for _match_id, first, last in self.matcher(doc):
            player_span = Span(doc, first, last, label=self.label)
            player_spans.append(player_span)
            for tok in player_span:
                tok._.set('is_nfl_player', True)
            # Append to the existing entities instead of replacing them.
            doc.ents = list(doc.ents) + [player_span]
        # Merging happens only after every match has been recorded.
        for player_span in player_spans:
            player_span.merge()
        return doc

    def has_nfl_player(self, tokens):
        """Return True if any token in ``tokens`` carries the player flag."""
        return any(tok._.get('is_nfl_player') for tok in tokens)

In [None]:
import spacy

# Load the English pipeline through the 'en' shortcut name.
# NOTE(review): presumably this notebook targets spaCy v2 (it also relies on
# Span.merge and the three-argument PhraseMatcher.add) -- confirm the pinned version.
nlp = spacy.load('en')

In [None]:
# Teams
# NOTE(review): eval() executes whatever the aliases file contains. If the
# file is always a plain dict literal, ast.literal_eval (or JSON) would be the
# safer way to read it -- confirm the file format before switching.
with open('../../data/teams_aliases.txt') as f:
    team_dict = eval(f.read())

# Flat list of every alias of every team; these become the matcher patterns.
teams = list(chain.from_iterable(team_dict.values()))

# Sequential integer id (in file order) -> list of aliases for that team.
id_team_dict = dict(enumerate(team_dict.values()))

# Reverse lookup: alias -> team id. An alias listed under two teams keeps the
# id of the later one.
team_id_dict = {alias: team_id
                for team_id, aliases in id_team_dict.items()
                for alias in aliases}

# Players
players = pd.read_csv('../../data/player_list.csv')
# Rows without a name cannot be split into name parts (and cannot be turned
# into a matcher pattern), so they are left out.
player_names = players['player_name'].dropna()
# Match on the full name and on the last space-separated part of the name.
player_list = set(player_names) | {full_name.split(' ')[-1] for full_name in player_names}

# Register both recognisers at the end of the pipeline: teams first, then players.
component = NFLTeamRecognizer(nlp, teams)
nlp.add_pipe(component, last=True)
component = NFLPlayerRecognizer(nlp, player_list)
nlp.add_pipe(component, last=True)

### Sample run

In [None]:
def create_inverted_news_dict(news_dict, data_cols, team_id_dict, id_team_dict):
    """Build a lookup from value strings to the column(s) they came from.

    Args:
        news_dict: one row of the merged frame as ``{column: value}``.
        data_cols: collection of column names to consider; other keys are ignored.
        team_id_dict: maps a team alias to its team id.
        id_team_dict: maps a team id to the list of all aliases of that team.

    Returns:
        dict mapping ``str(value)`` to a column name, or to a list of column
        names when several columns share the same value. For the ``team`` and
        ``opp`` columns every alias of the team is mapped to the column name
        (overwriting any earlier entry for that alias).

    Raises:
        KeyError: if a ``team``/``opp`` value is not a key of ``team_id_dict``.
    """
    inverted_news_dict = dict()

    for k, v in news_dict.items():
        # pd.isna rather than `v is np.nan`: a NaN taken out of a DataFrame row
        # is usually a fresh float object, so the identity test lets it through
        # and the string 'nan' ends up as a key (or as a failed team lookup).
        if k not in data_cols or pd.isna(v):
            continue

        # Whole-number floats are rendered without the trailing ".0" so that
        # 2.0 becomes '2'.
        if isinstance(v, (float, np.floating)) and v % 1 == 0:
            v = int(v)
        k, v = str(k), str(v)

        if k in ('team', 'opp'):
            # Any alias of the team should resolve to this column.
            for team in id_team_dict[team_id_dict[v]]:
                inverted_news_dict[team] = k
        elif v not in inverted_news_dict:
            inverted_news_dict[v] = k
        elif type(inverted_news_dict[v]) is list:
            # Already ambiguous: add one more candidate column.
            inverted_news_dict[v] = inverted_news_dict[v] + [k]
        else:
            # Second column with the same value: switch to a candidate list.
            inverted_news_dict[v] = [inverted_news_dict[v], k]

    return inverted_news_dict

In [None]:
def text_normalization(text, number_word_dict):
    """Normalise a news report so its tokens line up with the stat values.

    Steps, in order: run the text through ``ftfy.fix_text``; split "-of-"
    compounds; lowercase the word "Week"; replace spelled-out number words
    with digits (case-insensitively); and break "number-word" and
    "word-number" hyphenations into two space-separated parts.

    Args:
        text: the raw report text.
        number_word_dict: maps number words to digit strings. Keys are matched
            case-insensitively as whole words.

    Returns:
        The normalised text.
    """
    text = ftfy.fix_text(text)
    text = text.replace('-of-', ' of ')
    text = re.sub(r'\bWeek\b', 'week', text)

    # An empty mapping would yield the pattern r'\b()\b', which matches the
    # empty string at every word boundary -- skip the substitution instead.
    if number_word_dict:
        # Use the mapping that was passed in, keyed in lowercase because the
        # pattern is case-insensitive and may match e.g. "Two".
        digits_by_word = {word.lower(): digits for word, digits in number_word_dict.items()}
        number_word_pattern = r'\b(' + '|'.join(re.escape(word) for word in number_word_dict) + r')\b'
        text = re.sub(number_word_pattern,
                      lambda match: digits_by_word[match.group(0).lower()],
                      text, flags=re.IGNORECASE)

    # "10-yard" -> "10 yard"
    text = re.sub(r'(\d+)-([A-Za-z]+)', r'\1 \2', text)
    # "week-3" -> "week 3". The character class must not be escaped: with
    # r'\[A-Za-z]' the pattern only matches the literal text "[A-Za-z]".
    text = re.sub(r'([A-Za-z]+)-(\d+)', r'\1 \2', text)
    return text

In [None]:
def doc_to_template(doc, inverted_news_dict):
    """Turn a tokenised report into a ``string.Template``.

    Tokens whose text is a key of ``inverted_news_dict`` are replaced by a
    ``${...}`` placeholder; all other tokens are copied with ``$`` escaped as
    ``$$``. The whitespace following each token is preserved.

    Args:
        doc: iterable of tokens exposing ``text`` and ``text_with_ws``.
        inverted_news_dict: maps a token text to a column name, or to a list
            of column names when the value is ambiguous.

    Returns:
        ``(template, ambiguous)`` where ``ambiguous`` maps each generated
        ``temp_var_<n>`` placeholder to its list of candidate column names.
    """
    pieces = []
    ambiguous_placeholder_dict = {}

    for token in doc:
        word = token.text
        if word not in inverted_news_dict:
            # Literal text: a bare '$' would otherwise start a placeholder.
            pieces.append(word.replace('$', '$$'))
        else:
            target = inverted_news_dict[word]
            if type(target) is list:
                # Several columns share this value; emit a numbered stand-in.
                # The dict size equals the number of stand-ins issued so far.
                placeholder = 'temp_var_{}'.format(len(ambiguous_placeholder_dict))
                pieces.append('${' + placeholder + '}')
                ambiguous_placeholder_dict[placeholder] = target
            else:
                pieces.append('${' + target + '}')
        # Re-attach whatever whitespace followed the token in the source text.
        pieces.append(token.text_with_ws[len(word):])

    return Template(''.join(pieces)), ambiguous_placeholder_dict

In [None]:
# Build one template per merged row and save the (report, template) pairs.
news_reports = []
output_templates = []
ambiguous_placeholder_list = []

TEAM_OR_PLAYER_LABELS = ['NFL_PLAYER', 'NFL_TEAM']

for _row_index, row_series in full_df.iterrows():
    news_dict = row_series.to_dict()
    raw_report = news_dict['report']

    # Value string -> column name(s) for this row.
    inverted_news_dict = create_inverted_news_dict(
        news_dict, data_cols, team_id_dict, id_team_dict)

    normalized_text = text_normalization(raw_report, number_word_dict)
    doc = nlp(normalized_text)

    # Merge player/team entities so each one is looked up as a single token.
    for e in doc.ents:
        if e.label_ in TEAM_OR_PLAYER_LABELS:
            e.merge()

    news_template, ambiguous_placeholders = doc_to_template(doc, inverted_news_dict)

    # The three result lists stay aligned: one entry per row, in row order.
    news_reports.append(raw_report)
    output_templates.append(news_template.template)
    ambiguous_placeholder_list.append(ambiguous_placeholders)

templates_df = pd.DataFrame({'reports': news_reports, 'templates': output_templates})
templates_df.to_csv('../../data/output_templates.csv', index=False)