# Query the NFL Team Article Database and Run Topic Models (NMF and LDA)

In [1]:
# Packages for NFL Database

import psycopg2
import config as config
import teamQuery as tq

In [6]:
# Sanity check: number of items teamQueryCounts returns for the Falcons
# (displayed as the cell's output).
len(tq.teamQueryCounts("Falcons"))

286

In [8]:
def quick_team_corpus(team):
    """Collect one field from every row that ``tq.teamQueryTexts(team)`` yields.

    Parameters
    ----------
    team : passed straight through to ``tq.teamQueryTexts``.

    Returns
    -------
    list
        Element at index 3 of each returned row, in the order the rows
        were produced.
    """
    corpus = []
    for row in tq.teamQueryTexts(team):
        # presumably index 3 is the article body -- verify against teamQuery
        corpus.append(row[3])
    return corpus

In [9]:
# Build the corpus list for the Redskins with quick_team_corpus.
red = quick_team_corpus("Redskins")

In [10]:
# Number of entries in the Redskins corpus (displayed as the cell's output).
len(red)

411

In [14]:
# Packages for LDA

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer


In [57]:
# Initializations

# Fetch the WordNet corpus through nltk; it is only used by the 'lemma' mode below.
nltk.download('wordnet')

n_features = 1000   # passed to both vectorizers as max_features
n_topics = 10       # n_components for the NMF and LDA models
n_top_words = 5     # terms printed per topic by print_top_words

base_tokenizer = CountVectorizer().build_tokenizer()
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()


def stem_tokens(text):
    """Tokenize ``text`` with ``base_tokenizer`` and stem every token."""
    return [stemmer.stem(item) for item in base_tokenizer(text)]


def lemmatize_tokens(text):
    """Tokenize ``text`` with ``base_tokenizer`` and lemmatize every token."""
    return [lemmatizer.lemmatize(item) for item in base_tokenizer(text)]


# Explicit switch instead of commenting lines in and out.
#   None    -> tokenize is None (the vectorizers receive tokenizer=None)
#   'stem'  -> Snowball stemming
#   'lemma' -> WordNet lemmatization
TOKENIZERS = {None: None, 'stem': stem_tokens, 'lemma': lemmatize_tokens}
TOKENIZER_MODE = None

# NOTE(review): the execution counts are out of order (this cell is In [57],
# the NMF cells are In [51]-[53]) and the saved NMF topics contain tokens
# such as 'wa' and 'fan' that look lemmatized -- presumably they were
# produced with the 'lemma' setting. Confirm with Restart & Run All.
tokenize = TOKENIZERS[TOKENIZER_MODE]


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anthony\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
# Print the highest-weighted terms of every topic in a fitted model
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its strongest terms.

    Parameters
    ----------
    model : object exposing ``components_``; each entry must provide
        ``argsort()``.
    feature_names : indexable mapping a component column index to its term.
    n_top_words : how many terms to list per topic.

    Each topic line is followed by a blank line, and one extra blank line
    is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = ", ".join(feature_names[j] for j in ranked)
        print("Topic #%d: %s\n" % (topic_idx, top_terms))
    print()
        

In [50]:
# Get the samples for a team.
# `team` selects whose corpus is loaded; `samples` is the list returned by
# quick_team_corpus and is the input to both vectorizers further down.
team = "Ravens"
samples = quick_team_corpus(team)

In [51]:
# TF-IDF vectorization with preprocessing
print("Extracting tf-idf features for NMF...")

# Gather the vectorizer settings in one place before building it.
tfidf_options = dict(
    max_df=.95,                # upper document-frequency cutoff
    min_df=2,                  # lower document-frequency cutoff
    max_features=n_features,   # cap on vocabulary size
    ngram_range=(1, 3),        # unigrams through trigrams
    tokenizer=tokenize,        # None or a custom callable from the init cell
    stop_words='english',
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

# Time only the fit/transform step.
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(samples)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Extracting tf-idf features for NMF...
done in 3.278s.


In [52]:
# NMF Model Fit
# The n_features value reported here is the configured max_features cap.
print("Fitting the NMF model with tf-idf freatures,  "
      'n_samples=%d and n_features=%d...'
      % (len(samples), n_features))

t0 = time()

# NOTE(review): scikit-learn deprecated NMF's `alpha` in 1.0 and removed it
# in 1.2 in favour of alpha_W / alpha_H, which are scaled differently.
# It is left unchanged here so results stay comparable with the saved
# output; port it deliberately when upgrading scikit-learn.
nmf = NMF(n_components=n_topics,
          random_state=1,
          alpha=.1,
          l1_ratio=.5)

nmfResult = nmf.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement when it exists and fall back on older installations.
# list(...) keeps the result a plain list on both paths.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

Fitting the NMF model with tf-idf freatures,  n_samples=315 and n_features=1000...
done in 0.173s.


In [53]:
# Show the Topics
# Prints, for each NMF component, its n_top_words highest-weighted terms.
print("Topics in NMF model:")
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topics in NMF model:
Topic #0: game, wa, just, defense, said

Topic #1: williams, injury, practice, week, defensive

Topic #2: preseason, woodrum, ricard, roster, bradley

Topic #3: langford, practice squad, squad, woodhead, practice

Topic #4: london, fan, trip, nfl, team

Topic #5: purple, pm, fan, mt, mt bank

Topic #6: wallace, flacco, perriman, maclin, receiver

Topic #7: bergstrom, yanda, guard, offensive, traded

Topic #8: collins, carry, school, yard, carl

Topic #9: carr, raider, smith, humphrey, manuel




In [58]:
# TF vectorization - raw term counts with preprocessing
print("Extracting tf features for LDA...")

# Gather the vectorizer settings in one place before building it.
tf_options = dict(
    max_df=.95,                # upper document-frequency cutoff
    min_df=2,                  # lower document-frequency cutoff
    ngram_range=(1, 3),        # unigrams through trigrams
    max_features=n_features,   # cap on vocabulary size
    tokenizer=tokenize,        # None or a custom callable from the init cell
    stop_words='english',
)
tf_vectorizer = CountVectorizer(**tf_options)

# Time only the fit/transform step.
t0 = time()
tf = tf_vectorizer.fit_transform(samples)
elapsed = time() - t0
print('done in %0.3fs.' % elapsed)

Extracting tf features for LDA...
done in 1.580s.


In [59]:
# LDA Model Fit
# The n_features value reported here is the configured max_features cap.
print("Fitting the LDA model  with tf freatures,  "
      'n_samples=%d and n_features=%d...'
      % (len(samples), n_features))

t0 = time()

# Online variational LDA; random_state is fixed so the fit is repeatable.
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0,
                                )

lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement when it exists and fall back on older installations.
# list(...) keeps the result a plain list on both paths.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

Fitting the LDA model  with tf freatures,  n_samples=315 and n_features=1000...
done in 2.587s.


In [60]:
# Show the LDA topics: for each component, its n_top_words highest-weighted terms.
print("Topics in LDA model:")
print_top_words(lda, tf_feature_names, n_top_words)

Topics in LDA model:
Topic #0: game, like, just, said, think

Topic #1: injury, flacco, baltimore, team, coach

Topic #2: injury, practice, williams, week, game

Topic #3: hurst, guard, tackle, left, said

Topic #4: baltimore, school, community, game, city

Topic #5: purple, fans, cancer, stadium, game

Topic #6: going, play, said, good, just

Topic #7: preseason, woodrum, team, game, nfl

Topic #8: foster, preseason, said, game, football

Topic #9: game, just, said, going, play


