In [179]:
import os

DATA_DIR = "baseball_article"

filelist = [os.path.join(DATA_DIR, filename) for filename in os.listdir("baseball_article")]
filelist[:3]

['baseball_article\\1_Dae-Ho Lee walk-off homer gives Mariners 4-2 win over Rangers.txt',
 'baseball_article\\1_Korean First Baseman Dae-Ho Lee Becomes Free Agent, Interested In MLB Deal.txt',
 'baseball_article\\1_Lee Dae-ho Announces MLB Aspirations.txt']

In [180]:
raw_news_article = {}

for filename in filelist:
    with open(filename , "r") as f:
        raw_news_article[os.path.split(filename)[1]] = f.read()

raw_news_article["1_Lee Dae-ho Announces MLB Aspirations.txt"]

'Lee Dae-ho, a Korean slugger currently playing with the SoftBank Hawks in Japan, has announced plans to pursue a career in Major League Baseball in the U.S.\n\nLee has earned recognition in Nippon Professional Baseball and was named MVP of the Japan Series this year. \n\n"I\'ve achieved my goals for this year. One was helping my team win the Japan Series and the other was becoming MVP," Lee said at a press event on Tuesday after returning home last weekend. "It is now time to pursue my childhood dream of playing in the MLB." \n\nMeanwhile, another Korean player, Park Byung-ho of the Nexen Heroes, is waiting for bids from MLB teams after posting was made early this week.\n\nExpectations for the two players are high after Kang Jung-ho successfully transferred from the Nexen Heroes to the Pittsburgh Pirates last year.'

In [181]:
# Rreprocessing


#1 To make lowercase word
for key, value in raw_news_article.items():
    raw_news_article[key] = value.lower()

In [182]:
import nltk
import string
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer

#2 Tokenizing and stemming functions

stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems

#3 Make corpus list 
news_corpus = set()
for value in raw_news_article.values():
    news_corpus = news_corpus |  set(tokenize(value))

len(news_corpus)

3406

In [183]:
#4 Removing stopwords 

with open("stopwords.txt", "r") as f:
    raw_stopwords = f.read()
    
stopwords_corpus = tokenize(raw_stopwords)

def imporved_tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    tokens = [i for i in tokens if i not in stopwords_corpus]
    stems = stem_tokens(tokens, stemmer)
    return stems

#3 Make corpus list 
news_corpus = set()
for value in raw_news_article.values():
    news_corpus = news_corpus |  set(imporved_tokenize(value))

len(news_corpus)

3211

In [184]:
# Make Corpus dict

news_corpus_dict = {word : index for index, word in enumerate(news_corpus)}
news_corpus_dict

{'gylfi': 0,
 'standpoint': 1,
 'qualiti': 2,
 'fox': 3,
 'lesen': 4,
 'we’d': 5,
 'scream': 6,
 'profession': 7,
 'leverkusen': 8,
 'jacqu': 14,
 'penultim': 12,
 'struggl': 15,
 'session': 16,
 '10': 17,
 'pretti': 19,
 'leav': 20,
 'bobadilla': 22,
 'system': 532,
 'carri': 23,
 'entir': 24,
 'opposit': 25,
 '1977.': 26,
 'goalscor': 27,
 'grate': 2685,
 'combin': 11,
 'confus': 28,
 'road': 32,
 'whi': 31,
 'corner': 36,
 'read': 33,
 'task': 37,
 'repair': 1105,
 'dive': 38,
 'spectators’': 13,
 'landor': 42,
 '2-0.': 41,
 'format': 541,
 'handl': 43,
 'ask': 44,
 'fascin': 45,
 'aaa': 46,
 'site': 10,
 'earlier': 48,
 'batter’': 49,
 'auf': 50,
 '“ani': 51,
 'mousa': 2126,
 'ixzz48qovyrym': 52,
 'learn': 54,
 'kicker': 55,
 'respectively.so': 59,
 'everi': 58,
 'birth': 60,
 '“tottenham': 61,
 'era': 62,
 'be': 63,
 'box.”': 65,
 'rise': 18,
 'lion': 66,
 'homerun': 67,
 'solut': 68,
 'lament': 70,
 'entertain': 71,
 'concuss': 72,
 'easier': 73,
 '3': 2674,
 'today': 545,
 'norm

In [186]:
# Make zero vectors for news articles

news_vector_dict = {news_name : [0 for i in range(len(news_corpus))]for news_name in raw_news_article}

# for key, value in raw_news_article.items():
#     for word in tokenize(value):
#         news_vector_dict[key][news_corpus_dict[word]] += 1    

for key, value in raw_news_article.items():
    for word in imporved_tokenize(value):
        news_vector_dict[key][news_corpus_dict[word]] += 1    
        
news_vector_dict["1_Lee Dae-ho Announces MLB Aspirations.txt"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [187]:
def get_normalized_vector(vector, mean, std):
    return [ (value  -  mean) / std for value in vector]

raw_word_matrix = [vector for vector in news_vector_dict.values()]

total_length = len(raw_word_matrix) * len(raw_word_matrix[1])  
mean = sum([sum(vector) for vector in news_vector_dict.values()]) / total_length

temp_vector = []
for vector in [[(value - mean) ** 2 for value in vector] for vector in news_vector_dict.values()]:
    for value in vector:
        temp_vector.append(value)
std = sum(temp_vector) / total_length

for key, value in news_vector_dict.items():
    news_vector_dict[key] = get_normalized_vector(value, mean, std)

In [188]:
def vectro_sqaure_root(vector_a, vector_b):
    import math
    return math.sqrt(sum([(a - b) ** 2 for a,b in zip(vector_a, vector_b)]))

vector_a = news_vector_dict["1_Lee Dae-ho Announces MLB Aspirations.txt"]
vector_b = news_vector_dict["1_Dae-Ho Lee walk-off homer gives Mariners 4-2 win over Rangers.txt"]

vectro_sqaure_root(vector_a,vector_b)

189.84687105409367

In [189]:
news_article_number = {index:key for index, key in enumerate(raw_news_article.keys())}
news_article_number

news_distance_matrix = []

for source_index in news_article_number.keys():
    news_distance_matrix.append([])
    for target_index in news_article_number.keys():
        source_vector = news_vector_dict[news_article_number[source_index]]
        target_vector = news_vector_dict[news_article_number[target_index]]
        
        news_distance_matrix[source_index].append(vectro_sqaure_root(source_vector,target_vector))

In [201]:
def get_matrix_index(news_name):
    for key, value in news_article_number.items(): 
        if value == news_name:
            return key  
        
hit_rate = { i:[0,0] for i in range(1,9) }
        
hit_rate
for news_name in raw_news_article.keys():
    taraget_matrix_index = get_matrix_index(news_name)
    target_matrix_dict = {index: value for index, value in enumerate(news_distance_matrix[taraget_matrix_index])}
    
    hit_rate[int(news_name[0])]
    
    from collections import OrderedDict
    matrix_sorted_by_value = OrderedDict(sorted(target_matrix_dict.items(), key=lambda x: x[1])[1:5])

    for keys in matrix_sorted_by_value.keys():
        if news_name[0] == news_article_number[keys][0]:
            hit_rate[int(news_name[0])][0] += 1
        else:
            hit_rate[int(news_name[0])][1] += 1

true_case = 0
false_case = 0

for value in hit_rate.values():
    true_case += value[0]
    false_case += value[1]    
    
print("hit ratio:", true_case / (true_case+false_case))

hit ratio: 0.5125
