In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm.notebook as tq
import pandas as pd
import numpy as np
import pickle
import nltk
from gensim import corpora, matutils, models, similarities
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
import os
from nameparser.parser import HumanName
from nltk.stem import WordNetLemmatizer 

In [None]:
# Load the scraped posts; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/all_posts.csv')
# Bare last expression so the notebook displays (rows, columns).
df.shape

In [None]:
def clean_text(df, text_field,manager,stadium):
    '''
    Clean all the text data within a certain text column of the dataFrame.

    The column is rewritten in place; nothing is returned.

    Steps, in order: drop URLs and HTML entities, turn newlines into spaces,
    drop "#f" and quote/colon characters, replace every remaining
    non-alphanumeric character with a space, collapse runs of spaces,
    lower-case, replace manager / stadium names with the placeholder tokens
    "managers" / "stadium", and finally drop a stand-alone "nan" token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean (mutated in place).
    text_field : str
        Name of the text column.
    manager : iterable of str
        Names to replace with "managers". They are matched as written, so
        they need to be in the same lower-case form as the cleaned text.
    stadium : iterable of str
        Names to replace with "stadium" (same matching rules).
    '''
    import re  # function-scope import keeps this cell self-contained

    def _replace_names(series, names, placeholder):
        # Replace each whitespace-delimited occurrence of a name, including
        # occurrences at the very start or end of the text.
        for name in names:
            if not name:
                # An empty name would match at every word gap.
                continue
            pattern = r"(?<!\S)" + re.escape(name) + r"(?!\S)"
            series = series.str.replace(pattern, placeholder, regex=True)
        return series

    text = df[text_field]
    # regex= is spelled out on every call: pandas 2.0 changed the default to
    # False, which would silently turn these patterns into literal strings.
    text = text.str.replace(r"http\S+", " ", regex=True)
    text = text.str.replace(r"&[a-z]{2,4};", "", regex=True)
    text = text.str.replace("\\n", " ", regex=True)
    text = text.str.replace("#f", "", regex=False)
    text = text.str.replace(r"[\’\'\`\":]", "", regex=True)
    text = text.str.replace(r"[^A-Za-z0-9]", " ", regex=True)
    text = text.str.replace(r" +", " ", regex=True)
    text = text.str.lower()
    text = _replace_names(text, manager, "managers")
    text = _replace_names(text, stadium, "stadium")
    # Only drop "nan" when it is a whole token; a plain " nan" match would
    # also eat the start of words such as "nantes".
    text = text.str.replace(r" nan(?!\S)", "", regex=True)
    df[text_field] = text
    

In [None]:
def preprocess(df):
    '''
    Build and clean the combined `text` column of the posts frame, in place.

    `text` is the title followed by a space and the post body. The manager
    and stadium name lists are loaded from the pretrained pickles and handed
    to `clean_text`, which rewrites `text` in place. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `title` and `self_text` columns (mutated in place).
    '''
    # Missing bodies/titles become empty strings; astype(str) alone would turn
    # NaN into the literal word "nan" and leak it into the corpus.
    df.self_text = df.self_text.fillna('').astype(str)
    df['text'] = df.title.fillna('').astype(str) + ' ' + df.self_text

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # Context managers make sure the file handles are closed.
    with open('../data/pretrained/manager_names.pkl','rb') as manager_file:
        manager_names = pickle.load(manager_file)
    with open('../data/pretrained/stadium_names.pkl','rb') as stadium_file:
        stadium_names = pickle.load(stadium_file)

    clean_text(df, 'text',manager_names,stadium_names)

In [None]:
# Adds the cleaned `text` column to df in place (preprocess returns None).
preprocess(df)

In [None]:
# Initialise the WordNet lemmatizer once; the lemmatisation cell below reuses it.
lemmatizer = WordNetLemmatizer()

In [None]:
# Tokenise every cleaned post, lemmatise each token and re-join the tokens
# with single spaces. lemmatize() is called without a part-of-speech tag.
lemma = [
    ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(post))
    for post in tq.tqdm(df.text)
]
# Overwrite the text column with its lemmatised form.
df.text = lemma

In [None]:
# Hand-picked player nicknames; added to the stop-word list in stopwords_generation().
player_nicknames = ['leo','cr7','chicharito','hulk','dave','rhino','rino']

In [None]:
# Player names to be treated as stop words (see stopwords_generation()).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle once loading is done.
with open('../data/pretrained/player_names.pkl','rb') as player_names_file:
    player_names = pickle.load(player_names_file)

In [None]:
team_names = ['arsenal','gunners','milan','ac','acmilan','rossoneri','atlético','atletico','madrid','barcelona','barca',
             'borussia','dortmund','borussiadortmund','schwarzgelben','chelsea','munich','bayern','münchen','inter',
             'intermilan','nerazzurri','juve','juventus','liverpool','reds','manchester','mancity','city','mcfc','psg','paris',
             'saint-germain','saint','germain','real','united','manutd','utd','man','mufc','devels','red','blue','roma',
             'tottenham','spurs','spur','hotspur','hotspurs','aston', 'villa','brentford','beighton','burnley','cystal',
             'palace','everton','leeds','leicester','newcastle','norwich','southampton','swansea','watford','ham','wolverhampton','wanderers',
             'alavés','athletic','celta','vigo','elche','espanyol','getafe','granada','levante','mallorca','osasuna','vallecano',
             'betis','sociedad','sevilla','valencia','villarreal','atlanta','bologna','cagliari','empoli','fiorentina','genoa',
             'verona','internazionale','lazio','napoli','salernitana','sampdoria','sassuolo','spezia','torino','udinese','venezia',
             'arminia','bielefeld','leverkusen','bochum','mönchengladbach','eintracht','frankfurt','freiburg','hertha','hoffenhiem',
             'köln','leipzig','mainz','stuttgart','union','berlin','wolfsburg','lille','lyon','marseille','monaco','nantes','nice']

In [None]:
# Day names, month names (short and long) and years 2010-2027; added to the
# stop-word list in stopwords_generation(). 'february' is the corrected
# spelling; the misspelt 'feburary' is kept so the list only ever grows.
time_words = ['mon','tue','wed','thu','fri','sat','sun','jan','feb','mar','apr','may','jun','jul','aug','sep',
              'oct','nov','dec','monday','tuesday','wednesday','thursday','friday','saturday','sunday','january',
              'february','feburary','march','april','may','june','july','august','september','october','november','december',
              '2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020','2021','2022','2023',
              '2024','2025','2026','2027']

In [None]:
def stopwords_generation():
    """
    Assemble the de-duplicated stop-word list used by the vectorizers.

    Combines NLTK's English stop words, the same words with apostrophes
    removed, a few hand-picked filler tokens, and the notebook-level
    `player_names`, `time_words`, `player_nicknames` and `team_names`.

    Returns
    -------
    list of str
        Unique stop words, in no particular order.
    """
    english = stopwords.words('english')
    # Cleaned text has no apostrophes, so add e.g. "dont" alongside "don't".
    without_apostrophes = [word.replace("\'", "") for word in english]
    fillers = ['nbsp', 'also', 'really', 'ive', 'even', 'jon', 'lot', 'could', 'many','x200b','would','one','remove','removed','delete','deleted',
               'juventusnews24']
    combined = (
        english
        + without_apostrophes
        + fillers
        + list(player_names)
        + list(time_words)
        + list(player_nicknames)
        + list(team_names)
    )
    return list(set(combined))

In [None]:
def vector_generation():
    '''
    Fit a count vectorizer and a TF-IDF vectorizer on `df.text`.

    Both vectorizers share the same tokenisation (3+ word characters, no
    lower-casing), stop words and document-frequency cut-offs. The resulting
    matrices are transposed to (terms x documents), their shape and in-memory
    size are printed, and both are pickled under ../data/pretrained/.

    Reads the notebook-level `df`.

    Returns
    -------
    tuple
        (cv, tfidf, cv_vecs, tf_vecs): the two fitted vectorizers followed by
        the two transposed sparse matrices.
    '''
    def _sparse_nbytes(matrix):
        # sys.getsizeof() only measures the small Python wrapper object; the
        # real payload of a compressed sparse matrix lives in these arrays.
        return matrix.data.nbytes + matrix.indices.nbytes + matrix.indptr.nbytes

    stop = stopwords_generation()
    # NOTE(review): uint8 cannot hold a per-document count above 255 --
    # confirm no term occurs that often within a single post.
    cv = CountVectorizer(token_pattern='\\w{3,}', max_df=.30, min_df=.0001,
                     stop_words=stop, ngram_range=(1,1),lowercase=False,dtype='uint8')

    tfidf = TfidfVectorizer(token_pattern='\\w{3,}', max_df=.30, min_df=.0001,
                        stop_words=stop, ngram_range=(1,1), lowercase=False,
                        sublinear_tf=True, smooth_idf=False, dtype='float32')
    # Transposed so that rows are terms and columns are documents.
    cv_vecs = cv.fit_transform(df.text).transpose()
    tf_vecs = tfidf.fit_transform(df.text).transpose()
    print("Sparse Shape:", cv_vecs.shape)
    print('CV:', _sparse_nbytes(cv_vecs))
    print('Tf-Idf:', _sparse_nbytes(tf_vecs))
    # Context managers flush and close the pickle files deterministically.
    with open('../data/pretrained/cv_vecs_v4.pkl', 'wb') as cv_file:
        pickle.dump(cv_vecs, cv_file)
    with open('../data/pretrained/tfidf_vecs_v4.pkl', 'wb') as tf_file:
        pickle.dump(tf_vecs, tf_file)
    return cv,tfidf,cv_vecs,tf_vecs

In [None]:
# Fitted vectorizers plus the transposed count and TF-IDF matrices; later cells use all four.
cv,tfidf,cv_vecs, tf_vecs = vector_generation()

In [None]:
def vector_info(cv_vecs,tf_vecs):
    '''
    Print summary information about the count and TF-IDF matrices.

    Each matrix is transposed back, densified into a DataFrame with one
    column per vocabulary term, and its `.info()` is printed. The TF-IDF
    column sums are then printed in descending order.

    Reads the notebook-level fitted vectorizers `cv` and `tfidf` for the
    column names. Densifying can be very memory-hungry on a large corpus.

    Parameters
    ----------
    cv_vecs : sparse matrix
        Count matrix as returned by vector_generation().
    tf_vecs : sparse matrix
        TF-IDF matrix as returned by vector_generation().
    '''
    def _feature_names(vectorizer):
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older installations.
        getter = getattr(vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = vectorizer.get_feature_names
        return list(getter())

    # Pass the names as a flat list: wrapping them in another list would
    # build a one-level MultiIndex instead of plain column labels.
    tfidf_df = pd.DataFrame(tf_vecs.transpose().toarray(), columns=_feature_names(tfidf)).astype('float32')
    cv_df = pd.DataFrame(cv_vecs.transpose().toarray(), columns=_feature_names(cv)).astype('uint8')
    # DataFrame.info() prints by itself and returns None, so it is not
    # wrapped in print().
    cv_df.info()
    tfidf_df.info()
    print(tfidf_df.sum().sort_values(ascending=False))

In [None]:
def Trunc_SVD(vectorized, n_components=500, iterations=1, normalize=False, random_state=42):
    """
    Performs LSA/LSI on a sparse document term matrix, returns a fitted, transformed, (normalized) LSA object

    Parameters
    ----------
    vectorized : sparse matrix
        Matrix as produced by vector_generation(); it is transposed before
        fitting.
    n_components : int
        Number of SVD components to keep.
    iterations : int
        Passed to TruncatedSVD as `n_iter`.
    normalize : bool
        When True, also return the transformed matrix after passing it
        through `Normalizer`.
    random_state : int
        Seed for the randomized solver.

    Returns
    -------
    TruncatedSVD, or (TruncatedSVD, ndarray) when `normalize` is True.
    """
    # Already own the vectorized data for LSA, just transpose it back to normal
    vecs_lsa = vectorized.T

    # Initialize SVD object as LSA
    lsa = TruncatedSVD(n_components=n_components, n_iter=iterations, algorithm='randomized', random_state=random_state)
    # Fit exactly once. Calling fit() and then fit_transform() would run the
    # decomposition twice for the normalize=True case.
    dtm_lsa_t = lsa.fit_transform(vecs_lsa)
    print("Explained Variance - LSA {}:".format(n_components), lsa.explained_variance_ratio_.sum())
    if normalize:
        dtm_lsa_t = Normalizer(copy=False).fit_transform(dtm_lsa_t)
        return lsa, dtm_lsa_t
    return lsa


def plot_SVD(lsa, title, level=None):
    """
    Draw three diagnostic curves for a fitted LSA object.

    Panels (on a 2x2 grid, fourth slot left empty for the super-title):
    explained variance, explained variance ratio and singular values, each
    truncated to the first `level` entries (all of them when level is None).
    """
    # (subplot position, panel title, attribute of `lsa` to plot)
    panels = (
        (221, 'Explained Variance by each Singular Value', 'explained_variance_'),
        (222, 'Explained Variance Ratio by each Singular Value', 'explained_variance_ratio_'),
        (223, "Singular Values ('Components')", 'singular_values_'),
    )

    plt.figure(num=1, figsize=(15,10))
    plt.suptitle(title, fontsize=22, x=.55, y=.45, horizontalalignment='left')
    for position, heading, attribute in panels:
        plt.subplot(position)
        plt.title(heading)
        plt.plot(getattr(lsa, attribute)[:level])
    plt.show()

In [None]:
# Fit a 500-component (Trunc_SVD default) SVD on the count matrix with 5
# solver iterations, then plot its first 25 components.
cv_dtm_lsa = Trunc_SVD(cv_vecs,iterations=5)
plot_SVD(cv_dtm_lsa, title='Count Vectorizer', level=25)

# Same for the TF-IDF matrix.
tf_dtm_lsa = Trunc_SVD(tf_vecs, iterations=5)
plot_SVD(tf_dtm_lsa, title='Term Frequency - \nInverse Document Frequency', level=25)

In [None]:
# Cumulative explained-variance ratio of the leading k components,
# count vectorizer vs TF-IDF.
print('SVD Value| CV | TFIDF')
# Labels are spelled out (trailing spaces / tabs included) so the printed
# table keeps its column alignment.
variance_rows = (
    ('Top 2:  ', 2), ('Top 3:  ', 3), ('Top 4:  ', 4), ('Top 5:  ', 5),
    ('Top 6:  ', 6), ('Top 7:  ', 7), ('Top 8:  ', 8),
    ('Top 16:\t', 16), ('Top 32:\t', 32), ('Top 64:\t', 64),
    ('Top 128:', 128), ('Top 256:', 256), ('Top 500:', 500),
)
for row_label, top_k in variance_rows:
    print(row_label,
          round(sum(list(cv_dtm_lsa.explained_variance_ratio_[:top_k])),3),
          round(sum(list(tf_dtm_lsa.explained_variance_ratio_[:top_k])),3))

In [None]:
# Close look at the elbow plots
def elbow(dtm_lsa):
    """
    Print and plot how quickly the explained variance ratio falls off.

    Takes the first 20 explained-variance ratios of a fitted SVD object,
    prints them together with the successive differences from the third
    ratio onwards, and plots the negated differences.
    """
    leading_ratios = dtm_lsa.explained_variance_ratio_[:20]
    # The first two ratios are skipped before differencing.
    steps = np.diff(leading_ratios[2:])

    print("Explained Variance Ratio (EVR):\n", leading_ratios)
    print("Difference in EVR (start 3):\n", steps)

    plt.figure()
    plt.plot(-steps)
    tick_positions = range(-1,22)
    tick_labels = range(2,25)
    plt.xticks(tick_positions, tick_labels)
    plt.suptitle('Difference in Explained Variance Ratio', fontsize=15)
    plt.title('Start from 3, moves up to 20')

# Count Vectorizer
elbow(cv_dtm_lsa)


In [None]:
# TF-IDF
elbow(tf_dtm_lsa)

# LSA

In [None]:
# Wrap the count matrix as a gensim corpus and persist it.
cv_corpus = matutils.Sparse2Corpus(cv_vecs)
# The context manager flushes and closes the pickle file.
with open('../data/pretrained/cv_corpus_v4.pkl','wb') as cv_corpus_file:
    pickle.dump(cv_corpus, cv_corpus_file)

In [None]:
# Invert the vectorizer vocabulary (term -> column id) into id -> term,
# then build a gensim Dictionary from the corpus with that mapping.
id2word = {term_id: term for term, term_id in cv.vocabulary_.items()}
id2word = corpora.Dictionary.from_corpus(cv_corpus, id2word = id2word)
# The context manager flushes and closes the pickle file.
with open('../data/pretrained/id2word_v4.pkl','wb') as id2word_file:
    pickle.dump(id2word, id2word_file)

In [None]:
# Wrap the TF-IDF matrix as a gensim corpus and persist it.
tfidf_corpus = matutils.Sparse2Corpus(tf_vecs)
# The context manager flushes and closes the pickle file.
with open('../data/pretrained/tf_corpus.pkl','wb') as tfidf_corpus_file:
    pickle.dump(tfidf_corpus, tfidf_corpus_file)

In [None]:
# Invert the TF-IDF vocabulary (term -> column id) into id -> term,
# then build a gensim Dictionary from the corpus with that mapping.
tf_id2word = {term_id: term for term, term_id in tfidf.vocabulary_.items()}
tf_id2word = corpora.Dictionary.from_corpus(tfidf_corpus, id2word = tf_id2word)
# The context manager flushes and closes the pickle file.
with open('../data/pretrained/tf_id2word.pkl','wb') as tf_id2word_file:
    pickle.dump(tf_id2word, tf_id2word_file)

In [None]:
# Fit a 10-topic gensim LSI model on the count-vectorizer corpus.
lsi = models.LsiModel(corpus = cv_corpus, id2word = id2word, num_topics=10)

In [None]:
# Project the count corpus into LSI topic space, then materialise one
# vector per document into a list.
lsi_corpus = lsi[cv_corpus]
doc_vecs = list(lsi_corpus)

In [None]:
# Print the 10 highest-weighted terms of each topic; range(10) mirrors the
# num_topics=10 passed to LsiModel.
for i in range(10):
    print(lsi.print_topic(i, topn=10))

# Similarity Scoring

In [None]:
# Similarity index over the LSI document vectors.
index = similarities.MatrixSimilarity(doc_vecs)
# Position of the query document within doc_vecs.
docu = 0
# (document position, score) pairs ordered from highest to lowest score.
sims = sorted(enumerate(index[doc_vecs[docu]]),key=lambda item:-item[1])
# Display the 10 highest- and 10 lowest-scoring pairs stacked in one array.
np.r_[sims[:10],sims[-10:]]

In [None]:
# Show the 11 best matches with their cleaned, lemmatised text.
for sim_doc_id, sim_score in sims[:11]:
    print("\nScore:", sim_score)
    # sim_doc_id comes from enumerate(), i.e. it is a row position, so use
    # positional indexing; label lookup is only equivalent while df keeps
    # its default 0..n-1 index.
    print("Document Text:\n", df.text.iloc[sim_doc_id])

In [None]:
# Train a 15-topic LDA model on the count corpus (85 passes, fixed seed,
# perplexity evaluation switched off via eval_every=None).
# NOTE(review): the corpus is wrapped in a tqdm progress bar and is iterated
# once per pass -- confirm the bar behaves sensibly across 85 passes.
# NOTE(review): workers=13 is hard-coded -- confirm it suits the machine this runs on.
lda = models.LdaMulticore(corpus=tq.tqdm(cv_corpus), num_topics=15, id2word=id2word, passes=85, 
                              workers=13, random_state=42, eval_every=None, chunksize=6000)