In [None]:
import string
import numpy as np
import pandas as pd
import pyLDAvis.gensim
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases, TfidfModel
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from scipy.spatial import distance
from gensim.test.utils import datapath
from gensim.models import hdpmodel

In [None]:
# Number of days in each month, keyed by month number (1 = January).
# February is listed with 29 days.
Month_days = dict(zip(range(1, 13),
                      (31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)))

In [None]:
def get_articles(start_month = 1, end_month = None, date_specified = False, start_date = 1, end_date = 31):
    """Load article texts from the per-day CSV exports of the three sources.

    For every month in ``start_month..end_month`` and every selected day, three
    files are read relative to the current working directory:
    ``<month>_<day>_P.csv`` (column ``Article``), ``<month>_<day>_TH.csv`` and
    ``<month>_<day>_TOI.csv`` (column ``Content`` for both).

    Args:
        start_month: First month number to load (inclusive).
        end_month: Last month number to load (inclusive). ``None`` (the
            default) loads only ``start_month``.
        date_specified: When False, each month is read from ``start_date``
            through its last day according to ``Month_days`` and ``end_date``
            is ignored. When True, the same ``start_date..end_date`` range is
            read in every month.
        start_date: First day of the month to load (inclusive).
        end_date: Last day of the month to load (inclusive); only used when
            ``date_specified`` is True.

    Returns:
        A flat list of the raw cell values, ordered by month, day and then
        source (P, TH, TOI). Missing cells are kept as NaN.

    Raises:
        FileNotFoundError: If any expected CSV file is missing.
        KeyError: If a file lacks the expected column.
    """
    if end_month is None:
        # The default used to crash on ``None + 1``; treat it as a single month.
        end_month = start_month

    articles = []
    for month in range(start_month, end_month + 1):
        # Use a local so the caller-supplied end_date is never overwritten.
        last_day = end_date if date_specified else Month_days[month]
        for day in range(start_date, last_day + 1):
            prefix = str(month) + "_" + str(day)
            dataP = pd.read_csv(prefix + "_P.csv")
            dataTH = pd.read_csv(prefix + "_TH.csv")
            dataTOI = pd.read_csv(prefix + "_TOI.csv")
            articles.extend(dataP['Article'].tolist())
            articles.extend(dataTH['Content'].tolist())
            articles.extend(dataTOI['Content'].tolist())
    return articles

In [None]:
def standardize_data(articles):
    """Lower-case each article, strip punctuation and collapse whitespace.

    Entries whose type is not exactly ``str`` (for example NaN from a missing
    CSV cell) are dropped, so the result may be shorter than the input.

    Args:
        articles: iterable of raw article values.

    Returns:
        list of cleaned strings, in input order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for text in articles:
        if type(text) is not str:
            continue
        without_punctuation = text.lower().translate(strip_punctuation)
        # split()/join collapses any run of whitespace into one space and trims the ends.
        cleaned.append(" ".join(without_punctuation.split()))
    return cleaned

In [None]:
def word_preprocessing(articles):
    """Tokenise articles and reduce each one to a list of cleaned lemmas.

    Per article: tokenise with ``word_tokenize``, drop English stop words,
    drop purely numeric tokens, lemmatise with ``WordNetLemmatizer``, then
    drop lemmas of length one or less.

    Args:
        articles: iterable of article strings.

    Returns:
        list of token lists, one per input article, in input order.
    """
    tokenized = [word_tokenize(text) for text in articles]
    english_stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    def survives_filters(token):
        # Stop words and numeric tokens are removed before lemmatising.
        return token not in english_stop_words and not token.isnumeric()

    processed = []
    for tokens in tokenized:
        lemmas = (lemmatizer.lemmatize(token) for token in tokens if survives_filters(token))
        # The length filter applies to the lemma, not the original token.
        processed.append([lemma for lemma in lemmas if len(lemma) > 1])
    return processed

In [None]:
# Load months 1 through 3, normalise the text, then tokenise and lemmatise it.
articles = standardize_data(get_articles(start_month=1, end_month=3))
words = word_preprocessing(articles)

In [None]:
# Number of token lists (one per article) produced by word_preprocessing.
len(words)

In [None]:
def LDA_preprocessing(words, min_count=5, threshold=100, no_below=5000, no_above=0.3, keep_n=100000):
    """Build the dictionary and TF-IDF corpus used to train the topic models.

    Detected bigrams (tokens containing ``_`` after applying the ``Phrases``
    model) are appended to a copy of each document. The caller's token lists
    are left untouched, so calling this twice on the same input gives the same
    result instead of appending the bigrams a second time.

    Args:
        words: list of token lists, one per document.
        min_count: ``min_count`` passed to ``Phrases``.
        threshold: ``threshold`` passed to ``Phrases``.
        no_below: ``no_below`` passed to ``Dictionary.filter_extremes``.
        no_above: ``no_above`` passed to ``Dictionary.filter_extremes``.
        keep_n: ``keep_n`` passed to ``Dictionary.filter_extremes``.

    Returns:
        Tuple ``(dictionary, corpus_tfidf, words_with_bigrams)``: the filtered
        ``Dictionary``, the TF-IDF transformed bag-of-words corpus, and the
        new token lists including the appended bigrams.
    """
    bigram = Phrases(words, min_count=min_count, threshold=threshold)

    words_with_bigrams = []
    for doc in words:
        detected = [token for token in bigram[doc] if '_' in token]
        words_with_bigrams.append(list(doc) + detected)

    dictionary = Dictionary(words_with_bigrams)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    corpus = [dictionary.doc2bow(doc) for doc in words_with_bigrams]
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return dictionary, corpus_tfidf, words_with_bigrams

In [None]:
# Build the dictionary and TF-IDF corpus for topic modelling. `words` is rebound
# to the token lists returned by LDA_preprocessing, which include detected bigrams.
# Note: the plain bag-of-words corpus is not returned by LDA_preprocessing.
dictionary, corpus_tfidf, words = LDA_preprocessing(words)

In [None]:
# Hyperparameters forwarded to the LdaModel constructor in the training cell.
# Number of topics requested from the model.
num_topics = 6
# Passed as LdaModel's `chunksize`.
chunksize = 5000
# Passed as LdaModel's `passes`.
passes = 20
# Passed as LdaModel's `iterations`.
iterations = 400
# Passed as LdaModel's `eval_every`; None presumably disables periodic
# perplexity evaluation — confirm against the gensim documentation.
eval_every = None

In [None]:
# Train LDA on the TF-IDF corpus. A fixed random_state makes the learned topics
# reproducible across kernel restarts; without it every run gives different topics.
model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, chunksize=chunksize,
                 alpha='auto', eta='auto', iterations=iterations, num_topics=num_topics,
                 passes=passes, eval_every=eval_every, random_state=42)

In [None]:
# Persist the trained LDA model and load it back as `model2`.
# NOTE(review): datapath is imported from gensim.test.utils, so this path
# presumably resolves inside gensim's bundled test-data folder — confirm that
# is the intended save location.
temp_file = datapath("model2")
model.save((temp_file))

# Reload the saved model under a separate name.
model2 = LdaModel.load(temp_file)

In [None]:
# Prepare the pyLDAvis visualisation of the trained model and render it inline.
# sort_topics=False presumably keeps gensim's topic order — confirm against the pyLDAvis docs.
lda_display = pyLDAvis.gensim.prepare(model, corpus_tfidf, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
# Apply the trained model to the whole TF-IDF corpus to get the topic mixture
# of every document; per-word topic details are not requested.
all_topics = model.get_document_topics(corpus_tfidf, per_word_topics=False)

In [None]:
# Show the topic mixture of the first few documents (at most 10).
for i in range(min(10, len(all_topics))):
    print('New Document \n')
    print('Document topics:', all_topics[i])
    print(" ")
    print('-------------- \n')

# Jensen-Shannon distance between the topic mixtures of the first two documents.
# The sparse (topic_id, probability) pairs are expanded into dense vectors first;
# topics that the model omitted for a document stay at 0.
topic_dists = np.zeros((2, model.num_topics))
for row, doc_index in enumerate((0, 1)):
    for topic_id, probability in all_topics[doc_index]:
        topic_dists[row, topic_id] = probability

distance.jensenshannon(topic_dists[0], topic_dists[1])

In [None]:
# Show where the LDA model was saved.
print(temp_file)
# To restore the saved model later, call LdaModel.load(temp_file).

In [None]:
from gensim.models import hdpmodel
# Fit an HDP model on the same TF-IDF corpus. The vocabulary mapping is the
# `dictionary` built by LDA_preprocessing (the name `id2word` was never defined).
hdp_model = hdpmodel.HdpModel(corpus=corpus_tfidf, id2word=dictionary)
hdp_model.print_topics(num_topics=-1)

# For every topic, sum the weights of the words returned by show_topics.
shown_topics = hdp_model.show_topics(num_topics=hdp_model.m_T, formatted=False)
topics_nos = [topic_id for topic_id, _ in shown_topics]
weights = [sum(weight for _, weight in topic_words) for _, topic_words in shown_topics]
ll = pd.DataFrame({'topic_id': topics_nos, 'weight': weights})

# Iterate over the rows that exist instead of assuming exactly 150 topics.
for weight in ll['weight']:
    print(weight)
    print('\n')

In [None]:
# Dense topic-probability vector for every training document.
# The model was trained on `corpus_tfidf`, so the same representation is used
# here (the bare name `corpus` only exists inside LDA_preprocessing).
train_vecs = []
for bow in corpus_tfidf:
    # Map topic id -> probability so the vector stays aligned by topic id even
    # if gensim leaves out a topic whose probability is effectively zero.
    topic_probs = dict(model.get_document_topics(bow, minimum_probability=0.0))
    topic_vec = [topic_probs.get(topic_id, 0.0) for topic_id in range(model.num_topics)]
    train_vecs.append(topic_vec)

In [None]:
# Inspect the topic vector of the first training document.
print(train_vecs[0])

In [None]:
# Held-out articles: month 4, day 1 only.
test_articles = get_articles(4,4,date_specified = True, start_date = 1, end_date = 1)

test_articles = standardize_data(test_articles)
test_words = word_preprocessing(test_articles)

# NOTE(review): unlike the training documents, these token lists get no bigram
# tokens and the bag-of-words is not TF-IDF weighted — confirm this is intended.
testcorpus = [dictionary.doc2bow(doc) for doc in test_words]

# Dense topic-probability vector for every test document. Iterating the corpus
# directly replaces the loop over the undefined name `test`.
test_vecs = []
for bow in testcorpus:
    # Map topic id -> probability so the vector stays aligned by topic id even
    # if gensim leaves out a topic whose probability is effectively zero.
    topic_probs = dict(model.get_document_topics(bow, minimum_probability=0.0))
    topic_vec = [topic_probs.get(topic_id, 0.0) for topic_id in range(model.num_topics)]
    test_vecs.append(topic_vec)

In [None]:
# One empty bucket per topic; matched_topic[k] is the same list object as topic<k>.
topic0, topic1, topic2, topic3, topic4, topic5 = ([] for _ in range(6))
matched_topic = [topic0, topic1, topic2, topic3, topic4, topic5]


In [None]:
# Put every test article into the bucket of its most probable topic
# (the lowest topic index wins on ties).
for article_idx, vec in enumerate(test_vecs):
    # Running maximum seeded with -1, exactly like folding max() over the vector.
    mx = max([-1, *vec])
    winner = next((topic_idx for topic_idx, prob in enumerate(vec) if prob == mx), None)
    if winner is not None:
        matched_topic[winner].append(test_articles[article_idx])

In [None]:
# Print the first test article assigned to topic 0.
# NOTE(review): raises IndexError when no test article was assigned to topic 0.
print((matched_topic[0][0]))

import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd


def convert_tag(tag):
    """Map an nltk.pos_tag tag to the POS letter expected by wordnet.synsets.

    Only the first character of ``tag`` is inspected: N -> 'n', J -> 'a',
    R -> 'r', V -> 'v'. Any other tag yields None.
    """
    wordnet_pos_by_initial = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    return wordnet_pos_by_initial.get(tag[0])


def doc_to_synsets(doc):
    """
    Convert a document into a list of WordNet synsets.

    The document is tokenised and POS-tagged with nltk; each tag is mapped to
    a WordNet POS via convert_tag, and the first synset found for the
    word/POS pair is kept. Words with no synset for that pair are skipped.

    Args:
        doc: string to be converted

    Returns:
        list of synsets, in token order

    Example:
        doc_to_synsets('Fish are nvqjp friends.')
        Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(doc))

    synsets = []
    for word, treebank_tag in tagged_tokens:
        candidates = wn.synsets(word, convert_tag(treebank_tag))
        # Keep only the first (most common) sense; skip words WordNet does not know.
        if len(candidates) > 0:
            synsets.append(candidates[0])

    return synsets


def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2.

    For each synset in s1, find the largest path similarity to any synset in
    s2. Synsets in s1 with no comparable synset in s2 (every path_similarity
    is None, or s2 is empty) are skipped. The result is the mean of the
    largest values that were found.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2, or 0.0 when no synset in
        s1 could be compared with any synset in s2 (including empty inputs)

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    best_scores = []
    for a in s1:
        # Compute each similarity once and drop the pairs WordNet cannot compare.
        scores = [score
                  for score in (candidate.path_similarity(a) for candidate in s2)
                  if score is not None]
        # Skip synsets without any match rather than calling max() on an empty list.
        if scores:
            best_scores.append(max(scores))

    if not best_scores:
        # Nothing comparable: report zero similarity instead of dividing by zero.
        return 0.0

    return sum(best_scores) / len(best_scores)


def document_path_similarity(doc1, doc2):
    """Return the symmetric path similarity of two documents.

    Both documents are converted to synsets with doc_to_synsets; the result is
    the average of similarity_score in each direction.
    """
    first_synsets, second_synsets = doc_to_synsets(doc1), doc_to_synsets(doc2)
    forward = similarity_score(first_synsets, second_synsets)
    backward = similarity_score(second_synsets, first_synsets)
    return (forward + backward) / 2

# Pairwise WordNet path similarity between the test articles assigned to topic 0.
# matched_topic[0] already holds the article texts, so they are used directly
# rather than as indices into test_articles.
topic_docs = matched_topic[0]
t = np.zeros((len(topic_docs), len(topic_docs)))

for i in range(len(topic_docs)):
    doc1 = topic_docs[i]
    for j in range(i, len(topic_docs)):
        doc2 = topic_docs[j]
        if i == j:
            # The diagonal is marked with -1 instead of computing self-similarity.
            t[i][j] = -1
            continue
        # The score is symmetric, so compute the upper triangle and mirror it.
        t[i][j] = document_path_similarity(doc1, doc2)
        t[j][i] = t[i][j]
        # Separate the printed scores so they do not run together.
        print(t[i][j], end=' ')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Create the TF-IDF document-term matrix for the test articles assigned to topic 0.
# Only the vectorizer that actually took effect (the last assignment, without
# stop-word removal) is kept; pass stop_words='english' to filter stop words.
count_vectorizer = TfidfVectorizer()
sparse_matrix = count_vectorizer.fit_transform(matched_topic[0])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of the TF-IDF matrix
# (omitting the second argument compares the matrix with itself).
a = cosine_similarity(sparse_matrix)

In [None]:
# Cosine similarities between the document at row 100 and every document.
# NOTE(review): raises IndexError if the matrix has fewer than 101 rows.
a[100]