In [1]:
import os
import timeit
import re
import unicodedata
import nltk
import numpy as np
import pandas as pd
#conda install -c conda-forge spacy
#python -m spacy download el_core_news_sm
import spacy
import el_core_news_lg
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import matplotlib.pyplot as plt

import warnings
import random

warnings.filterwarnings('ignore')

#nltk.download('stopwords')

nlp = el_core_news_lg.load()
tokenizer = ToktokTokenizer()

#Nltk Stopwords
#stopwords = nltk.corpus.stopwords.words('greek')


#Custom Stopwords
stopwords = []
with open('stopwords_gr.txt',mode = "r",encoding='utf-8') as f:
    stopwords = f.read().splitlines()


custom_stopwords = ['λυση','κατερινας','nikou','νικο','νικου','νικος','κατερινα','κατερίνα','ελληνικη','λεξη','ελληνικες','λεξει','katerina','λεξικο', 'λιστα', 'λεξικου','katerina','katerinas', 'ειναι','είναι']

stopwords = stopwords + custom_stopwords

# get bag of words features in sparse format
#cv = CountVectorizer(min_df=0., max_df=20.,ngram_range=(1,2))
#cv = CountVectorizer(ngram_range=(3,3))




In [2]:
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

In [3]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [4]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-zα-ωΑ-Ωάέίόώήύ0-9\s]' if not remove_digits else r'[^a-zA-zα-ωΑ-Ωάέίόώήύ\s]'
    text = re.sub(pattern, '', text)
    return text

In [5]:
def remove_stopwords(text):
    
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopwords]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [6]:
def normalize_mooc(corpus, accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accents(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # remove words with up to 2 letters
        doc = re.sub(r'\b\w{1,2}\b', ' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        
        doc = re.sub(r'\]',' ', doc)
        doc = re.sub(r'\[',' ', doc)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)

        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc)
        
      #  print(doc)
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [7]:
def print_topics(model, vectorizer, n_top_words):
    words = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % (topic_idx+1))
        print(", ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

### 1. Get Only Final Answers

In [20]:


mooc_list = ['SPOC Chat Log.xlsx']
#mooc_list = ['MOOC Chat Log.xlsx']


answers = []
for m_file in mooc_list:
    mooc = pd.read_excel(m_file,
    header=0,
    index_col=False,
    keep_default_na=True
    )
    
    for index, row in mooc.iterrows():      
        if(row['message_type'] == 0 and int(row['message_length']) > 0):
            answers.append(row['the_message'])  


print("Final Answers found: ",len(answers))
random.shuffle(answers)

Final Answers found:  106


### 2. Normilize Corpus

#### Pre-processing Steps

1. Remove Accents
2. Turn all to lower case
3. Lemmatize text
4. Remove special characters and empty lines
5. Remove numbers
6. Remove stopwords

In [9]:
normalized = normalize_mooc(answers)
#print(normalized)

## LDA & NMF

Firstly we need to generate a document-term matrix with a bag-of-word model.
We use CountVectorizer for LDA and TfidfVectorizer for NMF.

In [10]:
#based on https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html

from time import time

t0 = time()
n_samples = 2000
n_features = 1000
n_components_a = 5
n_top_words = 20
n_iterations = 800

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   #max_features=n_features,
                                   ngram_range=(2, 2)
                                )
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(normalized)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(min_df=2, max_df=0.95,ngram_range=(2, 2))
t0 = time()
tf = tf_vectorizer.fit_transform(normalized)
print("done in %0.3fs." % (time() - t0))
print()


# Fit the LDA model
print('\n' * 2, "Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components_a, max_iter=n_iterations,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0,
                               n_jobs = -1)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

#tf_feature_names = tf_vectorizer.get_feature_names()
#plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')
print_topics(lda, tf_vectorizer, 5)

##### ####

# Fit the NMF model
print('\n' * 2, "Fitting the NMF model (generalized Kullback-Leibler "
      "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components_a, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=n_iterations, alpha=.01,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

#tfidf_feature_names = tfidf_vectorizer.get_feature_names()
#plot_top_words(nmf, tfidf_feature_names, n_top_words, 'Topics in NMF model (generalized Kullback-Leibler divergence)')
print_topics(nmf, tfidf_vectorizer, 5)



Extracting tf-idf features for NMF...
done in 0.026s.
Extracting tf features for LDA...
done in 0.007s.



 Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 10.307s.

Topic #1:
μεταφρασει αγγλικα, αγγλικων λεξεο, εκτελεση προγραμματο, αγγλικα πλεονεκτημο, ταχυτηα εκτελεση

Topic #2:
χρονο εκτελεση, χρηση λεξικων, αντιστοιχη τιμη, τιμη κλειδιο, αυξανομαι πληθο

Topic #3:
κλειδι τιμες, τιμη αγγλικη, χρονο αναζητηση, αντιστοιχες αγγλικα, αντιστοιχες αγγλικες

Topic #4:
αντιστοιχη αγγλικη, χωρο μνημη, αντιστοιχες αγγλικες, χρονο εκτελεση, τιμες αντιστοιχες

Topic #5:
χρονο χρειαζομαι, κλειδια τιμες, χρειαζομαι υπολογιστης, περισσοτερος χρονο, τιμη κλειδιο


 Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.185s.

Topic #1:
αντιστοιχες αγγλικες, χρονο εκτελεση, τιμες αντιστοιχες, αντιστοιχες αγγλικα, υπαρχω κλειδι

Topic #2:
χωρο μνημη, λιγοτερος χρονο, μνημη υπολογιστη, χρ

In [11]:
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()


# SeaNMF

## Εκτέλεση SeaNMF

GitHub Repository https://github.com/tshi04/SeaNMF

Μόλις δημιουργήσουμε το αρχείο με τα προ-επεξεργασμένα έγγραφα, θα χρησιμοποιήσουμε αυτές τις εντολές στη γραμμή εντολών (CMD)
1. python3 data_process.py
2. python3 train.py --n_topics 10
3. python3 vis_topic.py

In [12]:
#SeaNMF bigram
docs = []
for i in normalized:
    bigrams = nltk.ngrams(i.split(), 2)
    w = []
    for grams in bigrams:
        string_bigram = '_'.join(grams)
        w.append(string_bigram)
    doc = ' '.join(w)
    docs.append(doc)
#print(docs)

In [13]:
with open('normalized_documents.txt', 'w', encoding='utf-8') as f:
    for line in docs:
        print(line, file=f)
        

# GSDMM

GitHub Repository: https://github.com/rwalk/gsdmm

In [14]:
# Uses the bigram docs generated in seanmf
bigrams = []
for doc in docs:
    listt = doc.split(" ")
    bigrams.append(listt)
#print(bigrams)

In [16]:

from gsdmm import MovieGroupProcess

vocab = set(x for doc in bigrams for x in doc)
#print(vocab)
n_terms = len(vocab)
print("Voc size:", n_terms)
print("Number of documents:", len(bigrams))

mgp = MovieGroupProcess(K=5, alpha=0.1, beta=0.1, n_iters=100)

vocab = set(x for doc in bigrams for x in doc)
n_terms = len(vocab)
n_docs = len(bigrams)

# Fit the model on the data given the chosen seeds
y = mgp.fit(bigrams, n_terms)

Voc size: 3014
Number of documents: 106
In stage 0: transferred 69 clusters with 5 clusters populated
In stage 1: transferred 27 clusters with 5 clusters populated
In stage 2: transferred 23 clusters with 5 clusters populated
In stage 3: transferred 27 clusters with 5 clusters populated
In stage 4: transferred 25 clusters with 5 clusters populated
In stage 5: transferred 18 clusters with 5 clusters populated
In stage 6: transferred 17 clusters with 5 clusters populated
In stage 7: transferred 15 clusters with 5 clusters populated
In stage 8: transferred 14 clusters with 5 clusters populated
In stage 9: transferred 20 clusters with 5 clusters populated
In stage 10: transferred 14 clusters with 5 clusters populated
In stage 11: transferred 19 clusters with 5 clusters populated
In stage 12: transferred 30 clusters with 5 clusters populated
In stage 13: transferred 22 clusters with 5 clusters populated
In stage 14: transferred 20 clusters with 5 clusters populated
In stage 15: transferred 

In [18]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Topic %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — — ')
        print()


In [19]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topics :', doc_count)
print('*'*20)

# Topics sorted by document inside
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)


# Show the top 5 words by cluster, it helps to make the topic_dict below
top_words(mgp.cluster_word_distribution, top_index, 5)

Number of documents per topics : [24 19 27 17 19]
********************
Most important clusters (by number of docs inside): [2 0 4 1 3]
********************
Topic 2 : [('αντιστοιχη_αγγλικη', 5), ('χρονο_χρειαζομαι', 4), ('εκτελεση_προγραμματο', 3), ('κλειδια_τιμες', 3), ('τιμες_αντιστοιχη', 3)]
 — — — — — — — — — 

Topic 0 : [('αντιστοιχες_αγγλικες', 8), ('αντιστοιχη_αγγλικη', 5), ('συνδυασμο_λιστας', 5), ('τιμες_αντιστοιχες', 5), ('κλειδιος_τιμες', 4)]
 — — — — — — — — — 

Topic 4 : [('αντιστοιχη_τιμη', 4), ('αγγλικα_πλεονεκτημο', 3), ('αγγλικες_τιμες', 2), ('ελληνικος_λεξεο', 2), ('βρισκω_αντιστοιχη', 2)]
 — — — — — — — — — 

Topic 1 : [('χωρο_μνημη', 9), ('λιγοτερος_χρονο', 6), ('χρονο_εκτελεση', 4), ('μνημη_υπολογιστη', 3), ('λιγοτερος_χωρο', 3)]
 — — — — — — — — — 

Topic 3 : [('χρονο_εντοπισμο', 5), ('χρονο_προσπελαση', 5), ('αυξανομαι_χρονο', 5), ('αντιστοιχες_αγγλικες', 4), ('χρηση_λιστας', 4)]
 — — — — — — — — — 

