In [4]:
# Import packages 
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
import re
import pickle
import collections

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords 

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

from sklearn.metrics.pairwise import cosine_similarity
from sklearn import cluster
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import gc

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [5]:
# Make sure the NLTK stop-word corpus is available, then keep the Dutch
# stop-word list in `sw`; cosine_dist filters its tokens against this list.
nltk.download("stopwords")
sw = stopwords.words("dutch")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Beaudine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Load the survey export.
# The cells below expect the data of a single survey with (at least) the columns
# SurveyID, QuestionText (the question text) and AnswerText (the open answers).
df = pd.read_excel("data.xlsx")

In [10]:
# For illustration: show which survey ids occur in the data, then keep only
# the rows that belong to survey 1.
print(list(set(df["SurveyID"])))
df = df[df["SurveyID"].eq(1)]

[1]


In [13]:
# Keep only the three columns used further down: SurveyID, AnswerText, QuestionText
df = df.loc[:, ["SurveyID", "AnswerText", "QuestionText"]]

### Load models 

In [1]:
#-------------------------------------------- LOAD MODELS------------------------------------------------------------------
# NOTE(security): pickle.load can execute arbitrary code contained in the file
# it reads. Only load model files that come from a trusted source.

# Load the Word2Vec embeddings and initialize the vocabulary. Every key of
# key_to_index is reduced to its first whitespace-separated token.
model = Word2Vec.load('embeddings.bin')
vocab = [line.strip().split()[0] for line in model.wv.key_to_index]

# Load the embedding dictionary (looked up per word when sentence vectors are
# built). The context manager closes the file even when unpickling fails.
with open("embedding_dict.pkl", "rb") as f:
    embedding_dict = pickle.load(f)

# Load the TF-IDF model
with open('tfidf.mod', 'rb') as f:
    tfidf_model = pickle.load(f)

# Recent scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); use whichever accessor the loaded model offers.
# list(...) keeps feature_names a plain list in both cases.
if hasattr(tfidf_model, "get_feature_names_out"):
    feature_names = list(tfidf_model.get_feature_names_out())
else:
    feature_names = tfidf_model.get_feature_names()


### Define functions

In [8]:
def get_tfidf_for_words(text):
    """Return a dict mapping each term of ``text`` with a non-zero TF-IDF weight to that weight.

    ``text`` is transformed with the module-level ``tfidf_model``; the term
    names are looked up in the module-level ``feature_names``.
    """
    dense_row = tfidf_model.transform([text]).todense()
    # Column indices of the entries in the (single) row that are non-zero
    nonzero_columns = dense_row[0, :].nonzero()[1]
    return {feature_names[col]: dense_row[0, col] for col in nonzero_columns}


In [9]:
def clean_text(
    string: str,
    punctuations=r'''!()-[]{};:'"\,<>=`./?@#$%+^&*_~''',
) -> str:
    """
    Normalise a piece of text for further processing.

    Steps, in order: drop every character listed in ``punctuations``, drop all
    digits, lower-case the text and collapse runs of whitespace into a single
    space (leading/trailing whitespace is removed). URLs, HTML tags and stop
    words are not treated specially.
    """
    # Punctuation characters that occur in the (lower-cased) input
    to_strip = {char for char in string.lower() if char in punctuations}
    without_punctuation = "".join(char for char in string if char not in to_strip)

    # Numbers are removed entirely, not replaced by a space
    without_digits = re.sub('[0-9]+', '', without_punctuation)

    # Lower-case, then squeeze whitespace
    return re.sub(r'\s+', ' ', without_digits.lower()).strip()

In [10]:

def cosine_dist(sentence1, sentence2):
    """Cosine similarity between the sets of non-stop-word tokens of two sentences.

    Both sentences are tokenized with ``word_tokenize`` and tokens found in the
    module-level stop-word list ``sw`` are discarded. Each sentence is treated
    as a binary bag of words, so the result is
    ``|A & B| / sqrt(|A| * |B|)``. Despite the function name this is a
    similarity: identical token sets give 1.0, disjoint ones 0.0.

    Returns the integer 0 when either sentence has no tokens left after
    stop-word removal.
    """
    tokens_a = {token for token in word_tokenize(sentence1) if token not in sw}
    tokens_b = {token for token in word_tokenize(sentence2) if token not in sw}

    # Nothing to compare when one side is empty
    if not tokens_a or not tokens_b:
        return 0

    # For binary vectors the dot product is the number of shared tokens and
    # each squared norm is the number of tokens in that set.
    shared = len(tokens_a & tokens_b)
    return shared / float((len(tokens_a) * len(tokens_b)) ** 0.5)

In [11]:
# First words that make create_summary also include the preceding sentence.
# ("ook" is listed twice; this is harmless for membership tests.)
connector_list = [
    "en", "zo", "zij", "hij", "zowel", "zoals", "waarmee", "zodat",
    "ook", "vervolgens", "maar", "echter", "toch", "daarentegen", "want", "omdat",
    "daarnaast", "verder", "daarom", "ook", "mits", "tenzij",
    "waardoor", "daardoor", "zodoende", "dus", "vandaar", "daarmee",
]

# Last words that make create_summary glue the following sentence onto this one.
false_ending_list = [
    "bijv",
    "bv",
    "etc",
    "mi",
]

### Create summary algorithm 

In [13]:
def _average_embedding(keywords, known_words, dim=100):
    """Average the word vectors of ``keywords`` into one sentence vector.

    Only keywords contained in ``known_words`` contribute to the sum, but the
    sum is divided by the number of all keywords (plus 0.001). A sentence
    without any known keyword gets a zero vector of length ``dim``.
    """
    found = [embedding_dict.get(word, np.zeros((dim,))) for word in keywords if word in known_words]
    if not found:
        return np.zeros((dim,))
    return sum(found) / (len(keywords) + 0.001)


def _cluster_sentences(sentence_vectors):
    """Project the sentence vectors to 2-D with PCA and cluster them with KMeans.

    Returns a list with one integer cluster label per row of ``sentence_vectors``.
    """
    n_sentences = len(sentence_vectors)

    # More sentences -> more clusters
    if n_sentences > 500:
        k = 10
    elif n_sentences > 100:
        k = 8
    else:
        k = 6
    # KMeans cannot form more clusters than there are samples.
    k = min(k, n_sentences)

    if n_sentences >= 2:
        reduced = PCA(2).fit_transform(sentence_vectors)
    else:
        # A 2-component PCA needs at least two samples; a single sentence
        # simply becomes its own cluster.
        reduced = np.zeros((n_sentences, 2))

    kmeans = KMeans(n_clusters=k, random_state=0)
    return [int(label) for label in kmeans.fit_predict(reduced)]


def create_summary(selection, summary_df, min_cluster_percentage=0.0):
    """Build a textual summary of the open answers given to one question.

    The answers are split into sentences, every sentence is embedded (average
    of the word vectors of its high TF-IDF words), the embeddings are clustered,
    and for each cluster the best-scoring sentence is quoted. The score of a
    sentence is the sum of its min-max normalised TF-IDF total, word count and
    summed cosine similarity to the sentences of its own cluster.

    Parameters
    ----------
    selection : value compared against the ``QuestionText`` column to pick the
        answers to summarise.
    summary_df : DataFrame with the columns ``QuestionText`` and ``AnswerText``.
        When ``None`` the module-level ``df`` is used.
    min_cluster_percentage : clusters holding a smaller percentage of the
        sentences than this are left out of the summary. The default 0.0 keeps
        every cluster.

    Returns
    -------
    str : the summary, one line per element, every line terminated by a newline.
        Clusters are listed from largest to smallest.

    Raises
    ------
    ValueError : when no usable answer sentence exists for ``selection``.
    """
    # Read from the frame handed in by the caller; fall back to the global
    # dataframe only when none is given.
    source = df if summary_df is None else summary_df
    # Missing answers are dropped so that they do not turn into the literal
    # sentence "nan".
    answers = source.loc[source['QuestionText'] == selection, 'AnswerText'].dropna()
    question = str(selection)

    # Three parallel lists: the sentence as written, its cleaned form and the
    # running number of the answer it came from.
    sentences = []
    sentences_cleaned = []
    answer_ids = []
    for answer_id, answer in enumerate(tqdm(answers)):
        for sentence in sent_tokenize(str(answer)):
            cleaned = clean_text(sentence)
            # Sentences that are empty after cleaning carry no content and
            # cannot be quoted in a meaningful way.
            if cleaned:
                sentences.append(sentence)
                sentences_cleaned.append(cleaned)
                answer_ids.append(answer_id)

    if not sentences:
        raise ValueError("No usable answer sentences found for question: " + repr(selection))

    # Sets make the per-word membership tests constant time.
    known_words = set(vocab)
    tfidf_terms = set(feature_names)

    tfidf_totals = []
    vectors = []
    for cleaned in tqdm(sentences_cleaned):
        weights = get_tfidf_for_words(cleaned)
        tfidf_totals.append(sum(weights.values()))
        # Keep only the informative words. .get() is used because the TF-IDF
        # model may tokenize differently from str.split().
        keywords = [word for word in cleaned.split()
                    if word in tfidf_terms and weights.get(word, 0) > 0.3]
        vectors.append(_average_embedding(keywords, known_words))

    # Row i of every structure below refers to sentences[i].
    labels = _cluster_sentences(np.vstack(vectors))

    members = collections.defaultdict(list)
    for label, cleaned in zip(labels, sentences_cleaned):
        members[label].append(cleaned)

    scored = pd.DataFrame({
        'label': labels,
        'tfidf': tfidf_totals,
        'length': [len(cleaned.split()) for cleaned in sentences_cleaned],
        # Summed similarity of a sentence to every sentence of its own cluster
        'cosine': [
            sum(cosine_dist(cleaned, other) for other in members[label])
            for label, cleaned in tqdm(zip(labels, sentences_cleaned), total=len(labels))
        ],
    })

    # Min-max normalise the three features. A constant column becomes NaN,
    # which the row sum skips, so such a column contributes nothing.
    features = scored[['tfidf', 'length', 'cosine']]
    normalized = (features - features.min()) / (features.max() - features.min())
    scored['score'] = normalized.sum(axis=1)

    # Row of the best-scoring sentence per cluster. (A plain groupby().max()
    # would take the maximum of every column separately and therefore return
    # the alphabetically last sentence, not the best-scoring one.)
    best_rows = scored.groupby('label')['score'].idxmax()

    cluster_sizes = collections.Counter(labels)
    total_count = len(labels)

    # Largest cluster first; clusters of equal size are all kept.
    ranked = sorted(
        ((int(label), int(row)) for label, row in best_rows.items()),
        key=lambda pair: cluster_sizes[pair[0]],
        reverse=True,
    )

    summary = []
    for label, row in ranked:
        percentage = round((cluster_sizes[label] * 100) / total_count, 1)
        if percentage < min_cluster_percentage:
            continue

        if not summary:
            intro = "To the question: \"" + question + "\" the following grouped answers can be found."
            to_add = "The largest amount of respondents, " + str(percentage) + "%, say the following:"
            summary.append(intro)
            summary.append(to_add)
        elif percentage >= 5:
            to_add = str(percentage) + "% of the answers say the following:"
            summary.append(to_add)
        else:
            summary.append("Less than 5% of the answers concern the following: ")

        original_sentence = sentences[row]
        words = sentences_cleaned[row].split()
        # Neighbouring sentences are only used when they exist and belong to
        # the same answer.
        has_previous = row > 0 and answer_ids[row - 1] == answer_ids[row]
        has_next = row + 1 < total_count and answer_ids[row + 1] == answer_ids[row]

        if words[0] in connector_list:
            # The sentence builds on the previous one, so quote that one as well.
            if has_previous:
                summary.append(sentences[row - 1])
            summary.append(original_sentence)
        elif words[-1] in false_ending_list:
            # The sentence was cut off after an abbreviation; glue the rest back on.
            if has_next:
                summary.append(original_sentence + " " + sentences[row + 1])
            else:
                summary.append(original_sentence)
        else:
            summary.append(original_sentence)

    return "".join(line + "\n" for line in summary)
            

In [4]:
# NOTE(review): 'text' is presumably a placeholder for the exact QuestionText
# value to summarise — confirm and replace as needed.
summary_text = create_summary('text', df)
print(summary_text)