# The effect of preprocessing on short document clustering

In [None]:
import pandas, numpy, textblob, string
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, decomposition, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from gensim.models import word2vec, KeyedVectors
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.neighbors import KDTree

## Load data

In [None]:
def Amazon_load_data():
    """Load the Amazon review corpus from the file ``corpus`` into a DataFrame.

    Every non-blank line is split on whitespace: the first token is the label
    and the remaining tokens, re-joined with single spaces, are the document.

    Returns:
        pandas.DataFrame with the columns ``text`` and ``label`` (in that
        order). ``__label__1`` is mapped to 0 and ``__label__2`` to 1; any
        other label token is kept unchanged.
    """
    label_map = {'__label__1': 0, '__label__2': 1}
    labels, texts = [], []
    # The context manager closes the file handle, which was previously leaked.
    with open('corpus', encoding="utf8") as corpus_file:
        for line in corpus_file:
            content = line.split()
            if not content:
                # Blank lines (e.g. the trailing newline at the end of the
                # file) have no label token and used to raise an IndexError.
                continue
            labels.append(label_map.get(content[0], content[0]))
            texts.append(' '.join(content[1:]))

    # create a dataframe using texts and labels
    return pandas.DataFrame({'text': texts, 'label': labels})

In [None]:
def Yelp_load_data():
    """Read ``yelp.csv`` and return it without the ``Unnamed: 0`` column."""
    frame = pandas.read_csv('yelp.csv')
    return frame.drop(columns=['Unnamed: 0'])

In [None]:
def Dbpedia_load_data():
    """Read ``dbpedia.csv`` and return it without the ``Unnamed: 0`` column."""
    frame = pandas.read_csv('dbpedia.csv')
    return frame.drop(columns=['Unnamed: 0'])

### Quick info on datasets

In [None]:
# Amazon dataset: load it, then report its shape and the mean document length
# (length = number of whitespace-separated tokens per document).
df = Amazon_load_data()
print("Amazon data:", "Shape:  " + str(df.shape), sep="\n")
count = df['text'].str.split().str.len()
print("Average number of words in documents", count.mean())

In [None]:
# Display the last rows of the dataset loaded into df as a quick sanity check.
df.tail()

In [None]:
# Vocabulary size: the number of distinct lowercased tokens over all documents.
# split() without an argument is used (as for the document lengths) so that
# runs of whitespace do not add an empty-string "word" to the vocabulary.
uniqueWords = list(set(" ".join(df['text']).lower().split()))
uniqueWords_count = len(uniqueWords)
uniqueWords_count

In [None]:
# NOTE(review): scratch calculation — presumably (number of documents) x
# (average words per document); confirm and replace the literals with names.
5000 * 13.7794

In [None]:
# Yelp dataset: load it, then report its shape and the mean document length
# (length = number of whitespace-separated tokens per document).
df = Yelp_load_data()
print("Yelp data:", "Shape:  " + str(df.shape), sep="\n")
count = df['text'].str.split().str.len()
print("Average number of words in documents", count.mean())

In [None]:
# Display the last rows of the dataset loaded into df as a quick sanity check.
df.tail()

In [None]:
# Vocabulary size: the number of distinct lowercased tokens over all documents.
# split() without an argument is used (as for the document lengths) so that
# runs of whitespace do not add an empty-string "word" to the vocabulary.
uniqueWords = list(set(" ".join(df['text']).lower().split()))
uniqueWords_count = len(uniqueWords)
uniqueWords_count

In [None]:
# DBpedia dataset: load it, then report its shape and the mean document length
# (length = number of whitespace-separated tokens per document).
df = Dbpedia_load_data()
print("DBpedia data:", "Shape:  " + str(df.shape), sep="\n")
count = df['text'].str.split().str.len()
print("Average number of words in documents", count.mean())

In [None]:
# Display the last rows of the dataset loaded into df as a quick sanity check.
df.tail()

In [None]:
# Vocabulary size: the number of distinct lowercased tokens over all documents.
# split() without an argument is used (as for the document lengths) so that
# runs of whitespace do not add an empty-string "word" to the vocabulary.
uniqueWords = list(set(" ".join(df['text']).lower().split()))
uniqueWords_count = len(uniqueWords)
uniqueWords_count

## Text cleaning

In [None]:
def textcleaning(trainDF, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0):
    """Apply the selected normalisation steps to the ``text`` column.

    The enabled steps always run in this order: lowercase, punctuation
    removal, common-word removal, rare-word removal, stemming, lemmatization,
    stopword removal.

    Args:
        trainDF: DataFrame with a string column ``text``. It is modified in
            place and also returned.
        lower: truthy to lowercase every token (also collapses whitespace).
        punctuation: truthy to delete every character that is neither a word
            character nor whitespace.
        stemming: truthy to apply the Porter stemmer to every token.
        lemmatization: truthy to lemmatize every token as a verb (WordNet).
        commonwords: truthy to drop the 10 most frequent tokens of the corpus.
        rarewords: truthy to drop the 10 least frequent tokens of the corpus.
        stopword: truthy to drop NLTK's English stopwords.

    Returns:
        The same DataFrame object, with ``text`` rewritten.
    """
    def token_counts():
        """Corpus-wide token frequencies, most frequent first."""
        return pandas.Series(' '.join(trainDF['text']).split()).value_counts()

    def without(words):
        """Return the text column with every token found in ``words`` dropped."""
        banned = set(words)  # constant-time membership instead of a list scan per token
        return trainDF['text'].apply(
            lambda doc: " ".join(token for token in doc.split() if token not in banned))

    if lower:
        # to Lowercase
        trainDF['text'] = trainDF['text'].apply(
            lambda doc: " ".join(token.lower() for token in doc.split()))
    if punctuation:
        # remove punctuation; regex=True is required because pandas >= 2.0
        # treats the pattern as a literal string by default, which silently
        # turned this step into a no-op.
        trainDF['text'] = trainDF['text'].str.replace(r'[^\w\s]', '', regex=True)
    if commonwords:
        # remove common words
        trainDF['text'] = without(token_counts().head(10).index)
    if rarewords:
        # rare words removal
        trainDF['text'] = without(token_counts().tail(10).index)
    if stemming:
        # stemming
        stemmer = PorterStemmer()
        trainDF['text'] = trainDF['text'].apply(
            lambda doc: " ".join(stemmer.stem(token) for token in doc.split()))
    if lemmatization:
        # lemmatization
        lemmatizer = WordNetLemmatizer()
        trainDF['text'] = trainDF['text'].apply(
            lambda doc: " ".join(lemmatizer.lemmatize(token, pos="v") for token in doc.split()))
    if stopword:
        # remove stopwords
        # NOTE(review): this runs after stemming/lemmatization, so a stemmed
        # token that no longer matches an NLTK stopword is kept — confirm the
        # ordering is intended.
        trainDF['text'] = without(stopwords.words('english'))
    return trainDF

## Feature extraction methods

In [None]:
def wordtfidf(text):
    """Return the word-level TF-IDF matrix of ``text``.

    The vectorizer is fitted on ``text`` itself, tokens are runs of one or
    more word characters, and the vocabulary is capped at 5000 features.
    """
    vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    # Fit and transform in one pass over the same documents.
    return vectorizer.fit_transform(text)

In [None]:
def ngramtfidf(text):
    """Return the TF-IDF matrix of ``text`` over word bigrams and trigrams.

    The vectorizer is fitted on ``text`` itself, tokens are runs of one or
    more word characters, and the vocabulary is capped at 5000 features.
    """
    vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                 ngram_range=(2,3), max_features=5000)
    # Fit and transform in one pass over the same documents.
    return vectorizer.fit_transform(text)

In [None]:
def tokenizecorpus(x):
    """Tokenize every document in ``x`` with NLTK's WordPunctTokenizer.

    Returns a list with one token list per document, in input order.
    """
    tokenizer = nltk.WordPunctTokenizer()
    return list(map(tokenizer.tokenize, x))

def emb_glove():
    """Load the GloVe vectors in ``glove.6B.300d.txt`` as gensim KeyedVectors.

    The GloVe text file is converted to word2vec text format (written to
    ``word2vec.txt``) and that file is then loaded. Both steps are repeated
    on every call.
    """
    source_path, converted_path = "glove.6B.300d.txt", "word2vec.txt"
    glove2word2vec(source_path, converted_path)
    return KeyedVectors.load_word2vec_format(converted_path, binary=False)

def emb_W2V(corpus):
    """Train a Word2Vec model on ``corpus`` and return it.

    ``corpus`` is passed to gensim as the training sentences. The keyword
    names (``size``, ``iter``) are the gensim 3.x spellings.
    """
    settings = dict(
        size=10,        # word vector dimensionality
        window=10,      # context window size
        min_count=1,    # minimum word count
        sample=1e-3,    # downsample setting for frequent words
        iter=100,
    )
    return word2vec.Word2Vec(corpus, **settings)

def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the tokens in ``words``.

    Args:
        words: iterable of tokens making up one document.
        model: a trained model exposing its vectors as ``model.wv``, or any
            object that maps a token to its vector via ``model[token]``.
        vocabulary: container of tokens that have a vector; tokens outside it
            are ignored.
        num_features: length of the zero vector returned when no token of the
            document is in ``vocabulary``.

    Returns:
        A float64 vector: the average of the known tokens' vectors, or all
        zeros when the document contains no known token.
    """
    # Index the keyed vectors rather than the model itself: indexing a
    # Word2Vec model directly is deprecated in gensim 3.x and gone in 4.x.
    vectors = getattr(model, "wv", model)
    known = [vectors[word] for word in words if word in vocabulary]
    if not known:
        return np.zeros((num_features,), dtype="float64")
    return np.mean(known, axis=0, dtype="float64")


def averaged_word_vectorizer(corpus, model, num_features):
    """Turn every tokenized document in ``corpus`` into its mean word vector.

    Args:
        corpus: iterable of token lists, one per document.
        model: a trained model exposing ``model.wv``, or keyed vectors.
        num_features: dimensionality used for documents without known tokens.

    Returns:
        numpy array with one row per document.
    """
    vectors = getattr(model, "wv", model)
    # gensim 4 renamed index2word to index_to_key; accept either spelling.
    index = getattr(vectors, "index_to_key", None)
    if index is None:
        index = vectors.index2word
    vocabulary = set(index)
    features = [average_word_vectors(tokenized_sentence, vectors, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)

## K-means and evaluation metrics

In [None]:
def k_means(x, y, k, repeat = 1):
    """Cluster ``x`` with k-means ``repeat`` times and score it against ``y``.

    Args:
        x: feature matrix, one row per document.
        y: reference labels, one per document.
        k: number of clusters.
        repeat: number of independent k-means runs (at least 1).

    Returns:
        ``(means, stds, predictions)``: ``means`` and ``stds`` are lists with
        the mean and standard deviation over the runs of, in order, accuracy,
        ASW, ARI, true ASW, AMI, NMI, homogeneity and completeness;
        ``predictions`` holds the cluster labels of the last run.

    Raises:
        ValueError: if ``repeat`` is smaller than 1.
    """
    if repeat < 1:
        # Without a run there are no centres or predictions to report; this
        # used to surface as an UnboundLocalError further down.
        raise ValueError("repeat must be at least 1")

    # (printed label, measure name) in the order of the returned lists.
    report = [
        ("accuracy: \t", 'accuracy'),
        ("ASW: \t\t", 'ASW'),
        ("ARI: \t\t", 'ARI'),
        ("true ASW: \t", 'true ASW'),
        ("AMI: \t\t", 'AMI'),
        ("NMI: \t\t", 'NMI'),
        ("H: \t\t", 'H'),
        ("C: \t\t", 'C'),
    ]
    scores = {name: [] for _, name in report}

    # The silhouette of the reference labels does not depend on the
    # clustering, so compute it once instead of once per run.
    true_asw = metrics.silhouette_score(x, y)

    for _ in range(repeat):
        kmeans = KMeans(n_clusters=k).fit(x)
        predictions = kmeans.labels_
        # NOTE(review): accuracy compares raw cluster ids with y without first
        # matching cluster ids to class labels — confirm this is intended.
        scores['accuracy'].append(metrics.accuracy_score(predictions, y))
        scores['ASW'].append(metrics.silhouette_score(x, predictions))
        scores['ARI'].append(metrics.adjusted_rand_score(y, predictions))
        scores['true ASW'].append(true_asw)
        scores['AMI'].append(metrics.adjusted_mutual_info_score(y, predictions, average_method='arithmetic'))
        scores['NMI'].append(metrics.normalized_mutual_info_score(y, predictions, average_method='arithmetic'))
        scores['H'].append(metrics.homogeneity_score(y, predictions))
        scores['C'].append(metrics.completeness_score(y, predictions))

    # Cluster centres of the last run.
    print(kmeans.cluster_centers_)
    means, stds = [], []
    for label, name in report:
        mean, std = np.mean(scores[name]), np.std(scores[name])
        print(label, mean, "std: ", std)
        means.append(mean)
        stds.append(std)
    return means, stds, predictions

# Normalization levels
## Available settings:
1. lower
2. punctuation
3. stemming
4. lemmatization
5. commonwords
6. rarewords
7. stopword

### Levels:
- first level: no settings
- second level: lower & punctuation
- third level: second level & common words & rare words
- fourth level: third level & stopwords
- fifth level: fourth level & stemming
- sixth level: fourth level & lemmatization

In [None]:
# Amazon, no preprocessing: cluster word-level TF-IDF vectors into two groups
# and list the ten highest-weighted terms of each cluster centre.
df = Amazon_load_data()
df = textcleaning(df, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0)
x = df['text']
y = df['label']

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
text_tfidf = tfidf_vect.fit_transform(x)
print ("WordLevel TF-IDF: ")
x_tfidf = text_tfidf

km = KMeans(n_clusters=2)
kmeans = km.fit(x_tfidf)
print("Done")

print("Top terms per cluster:")
# Feature indices of every centre, sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() — confirm the pinned version.
terms = tfidf_vect.get_feature_names()
for i in range(2):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    # End the line once per cluster; the call used to sit inside the inner
    # loop, which put every term on its own line.
    print()

In [None]:
# Amazon with lowercasing, punctuation/common/rare/stopword removal and
# lemmatization: cluster word-level TF-IDF vectors into two groups and list
# the ten highest-weighted terms of each cluster centre.
df = Amazon_load_data()
df = textcleaning(df, lower=1, punctuation=1, stemming=0, lemmatization=1, commonwords=1, rarewords=1, stopword=1)
x = df['text']
y = df['label']

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
text_tfidf = tfidf_vect.fit_transform(x)
print ("WordLevel TF-IDF: ")
x_tfidf = text_tfidf

km = KMeans(n_clusters=2)
kmeans = km.fit(x_tfidf)
print("Done")

print("Top terms per cluster:")
# Feature indices of every centre, sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() — confirm the pinned version.
terms = tfidf_vect.get_feature_names()
for i in range(2):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    # End the line once per cluster; the call used to sit inside the inner
    # loop, which put every term on its own line.
    print()

In [None]:
# Amazon, no preprocessing: cluster bigram/trigram TF-IDF vectors into two
# groups and list the ten highest-weighted n-grams of each cluster centre.
df = Amazon_load_data()
df = textcleaning(df, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0)
x = df['text']
y = df['label']

tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
x_tfidf_ngram = tfidf_vect_ngram.fit_transform(x)
print ("ngram TF-IDF: ")

km = KMeans(n_clusters=2)
kmeans = km.fit(x_tfidf_ngram)
print("Done")

print("Top terms per cluster:")
# Feature indices of every centre, sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() — confirm the pinned version.
terms = tfidf_vect_ngram.get_feature_names()
for i in range(2):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    # End the line once per cluster; the call used to sit inside the inner
    # loop, which put every term on its own line.
    print()

In [None]:
# Amazon with lowercasing, punctuation/common/rare/stopword removal and
# stemming: cluster bigram/trigram TF-IDF vectors into two groups and list
# the ten highest-weighted n-grams of each cluster centre.
df = Amazon_load_data()
df = textcleaning(df, lower=1, punctuation=1, stemming=1, lemmatization=0, commonwords=1, rarewords=1, stopword=1)
x = df['text']
y = df['label']

tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
x_tfidf_ngram = tfidf_vect_ngram.fit_transform(x)
print ("ngram TF-IDF: ")

km = KMeans(n_clusters=2)
kmeans = km.fit(x_tfidf_ngram)
print("Done")

print("Top terms per cluster:")
# Feature indices of every centre, sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() — confirm the pinned version.
terms = tfidf_vect_ngram.get_feature_names()
for i in range(2):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    # End the line once per cluster; the call used to sit inside the inner
    # loop, which put every term on its own line.
    print()

In [None]:
# Yelp, no preprocessing: cluster averaged Word2Vec document vectors into two
# groups and list the vocabulary words closest to each cluster centre.
df = Yelp_load_data()
df = textcleaning(df, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0)
x = df['text']
y = df['label']

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in x]

w2v_model = word2vec.Word2Vec(tokenized_corpus, size=10, 
                                  window=10, min_count = 1,
                                  sample=1e-3, iter=100)

vocabulary = set(w2v_model.wv.index2word)

# average_word_vectors is the helper from the feature-extraction section; the
# identical copy that used to be redefined here has been dropped.
features = [average_word_vectors(tokenized_sentence, w2v_model, vocabulary, 10)
                    for tokenized_sentence in tokenized_corpus]

# word embeddings
print("W2V:")
w2v_feature_array = np.array(features)
km = KMeans(n_clusters=2)
kmeans = km.fit(w2v_feature_array)
print("Done")

# One row per vocabulary word, aligned with index2word. The nearest-word
# lookup below must run on these word vectors: it used to run on the document
# vectors, so document row numbers were looked up in the word list.
word_vectors = np.asarray(w2v_model.wv[w2v_model.wv.index2word], dtype="float64")
# Assign every word to the nearest document-cluster centre, reusing the fitted
# model instead of refitting it.
idx = km.predict(word_vectors)
centroid_map = dict(zip(w2v_model.wv.index2word, idx))

def get_top_words(index2word, k, centers, wordvecs):
    """Return a DataFrame whose column i lists the k words nearest to centers[i].

    ``wordvecs[j]`` must be the vector of ``index2word[j]``. The DataFrame
    index starts at 1.
    """
    tree = KDTree(wordvecs)
    # Query the k closest points of every cluster centre; each query returns
    # (distances, indices) and only the indices are needed.
    closest_points = [tree.query(np.reshape(center, (1, -1)), k=k) for center in centers]
    closest_words_idxs = [found[1] for found in closest_points]
    # Translate the indices of every centre back into words.
    closest_words = {}
    for i in range(0, len(closest_words_idxs)):
        closest_words['Cluster #' + str(i)] = [index2word[j] for j in closest_words_idxs[i][0]]
    # A DataFrame is generated from the dictionary.
    table = pandas.DataFrame(closest_words)
    table.index = table.index+1
    return table

centers = km.cluster_centers_
top_words = get_top_words(w2v_model.wv.index2word, 10, centers, word_vectors)
top_words

In [None]:
# Yelp with lowercasing and punctuation/common/rare/stopword removal: cluster
# averaged GloVe document vectors into two groups and list the corpus words
# closest to each cluster centre.
df = Yelp_load_data()
df = textcleaning(df, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=1, rarewords=1, stopword=1)
x = df['text']
y = df['label']

print("Glove:")
model=emb_glove()
glove_tokens = tokenizecorpus(x)
glove_feature_array = averaged_word_vectorizer(corpus=glove_tokens, model = model, num_features=300)
km = KMeans(n_clusters=2)
kmeans = km.fit(glove_feature_array)
print("Done")

# Words of this corpus that have a GloVe vector, and their vectors in the same
# order. The nearest-word lookup below must run on word vectors: it used to
# run on the document vectors, so document row numbers were looked up in the
# word list.
glove_words = sorted({word for document in glove_tokens for word in document if word in model})
word_vectors = np.asarray([model[word] for word in glove_words], dtype="float64")
# Assign every word to the nearest document-cluster centre, reusing the fitted
# model instead of refitting it.
idx = km.predict(word_vectors)
centroid_map = dict(zip(glove_words, idx))

def get_top_words(index2word, k, centers, wordvecs):
    """Return a DataFrame whose column i lists the k words nearest to centers[i].

    ``wordvecs[j]`` must be the vector of ``index2word[j]``. The DataFrame
    index starts at 1.
    """
    tree = KDTree(wordvecs)
    # Query the k closest points of every cluster centre; each query returns
    # (distances, indices) and only the indices are needed.
    closest_points = [tree.query(np.reshape(center, (1, -1)), k=k) for center in centers]
    closest_words_idxs = [found[1] for found in closest_points]
    # Translate the indices of every centre back into words.
    closest_words = {}
    for i in range(0, len(closest_words_idxs)):
        closest_words['Cluster #' + str(i)] = [index2word[j] for j in closest_words_idxs[i][0]]
    # A DataFrame is generated from the dictionary.
    table = pandas.DataFrame(closest_words)
    table.index = table.index+1
    return table

centers = km.cluster_centers_
top_words = get_top_words(glove_words, 10, centers, word_vectors)
top_words

# Results Amazon data

In [None]:
def use_amazondata(r=1, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0):
    """Cluster the Amazon data with four feature sets and keep the labels.

    The data is loaded, cleaned with the given ``textcleaning`` switches and
    clustered into two groups on word TF-IDF, n-gram TF-IDF, averaged
    Word2Vec and averaged GloVe features. The scores of every run are printed
    by ``k_means``.

    Args:
        r: number of k-means repetitions per feature set.
        lower, punctuation, stemming, lemmatization, commonwords, rarewords,
            stopword: forwarded unchanged to ``textcleaning``.

    Returns:
        The cleaned DataFrame with the extra columns ``preds_1`` .. ``preds_4``
        holding the cluster labels of the last run for TF-IDF, n-gram TF-IDF,
        Word2Vec and GloVe respectively.
    """
    df = Amazon_load_data()
    df = textcleaning(df, lower=lower, punctuation=punctuation, stemming=stemming, lemmatization=lemmatization, commonwords=commonwords, rarewords=rarewords, stopword=stopword)
    x = df['text']
    y = df['label']

    # Only the predictions are kept here; the means and standard deviations
    # were previously copied into DataFrames that were never used.
    print ("WordLevel TF-IDF: ")
    _, _, df['preds_1'] = k_means(wordtfidf(x), y, 2, repeat = r)

    print ("N-Gram Vectors: ")
    _, _, df['preds_2'] = k_means(ngramtfidf(x), y, 2, repeat = r)

    # Tokenize once and share the result between both embedding feature sets.
    tokens = tokenizecorpus(x)

    # word embeddings
    print("W2V:")
    w2v_feature_array = averaged_word_vectorizer(corpus=tokens, model=emb_W2V(tokens), num_features=10)
    _, _, df['preds_3'] = k_means(w2v_feature_array, y, 2, repeat=r)

    print("Glove:")
    glove_feature_array = averaged_word_vectorizer(corpus=tokens, model=emb_glove(), num_features=300)
    _, _, df['preds_4'] = k_means(glove_feature_array, y, 2, repeat=r)

    return df

In [None]:
# Level 1 on the Amazon data: no preprocessing, one k-means run per feature
# set; preview the returned frame with its prediction columns.
df_1 = use_amazondata(r=1, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0)
df_1.head()

In [None]:
# Save the level 1 documents and their cluster labels.
df_1.to_csv('AMZ_1_labels.csv', encoding='utf-8')

In [None]:
# Level 2 on the Amazon data: lowercasing and punctuation removal; the labels
# are written to CSV.
df_2 = use_amazondata(r=1, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0)
df_2.to_csv('AMZ_2_labels.csv', encoding='utf-8')

In [None]:
# Level 3 on the Amazon data: level 2 plus common- and rare-word removal; the
# labels are written to CSV.
df_3 = use_amazondata(r=1, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=1, rarewords=1, stopword=0)
df_3.to_csv('AMZ_3_labels.csv', encoding='utf-8')

In [None]:
# Level 4 on the Amazon data: level 3 plus stopword removal; the labels are
# written to CSV.
df_4 = use_amazondata(r=1, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=1, rarewords=1, stopword=1)
df_4.to_csv('AMZ_4_labels.csv', encoding='utf-8')

In [None]:
# Level 5 on the Amazon data: the level 4 settings plus stemming; the labels
# are written to CSV.
df_5 = use_amazondata(r=1, lower=1, punctuation=1, stemming=1, lemmatization=0, commonwords=1, rarewords=1, stopword=1)
df_5.to_csv('AMZ_5_labels.csv', encoding='utf-8')

In [None]:
# Level 6 on the Amazon data: the level 4 settings plus lemmatization; the
# labels are written to CSV.
df_6 = use_amazondata(r=1, lower=1, punctuation=1, stemming=0, lemmatization=1, commonwords=1, rarewords=1, stopword=1)
df_6.to_csv('AMZ_6_labels.csv', encoding='utf-8')

# Results Yelp Data

In [None]:
# Load the Yelp data and show its (rows, columns) shape.
data = Yelp_load_data()
data.shape

In [None]:
def use_yelpdata(r=1, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0):
    """Cluster the Yelp data with four feature sets and tabulate the scores.

    The data is loaded, cleaned with the given ``textcleaning`` switches and
    clustered into two groups on word TF-IDF, n-gram TF-IDF, averaged
    Word2Vec and averaged GloVe features.

    Args:
        r: number of k-means repetitions per feature set.
        lower, punctuation, stemming, lemmatization, commonwords, rarewords,
            stopword: forwarded unchanged to ``textcleaning``.

    Returns:
        DataFrame indexed by measure name with a ``mean_<features>`` and a
        ``std_<features>`` column for each of ``tfidf``, ``tfidf_ngram``,
        ``w2v`` and ``glove``.
    """
    measures = ['accuracy', 'ASW', 'ARI', 'true ASW', 'AMI', 'NMI', 'H', 'C']

    df = Yelp_load_data()
    df = textcleaning(df, lower=lower, punctuation=punctuation, stemming=stemming, lemmatization=lemmatization, commonwords=commonwords, rarewords=rarewords, stopword=stopword)
    x = df['text']
    y = df['label']

    def evaluate(name, features):
        """Run k-means on ``features`` and return its mean/std score columns."""
        # k_means returns (means, stds, predictions). Unpacking only two
        # values, as before, raised a ValueError on every call.
        means, stds, _ = k_means(features, y, 2, repeat=r)
        return pandas.DataFrame({'mean_' + name: means, 'std_' + name: stds}, index=measures)

    results = []

    print ("WordLevel TF-IDF: ")
    results.append(evaluate('tfidf', wordtfidf(x)))

    print ("N-Gram Vectors: ")
    results.append(evaluate('tfidf_ngram', ngramtfidf(x)))

    # Tokenize once and share the result between both embedding feature sets.
    tokens = tokenizecorpus(x)

    # word embeddings
    print("W2V:")
    results.append(evaluate('w2v', averaged_word_vectorizer(corpus=tokens, model=emb_W2V(tokens), num_features=10)))

    print("Glove:")
    results.append(evaluate('glove', averaged_word_vectorizer(corpus=tokens, model=emb_glove(), num_features=300)))

    return pandas.concat(results, axis=1)

In [None]:
# Level 1 on the Yelp data: no preprocessing, 10 k-means runs per feature set;
# the score table is written to CSV and displayed.
level1yelp = use_yelpdata(r=10, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0)
level1yelp.to_csv('YELP_level1.csv', encoding='utf-8')
level1yelp

In [None]:
# Level 2 on the Yelp data: lowercasing and punctuation removal, 10 k-means
# runs per feature set; the score table is written to CSV and displayed.
level2yelp = use_yelpdata(r=10, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0)
level2yelp.to_csv('YELP_level2.csv', encoding='utf-8')
level2yelp

In [None]:
# Level 3 on the Yelp data: level 2 plus common- and rare-word removal, 10
# k-means runs per feature set; the score table is written to CSV and displayed.
level3yelp = use_yelpdata(r=10, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=1, rarewords=1, stopword=0)
level3yelp.to_csv('YELP_level3.csv', encoding='utf-8')
level3yelp

In [None]:
# Level 4 on the Yelp data: level 3 plus stopword removal, 10 k-means runs per
# feature set; the score table is written to CSV and displayed.
level4yelp = use_yelpdata(r=10, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=1, rarewords=1, stopword=1)
level4yelp.to_csv('YELP_level4.csv', encoding='utf-8')
level4yelp

In [None]:
# Level 5 on the Yelp data: the level 4 settings plus stemming, 10 k-means runs
# per feature set; the score table is written to CSV and displayed.
level5yelp = use_yelpdata(r=10, lower=1, punctuation=1, stemming=1, lemmatization=0, commonwords=1, rarewords=1, stopword=1)
level5yelp.to_csv('YELP_level5.csv', encoding='utf-8')
level5yelp

In [None]:
# Level 6 on the Yelp data: the level 4 settings plus lemmatization, 10 k-means
# runs per feature set; the score table is written to CSV and displayed.
level6yelp = use_yelpdata(r=10, lower=1, punctuation=1, stemming=0, lemmatization=1, commonwords=1, rarewords=1, stopword=1)
level6yelp.to_csv('YELP_level6.csv', encoding='utf-8')
level6yelp

# Results DBpedia data

In [None]:
# Load the DBpedia data and preview its first rows.
data = Dbpedia_load_data()
data.head()

In [None]:
def use_dbpediadata(r=1, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0):
    """Cluster the DBpedia data with four feature sets and tabulate the scores.

    The data is loaded, cleaned with the given ``textcleaning`` switches and
    clustered into two groups on word TF-IDF, n-gram TF-IDF, averaged
    Word2Vec and averaged GloVe features.

    Args:
        r: number of k-means repetitions per feature set.
        lower, punctuation, stemming, lemmatization, commonwords, rarewords,
            stopword: forwarded unchanged to ``textcleaning``.

    Returns:
        DataFrame indexed by measure name with a ``mean_<features>`` and a
        ``std_<features>`` column for each of ``tfidf``, ``tfidf_ngram``,
        ``w2v`` and ``glove``.
    """
    measures = ['accuracy', 'ASW', 'ARI', 'true ASW', 'AMI', 'NMI', 'H', 'C']

    df = Dbpedia_load_data()
    df = textcleaning(df, lower=lower, punctuation=punctuation, stemming=stemming, lemmatization=lemmatization, commonwords=commonwords, rarewords=rarewords, stopword=stopword)
    x = df['text']
    y = df['label']

    def evaluate(name, features):
        """Run k-means on ``features`` and return its mean/std score columns."""
        # k_means returns (means, stds, predictions). Unpacking only two
        # values, as before, raised a ValueError on every call.
        means, stds, _ = k_means(features, y, 2, repeat=r)
        return pandas.DataFrame({'mean_' + name: means, 'std_' + name: stds}, index=measures)

    results = []

    print ("WordLevel TF-IDF: ")
    results.append(evaluate('tfidf', wordtfidf(x)))

    print ("N-Gram Vectors: ")
    results.append(evaluate('tfidf_ngram', ngramtfidf(x)))

    # Tokenize once and share the result between both embedding feature sets.
    tokens = tokenizecorpus(x)

    # word embeddings
    print("W2V:")
    results.append(evaluate('w2v', averaged_word_vectorizer(corpus=tokens, model=emb_W2V(tokens), num_features=10)))

    print("Glove:")
    results.append(evaluate('glove', averaged_word_vectorizer(corpus=tokens, model=emb_glove(), num_features=300)))

    return pandas.concat(results, axis=1)

In [None]:
# Level 1 on the DBpedia data: no preprocessing, 10 k-means runs per feature
# set; the score table is written to CSV and displayed.
level1dbpedia = use_dbpediadata(r=10, lower=0, punctuation=0, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0)
level1dbpedia.to_csv('DBPEDIA_level1.csv', encoding='utf-8')
level1dbpedia

In [None]:
# Level 2 on the DBpedia data: lowercasing and punctuation removal, 10 k-means
# runs per feature set; the score table is written to CSV and displayed.
level2dbpedia = use_dbpediadata(r=10, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=0, rarewords=0, stopword=0)
level2dbpedia.to_csv('DBPEDIA_level2.csv', encoding='utf-8')
level2dbpedia

In [None]:
# Level 3 on the DBpedia data: level 2 plus common- and rare-word removal, 10
# k-means runs per feature set; the score table is written to CSV and displayed.
level3dbpedia = use_dbpediadata(r=10, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=1, rarewords=1, stopword=0)
level3dbpedia.to_csv('DBPEDIA_level3.csv', encoding='utf-8')
level3dbpedia

In [None]:
# Level 4 on the DBpedia data: level 3 plus stopword removal, 10 k-means runs
# per feature set; the score table is written to CSV and displayed.
level4dbpedia = use_dbpediadata(r=10, lower=1, punctuation=1, stemming=0, lemmatization=0, commonwords=1, rarewords=1, stopword=1)
level4dbpedia.to_csv('DBPEDIA_level4.csv', encoding='utf-8')
level4dbpedia

In [None]:
# Level 5 on the DBpedia data: the level 4 settings plus stemming, 10 k-means
# runs per feature set; the score table is written to CSV and displayed.
level5dbpedia = use_dbpediadata(r=10, lower=1, punctuation=1, stemming=1, lemmatization=0, commonwords=1, rarewords=1, stopword=1)
level5dbpedia.to_csv('DBPEDIA_level5.csv', encoding='utf-8')
level5dbpedia

In [None]:
# Level 6 on the DBpedia data: the level 4 settings plus lemmatization, 10
# k-means runs per feature set; the score table is written to CSV and displayed.
level6dbpedia = use_dbpediadata(r=10, lower=1, punctuation=1, stemming=0, lemmatization=1, commonwords=1, rarewords=1, stopword=1)
level6dbpedia.to_csv('DBPEDIA_level6.csv', encoding='utf-8')
level6dbpedia