In [1]:
import pandas as pd
import numpy as np
import sys
import matplotlib
import matplotlib.pyplot as plt
from contextlib import contextmanager
from functools import wraps
import os

from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances

In [2]:
# Record the interpreter this notebook ran on, one value per line.
print("Python version", sys.version, sep="\n")
print("Version info.", sys.version_info, sep="\n")

Python version
3.5.2 (default, Oct  8 2019, 13:06:37) 
[GCC 5.4.0 20160609]
Version info.
sys.version_info(major=3, minor=5, micro=2, releaselevel='final', serial=0)


In [3]:
import pickle
def save(mapping_dict, name):
    """Pickle an object into the ``data/`` directory.

    :param mapping_dict: object to serialise (anything ``pickle`` can handle)
    :param name: file stem; the object is written to ``data/<name>.pickle``
    """
    target_path = 'data/{}.pickle'.format(name)
    with open(target_path, 'wb') as out_file:
        pickle.dump(mapping_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)
        
def load(name):
    """Read back an object pickled under ``data/<name>.pickle``.

    :param name: file stem of the pickle inside the ``data/`` directory
    :return: the unpickled object
    """
    source_path = 'data/{}.pickle'.format(name)
    # pickle executes code embedded in the file: only load files this notebook wrote itself.
    with open(source_path, 'rb') as in_file:
        restored = pickle.load(in_file)
    return restored
    

def save_result(file_name=None, calculate=False, skip=False):
    """Decorator factory that caches a function's result under ``data/``.

    DataFrames are stored as ``data/<file_name>.parquet``; every other result is
    pickled through ``save`` to ``data/<file_name>.pickle``.

    :param file_name: cache file stem; when falsy nothing is read or written
    :param calculate: when true, always run the function and overwrite the cache
    :param skip: when true (and ``calculate`` is false), do nothing and return ``None``
    :return: a decorator; the wrapped function returns the computed or cached
        result, or ``None`` when the call is skipped
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            if not calculate:
                if skip or not file_name:
                    display('code skipped')
                    return None
                parquet_path = 'data/{}.parquet'.format(file_name)
                if os.path.isfile(parquet_path):
                    return pd.read_parquet(parquet_path)
                if os.path.isfile('data/{}.pickle'.format(file_name)):
                    return load(file_name)
                # Cache miss: neither file exists yet (e.g. a fresh checkout), so fall
                # through and compute instead of failing with FileNotFoundError.

            result = func(*args, **kwargs)
            if file_name:
                # The cache directory may not exist on the first run.
                os.makedirs('data', exist_ok=True)
                if isinstance(result, pd.DataFrame):
                    result.to_parquet('data/{}.parquet'.format(file_name))
                else:
                    save(result, file_name)
            return result
        return wrapper

    return decorator

# Prepare DataSet

In [4]:
from sklearn.datasets import fetch_20newsgroups

In [16]:
def generate_new_clusters(obj, mapping):
    """Yield, for each value of ``mapping``, the coarser cluster name it falls under.

    Exactly one item is produced per mapping value, in the mapping's iteration
    order, so the output can be zipped against the mapping's keys without
    drifting out of alignment.

    :param obj: iterable of candidate cluster names (dot-separated prefixes)
    :param mapping: dict whose values are full dot-separated cluster names
    :return: generator of the matching candidate per value, or ``None`` when no
        candidate is a dotted prefix of that value
    """
    # Longest candidates first so the most specific prefix wins.
    candidates = sorted(obj, key=len, reverse=True)
    for full_name in mapping.values():
        # Match whole dot-separated components only: 'comp.sys' must not claim
        # 'comp.system.x', which a plain startswith() test would allow.
        yield next(
            (prefix for prefix in candidates
             if full_name == prefix or full_name.startswith(prefix + '.')),
            None
        )

@save_result('mydata')
def load_dataset():
    """Fetch the full 20 Newsgroups corpus with headers, footers and quotes removed.

    Wrapped by ``save_result('mydata')``, so whether this body runs or a stored
    copy is returned is decided by that decorator.

    :return: the object returned by ``fetch_20newsgroups``
    """
    newsgroups = fetch_20newsgroups(
        subset='all',
        remove=('headers', 'footers', 'quotes'),
        shuffle=True
    )
    display('Training data size:', len(newsgroups['data']))
    return newsgroups

@save_result('mappings')
def generate_mappings(mydata):
    """Build three target-id -> cluster-name dicts at decreasing name depth.

    Each newsgroup name is truncated to its first 3, 2 and 1 dot-separated
    components respectively.

    :param mydata: dataset object exposing ``target`` (ids) and ``target_names``
    :return: tuple ``(mapping1, mapping2, mapping3)`` for depths 3, 2 and 1
    """
    targets = np.unique(mydata.target)

    def truncated(depth):
        # Look the name up by the target id itself. Pairing ids with
        # np.unique(truncated names) by position only works while truncation
        # never merges two names and every id is present in the data.
        return {
            target: '.'.join(mydata.target_names[target].split('.')[:depth])
            for target in targets
        }

    return truncated(3), truncated(2), truncated(1)

In [17]:
dset = load_dataset()

In [7]:
# Build (or load) the three id -> name mappings and print each one.
res = generate_mappings(dset)
# save_result's wrapper returns None on its "code skipped" path, hence the guard.
if res:
    for mapping in res:
        print(mapping, '\n')

{0: 'alt.atheism', 1: 'comp.graphics', 2: 'comp.os.ms-windows', 3: 'comp.sys.ibm', 4: 'comp.sys.mac', 5: 'comp.windows.x', 6: 'misc.forsale', 7: 'rec.autos', 8: 'rec.motorcycles', 9: 'rec.sport.baseball', 10: 'rec.sport.hockey', 11: 'sci.crypt', 12: 'sci.electronics', 13: 'sci.med', 14: 'sci.space', 15: 'soc.religion.christian', 16: 'talk.politics.guns', 17: 'talk.politics.mideast', 18: 'talk.politics.misc', 19: 'talk.religion.misc'} 

{0: 'alt.atheism', 1: 'comp.graphics', 2: 'comp.os', 3: 'comp.sys', 4: 'comp.sys', 5: 'comp.windows', 6: 'misc.forsale', 7: 'rec.autos', 8: 'rec.motorcycles', 9: 'rec.sport', 10: 'rec.sport', 11: 'sci.crypt', 12: 'sci.electronics', 13: 'sci.med', 14: 'sci.space', 15: 'soc.religion', 16: 'talk.politics', 17: 'talk.politics', 18: 'talk.politics', 19: 'talk.religion'} 

{0: 'alt', 1: 'comp', 2: 'comp', 3: 'comp', 4: 'comp', 5: 'comp', 6: 'misc', 7: 'rec', 8: 'rec', 9: 'rec', 10: 'rec', 11: 'sci', 12: 'sci', 13: 'sci', 14: 'sci', 15: 'soc', 16: 'talk', 17: '

In [22]:
# Label columns that prepare_raw_df adds to the DataFrame, one per mapping it is given.
TARGET_COLUMNS = ['clusters', 'clusters_2', 'clusters_3']
# DataFrame column holding the document text.
TEXT_COLUMN = 'data'

In [25]:
@save_result('df')
def prepare_raw_df(mydata, mapping1, mapping2, mapping3):
    """Assemble the working DataFrame: text, numeric target and three label columns.

    :param mydata: dataset object exposing ``data`` (texts) and ``target`` (ids)
    :param mapping1: target id -> label for ``TARGET_COLUMNS[0]``
    :param mapping2: target id -> label for ``TARGET_COLUMNS[1]``
    :param mapping3: target id -> label for ``TARGET_COLUMNS[2]``
    :return: DataFrame with ``data``, ``target`` and the ``TARGET_COLUMNS`` columns
    :raises KeyError: if a target id is missing from one of the mappings
    """
    df = pd.DataFrame({'data': mydata.data, 'target': mydata.target})
    for mapping, col in zip((mapping1, mapping2, mapping3), TARGET_COLUMNS):
        # Index the mapping paired with this column; using mapping1 for every
        # column made all three label columns identical.
        df[col] = df['target'].apply(lambda target_id: mapping[target_id])
    return df

In [26]:
prepare_raw_df(dset, *generate_mappings(dset)).head()

Unnamed: 0,data,target,clusters,clusters_2,clusters_3
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,rec.sport,rec
1,My brother is in the market for a high-perform...,3,comp.sys.ibm,comp.sys,comp
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast,talk.politics,talk
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm,comp.sys,comp
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac,comp.sys,comp


# Preprocessing DataSet

In [None]:
df = prepare_raw_df(dset, *generate_mappings(dset))

In [None]:
def drop_duplicate_values(x):
    """Return ``x`` without repeated rows, re-indexed from zero.

    :param x: DataFrame (or Series) to de-duplicate; it is not modified
    :return: new object holding the first occurrence of each row, with a fresh index
    """
    unique_rows = x.drop_duplicates()
    return unique_rows.reset_index(drop=True)

# Clusters Frequency

In [None]:
def get_frequency(df, column_name):
    """Count how often each distinct value occurs in ``df[column_name]``.

    :param df: DataFrame containing ``column_name``
    :param column_name: column whose values are counted
    :return: ``(positions, counts)``; positions are the distinct values themselves
        for ``TARGET_COLUMNS[2]`` and ``range(len(values))`` for any other column
    """
    labels, counts = np.unique(df[column_name], return_counts=True)
    keeps_labels = column_name == TARGET_COLUMNS[2]  # 'clusters_3'
    positions = labels if keeps_labels else range(len(labels))
    return positions, counts

def plot_clusters(func, df, title, type='bar'):
    """Draw one panel per entry of ``TARGET_COLUMNS`` in a 1x3 figure.

    :param func: callable invoked as ``func(df, col)``; must return positional
        arguments for ``Axes.bar`` when ``type == 'bar'``, otherwise keyword
        arguments for ``Axes.scatter``
    :param df: DataFrame handed through to ``func``
    :param title: figure-level title
    :param type: ``'bar'`` for bar charts; any other value produces scatter plots
    """
    fig, axs = plt.subplots(1, 3, figsize=(19, 3), sharey=False)
    for position, col in enumerate(TARGET_COLUMNS):
        panel = axs[position]
        payload = func(df, col)
        if type == 'bar':
            panel.bar(*payload)
        else:
            panel.scatter(**payload)
    fig.suptitle(title)

In [None]:
%%script False
plot_clusters(get_frequency, df, 'Clusters Frequency')

In [None]:
%%script False
# NOTE(review): `%%script False` hands this cell to an external command instead of the
# Python kernel, so it is presumably disabled on purpose — confirm before relying on it.
from sklearn.preprocessing import LabelEncoder
# Replace each label column with integer codes and store the code -> class-name
# lookup with save() under the column's name (clusters_wordcloud loads it back by that name).
for col in TARGET_COLUMNS:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])    
    save(dict(enumerate(le.classes_)), col)
# The numeric target column is dropped once the label columns are encoded.
df = df.drop(columns=['target'])

In [None]:
# Text-cleaning helpers used by lemmatizer().
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import tokenize

import nltk
from nltk.stem import WordNetLemmatizer
# Download the WordNet resource before building the lemmatizer used by word_lemmatizer().
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()

from nltk.sentiment import SentimentIntensityAnalyzer
# Download the VADER lexicon before building the analyzer used by get_sentimnent().
nltk.download('vader_lexicon')
vader_analyzer = SentimentIntensityAnalyzer()

def word_lemmatizer(word):
    """Lemmatize one word as a noun, then a verb, then an adjective.

    Underscores are removed first; each pass receives the previous pass's output.

    :param word: token to normalise
    :return: the lemmatized token
    """
    lemma = word.replace('_', '')
    # Order matters: noun -> verb -> adjective, each pass feeding the next.
    for part_of_speech in ("n", "v", "a"):
        lemma = wordnet_lemmatizer.lemmatize(lemma, pos=part_of_speech)
    return lemma
    

def text_lemmatizer(text):
    """Lemmatize every token and join the results with single spaces.

    :param text: iterable of word tokens
    :return: one string of lemmatized tokens separated by spaces
    """
    return ' '.join(word_lemmatizer(token) for token in text)


def lemmatizer(x):
    """Overwrite ``x[TEXT_COLUMN]`` with stop-word-free, lemmatized text.

    :param x: DataFrame with a ``TEXT_COLUMN`` column; modified in place
    :return: the same DataFrame ``x``
    """
    def _normalise(text):
        # stop-word removal -> tokenisation -> per-token lemmatisation
        tokens = tokenize(remove_stopwords(text))
        return text_lemmatizer(tokens)

    x[TEXT_COLUMN] = x[TEXT_COLUMN].apply(_normalise)
    return x

def get_sentimnent(x):
    """Add a ``sentimnent`` column holding the VADER polarity scores of each text.

    :param x: DataFrame with a ``TEXT_COLUMN`` column; modified in place
    :return: the same DataFrame ``x``
    """
    scores = x[TEXT_COLUMN].apply(vader_analyzer.polarity_scores)
    x['sentimnent'] = scores
    return x

In [None]:
def text_feature_selector(x):
    """Add simple size features of the text column to ``x``.

    Adds ``word_count`` (whitespace-separated tokens), ``length`` (characters)
    and ``word_density`` (characters per word).

    :param x: DataFrame with a ``TEXT_COLUMN`` column; modified in place
    :return: the same DataFrame ``x``
    """
    texts = x[TEXT_COLUMN]
    x['word_count'] = texts.apply(lambda text: len(str(text).split()))
    x['length'] = texts.apply(len)
    # A text without words has word_count == 0. Dividing by it gives inf for
    # whitespace-only text, which median imputation does not repair; use NaN
    # as the denominator so the density is reported as missing instead.
    x['word_density'] = x['length'] / x['word_count'].replace(0, np.nan)
    return x

In [None]:
%%script False
# NOTE(review): `%%script False` hands this cell to an external command instead of the
# Python kernel, so the pipeline is presumably disabled on purpose — confirm before relying on it.
# The shape is displayed after every stage to show what that stage changed.
display(df.shape)
# 1. drop repeated rows and rebuild the index
df = FunctionTransformer(drop_duplicate_values, validate=False).transform(df)
display(df.shape)
# 2. remove stop words and lemmatize the text column
df = FunctionTransformer(lemmatizer, validate=False).transform(df)
display(df.shape)
# 3. add word_count / length / word_density
df = FunctionTransformer(text_feature_selector, validate=False).transform(df)
display(df.shape)
# 4. add the per-row sentiment score dict
df = FunctionTransformer(get_sentimnent, validate=False).transform(df)
display(df.shape)
# 5. expand the score dicts into their own columns and drop the dict column
df = pd.concat([df, pd.io.json.json_normalize(df['sentimnent'])], axis=1).drop(columns=['sentimnent'])
display(df.shape)

# Persist the result; the next cell reads this file back.
df.to_parquet('data/preprocessed_df.parquet')

In [None]:
df = pd.read_parquet('data/preprocessed_df.parquet')

In [None]:
df.shape

In [None]:
from functools import partial
def get_word_density(colname, feature):
    """Average ``feature`` within each group of ``colname``, using the global ``df``.

    :param colname: column of the global ``df`` to group by
    :param feature: numeric column whose per-group mean is computed
    :return: ``(group_labels, group_means)`` as an index and a NumPy array
    """
    per_group_mean = df.groupby([colname])[feature].mean()
    return per_group_mean.index, per_group_mean.values

In [None]:
# plot_clusters expects (func, df, title) and calls func(df, col). get_word_density only
# takes the column name (it reads the global df), so adapt it with a lambda and pass df.
plot_clusters(lambda _df, col: get_word_density(col, feature='word_density'), df, 'Clusters word_density')

In [None]:
# plot_clusters(partial(get_word_density, feature='length'), 'Clusters length')

In [None]:
# plot_clusters(partial(get_word_density, feature='word_count'), 'Clusters word_count')

In [None]:
from wordcloud import WordCloud

def clusters_wordcloud(name):
    """Draw one word cloud per cluster of the label column ``name``.

    Reads the global ``df`` and the code -> class-name lookup stored with
    ``save`` under ``name``.

    :param name: label column to group by; also the stem of the stored lookup
    """
    mapping = load(name)
    # Join documents with a space. Summing the strings glued the last word of
    # one document onto the first word of the next, creating bogus tokens.
    cluster_texts = df.groupby(name)['data'].agg(' '.join)

    n_cols = 3
    # Keep the original 7x3 grid, growing only when there are more than 21 clusters.
    n_rows = max(7, -(-len(cluster_texts) // n_cols))
    fig = plt.figure(figsize=(20, 25))
    # Iterate over the groups themselves so label and text always belong together.
    for position, (label, text) in enumerate(cluster_texts.items(), start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        wordcloud = WordCloud().generate(text)
        ax.set_title("WordCloud " + mapping[label])
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')

In [None]:
# clusters_wordcloud('clusters')

In [None]:
# clusters_wordcloud('clusters_2')

In [None]:
# clusters_wordcloud('clusters_3')

In [None]:
df.head()

# TF-IDF

In [None]:
# Turn the text column into a TF-IDF matrix with one row per document of df.
text_data = df[TEXT_COLUMN].tolist()

# Default TfidfVectorizer settings; `vectors` feeds the decomposition and
# dendrogram helpers defined in later cells.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(text_data)

In [None]:
vectors

In [None]:
cosine_similarity(vectors).shape

In [None]:
def reduced_results(col, results):
    """Pair 2-D coordinates with the labels in ``df[col]`` as scatter keyword arguments.

    Rows whose ``x`` coordinate is 60 or more are dropped before returning.

    :param col: label column of the global ``df`` used for the point colours
    :param results: two-column array with one row per row of the global ``df``
    :return: dict with keys ``x``, ``y`` and ``c`` holding NumPy arrays
    """
    coords = pd.DataFrame(data=results, index=df.index, columns=["x", "y"])
    labelled = pd.concat([df[[col]], coords], axis=1)
    kept = labelled[labelled['x'] < 60]
    return dict(x=kept['x'].values, y=kept['y'].values, c=kept[col].values)

def plot_clusters_svd(vectors):
    """Scatter-plot the clusters after reducing ``vectors`` to two SVD components.

    :param vectors: document-term matrix with one row per row of the global ``df``
    """
    svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
    projected = svd.fit_transform(vectors)
    # plot_clusters expects (func, df, title) and calls func(df, col); reduced_results
    # takes only the column (it reads the global df), so adapt it and pass df explicitly.
    plot_clusters(
        lambda _df, col: reduced_results(col, results=projected),
        df,
        'scatter plot for clusters SVD',
        type='lld'
    )

def plot_clusters_two_decompositions(vectors, with_text_features=True, n_components=300):
    """Scatter-plot the clusters after SVD followed by a 2-component PCA.

    :param vectors: document-term matrix with one row per row of the global ``df``
    :param with_text_features: when true, the imputed and standardised text
        features of ``df`` are concatenated to the SVD output before the PCA
    :param n_components: number of SVD components to keep (previously ignored:
        the SVD was hard-coded to 15 components whatever was passed)
    """
    svd = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)
    svd_features = svd.fit_transform(vectors)

    combined = svd_features
    if with_text_features:
        text_features = df[['word_count', 'length', 'word_density', 'compound', 'neg', 'neu', 'pos']]
        text_features = SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(text_features)
        text_features = StandardScaler().fit_transform(text_features)
        combined = np.concatenate([np.array(text_features), svd_features], axis=1)
    pca = PCA(n_components=2, random_state=42)
    projected = pca.fit_transform(combined)
    # plot_clusters expects (func, df, title) and calls func(df, col); reduced_results
    # takes only the column (it reads the global df), so adapt it and pass df explicitly.
    plot_clusters(
        lambda _df, col: reduced_results(col, results=projected),
        df,
        'scatter plot for clusters SVD & then PCA',
        type='lld'
    )

In [None]:
# plot_clusters_svd(vectors)

In [None]:
# plot_clusters_two_decompositions(vectors)

In [None]:
# plot_clusters_two_decompositions(vectors, with_text_features=False, n_components=15)

In [None]:
# plot_clusters_two_decompositions(vectors, with_text_features=False, n_components=300)

In [None]:
# plot_clusters_two_decompositions(vectors, with_text_features=False, n_components=1000)

In [None]:
def plot_linkage_matrix(linkage_matrix, pic_save=True, ylimit=None, title="ward_clusters", truncate_mode=None, p=5):
    """Draw a dendrogram for ``linkage_matrix`` and optionally save it as a PNG.

    :param linkage_matrix: linkage matrix accepted by ``scipy.cluster.hierarchy.dendrogram``
    :param pic_save: when true, write the figure to ``<title>.png`` at 500 dpi
    :param ylimit: optional ``(bottom, top)`` pair passed to ``Axes.set_ylim``
    :param title: file stem used when saving the picture
    :param truncate_mode: optional ``truncate_mode`` forwarded to ``dendrogram``
    :param p: ``p`` forwarded to ``dendrogram``; only used with ``truncate_mode``
    """
    fig, ax = plt.subplots(figsize=(15, 20))

    kwargs = {
        'leaf_rotation': 90.,  # rotates the x axis labels
        'leaf_font_size': 8.,  # font size for the x axis labels
        'show_contracted': False,  # to get a distribution impression in truncated branches
        'show_leaf_counts': False,  # otherwise numbers in brackets are counts
    }
    if truncate_mode:
        kwargs.update({
            'truncate_mode': truncate_mode,
            'p': p,
        })

    # Draw on the axes created above instead of relying on the implicit current axes.
    dendrogram(linkage_matrix, ax=ax, **kwargs)

    if ylimit:
        ax.set_ylim(*ylimit)

    # Hide x ticks and labels. These must be booleans: the string 'off' is truthy,
    # so current matplotlib releases would leave the ticks switched on.
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off

    fig.tight_layout()
    if pic_save:
        fig.savefig('{}.png'.format(title), dpi=500)

        
def illustrate_dendrogram(vectors, vtype="tfidf"):
    """Ward-cluster ``vectors`` on cosine distance and draw two dendrograms.

    One plot is truncated to 5 levels with a y-limit, the other is the full tree;
    both are saved under names derived from ``vtype``.

    :param vectors: document vectors, one row per document
    :param vtype: label used in the saved picture names
    :return: the linkage matrix produced by ``ward``
    """
    # Local import: the notebook only imports squareform in a later cell.
    from scipy.spatial.distance import squareform

    distances = 1 - cosine_similarity(vectors)
    # Rounding can leave tiny negative values or a non-zero diagonal.
    np.clip(distances, 0, None, out=distances)
    np.fill_diagonal(distances, 0)
    # ward() treats a 2-D array as observation vectors, so a square distance
    # matrix was clustered as if each row were a data point. Pre-computed
    # distances must be passed in condensed (1-D) form.
    # NOTE(review): Ward linkage is formally defined for Euclidean distances;
    # confirm that cosine distance is the intended input.
    linkage_matrix = ward(squareform(distances, checks=False))
    plot_linkage_matrix(linkage_matrix,
                        ylimit=(1.2, None),
                        truncate_mode='level', p=5,
                        title="dendrogram_{}_truncate_mode_{}_p_{}_with_ylimit".format(vtype, 'level', 5))
    plot_linkage_matrix(linkage_matrix, title="dendrogram_{}_full".format(vtype))
    return linkage_matrix

In [None]:
# linkage_matrix = illustrate_dendrogram(vectors, vtype="tfidf");
# # plot_linkage_matrix(linkage_matrix, ylimit=(1.2, None), truncate_mode='level', p=5)

In [None]:
# plot_linkage_matrix(linkage_matrix, truncate_mode='level', p=6)

In [None]:
# save(linkage_matrix, 'tfidf_wrap_cosdist')

In [None]:
import editdistance
from scipy.spatial.distance import pdist, squareform
editdistance.eval('banana', 'bahama')

In [None]:
# 
# Y = squareform(pdist(np.array(df['data'].values).reshape(-1,1), np.vectorize(editdistance.eval)))

In [None]:
# Y.shape

In [None]:
# save(Y, 'levdis')

In [None]:
import re
def clean_numbers(s: str) -> str:
    """Remove digits, lower-case the text and blank out non-word characters

    Digits are removed first; the remainder is stripped of surrounding
    whitespace and lower-cased; finally every character matched by ``\\W`` is
    replaced with a single space.

    :param s: input string
    :return: result string
    """
    without_digits = ''.join(filter(lambda ch: not ch.isdigit(), s))
    normalised = without_digits.strip().lower()
    return re.sub(r'\W', ' ', normalised)

In [None]:
# Clean every document and keep only its first 1000 characters.
z = [clean_numbers(doc)[:1000] for doc in df['data'].values]

In [None]:
new_arr = np.array(z).reshape(-1,1)

In [None]:
len(new_arr)

In [None]:
b = new_arr

In [None]:
len(b)

In [None]:
# sorted({len(b[i][0]) for i in range(len(b))}, reverse=True)

In [None]:
# Wrap editdistance.eval with np.vectorize; it is passed to pdist as the metric in the next cell.
function = np.vectorize(editdistance.eval)

In [None]:
# Pairwise edit distances between the rows of b, expanded from pdist's condensed
# output into a square matrix.
q = squareform(pdist(b, function))

In [None]:
# Persist the distance matrix with the notebook's pickle helper; `save_mapping` is not
# defined anywhere in this notebook, so the original call raised NameError.
save(q, 'levinstein')

In [None]:
qwer = '12345678'

In [None]:
qwer-