In [9]:
### Installs ###
!pip install apyori



In [12]:
### IMPORTS ###
#Clustering
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from apyori import apriori

# String Processing
import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


In [15]:
### FUNCTIONS ###
def plot_dendrogram(model, **kwargs):
    """Plot a dendrogram for a fitted hierarchical clustering model.

    Parameters
    ----------
    model : object exposing ``children_`` (one row of two child ids per merge),
        e.g. a fitted ``AgglomerativeClustering``. If it also exposes
        ``distances_`` those are used as merge heights.
    **kwargs : forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.

    Returns
    -------
    dict
        Whatever ``dendrogram`` returns (callers may ignore it).
    """
    # Children of hierarchical clustering: row i holds the two nodes merged at step i
    children = np.asarray(model.children_)
    n_merges = children.shape[0]
    # Ids below n_samples are original observations; id n_samples + i is merge i
    n_samples = n_merges + 1

    # Use the real merge distances when the model computed them; otherwise
    # fall back to uniform, increasing heights purely for plotting.
    distances = getattr(model, 'distances_', None)
    if distances is None:
        distances = np.arange(n_merges)

    # Number of original observations under each merge (needed by dendrogram
    # for leaf counts when the tree is truncated).
    counts = np.zeros(n_merges)
    for merge_idx, merge in enumerate(children):
        size = 0
        for child in merge:
            if child < n_samples:
                size += 1  # leaf node
            else:
                size += counts[child - n_samples]  # previously formed cluster
        counts[merge_idx] = size

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distances, counts]).astype(float)

    # Plot the corresponding dendrogram
    return dendrogram(linkage_matrix, **kwargs)
    
def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori results into one DataFrame row per rule.

    Parameters
    ----------
    results : iterable of rule sets; each must expose ``support`` and
        ``ordered_statistics``, whose entries expose ``items_base``,
        ``items_add``, ``confidence`` and ``lift``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Left_side``, ``Right_side``, ``Support``, ``Confidence``,
        ``Lift``. Item sets are rendered as comma-joined strings.
    """
    columns = ['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift']

    def _join_items(items):
        # Item sets are unordered; sort so the text is reproducible between
        # runs, and stringify so non-string items do not break the join.
        return ','.join(sorted(str(item) for item in items))

    # items_base = left side of rules, items_add = right side
    # support, confidence and lift for respective rules
    rules = [
        [_join_items(rule.items_base), _join_items(rule.items_add),
         rule_set.support, rule.confidence, rule.lift]
        for rule_set in results
        for rule in rule_set.ordered_statistics
    ]

    # typecast it to pandas df
    return pd.DataFrame(rules, columns=columns)

def lemmatize(token, tag):
    """Return the lemma of ``token`` given its part-of-speech ``tag``.

    The first character of ``tag`` selects the WordNet part of speech
    (N -> noun, V -> verb, R -> adverb, J -> adjective); any other prefix is
    treated as a noun. Relies on the module-level ``lemmatizer`` object.
    """
    # First letter of the tag -> WordNet POS constant
    wordnet_pos_by_prefix = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}

    prefix = tag[0]
    if prefix in wordnet_pos_by_prefix:
        wordnet_pos = wordnet_pos_by_prefix[prefix]
    else:
        # unknown tag family: fall back to noun
        wordnet_pos = wn.NOUN

    return lemmatizer.lemmatize(token, wordnet_pos)

def cab_tokenizer(document):
    """Tokenise ``document`` into a list of lemmas.

    Each sentence is word-tokenised and POS-tagged; every token is lowercased
    and stripped of surrounding whitespace, underscores and asterisks. Tokens
    found in the module-level ``stopwords`` set, and tokens made up only of
    characters in the module-level ``punct`` set, are dropped. The remaining
    tokens are lemmatised with ``lemmatize`` and returned in document order.
    """
    lemmas = []

    for sentence in sent_tokenize(document):
        # tag the words of this sentence with their part of speech
        tagged_words = pos_tag(wordpunct_tokenize(sentence))

        for raw_token, pos in tagged_words:
            # normalise case and trim decoration characters
            cleaned = raw_token.lower().strip().strip('_').strip('*')

            # skip stopwords and punctuation-only tokens
            # (an empty token counts as punctuation-only)
            is_stopword = cleaned in stopwords
            if is_stopword or all(ch in punct for ch in cleaned):
                continue

            lemmas.append(lemmatize(cleaned, pos))

    return lemmas

# function to visualise text cluster. Useful for the assignment too :)
def visualise_text_cluster(n_clusters, cluster_centers, terms, num_word = 5):
    """Print the highest-weighted terms of each cluster centre.

    Parameters
    ----------
    n_clusters : number of clusters (rows of ``cluster_centers``) to print.
    cluster_centers : 2-D array of cluster centres of a fitted KMeans/other
        centroid-based clustering; one row per cluster, one column per term.
    terms : sequence mapping a column index to its term text.
    num_word : number of terms to show per cluster. Change as you please.

    Output is one line per cluster.
    """
    # column indices of each centre sorted by descending weight
    ordered_centroids = cluster_centers.argsort()[:, ::-1]

    for cluster in range(n_clusters):
        print("Top terms for cluster {}:".format(cluster), end=" ")
        # honour num_word instead of a hard-coded 5
        for term_idx in ordered_centroids[cluster, :num_word]:
            print(terms[term_idx], end=', ')
        # finish the line so each cluster is printed on its own row
        print()
            
# creating tf-idf terms - a bit slow, do it occasionally
def calculate_tf_idf_terms(document_col):
    """Compute term frequency and document frequency for every term.

    Parameters
    ----------
    document_col : collection of raw document text that you want to analyse.

    Returns
    -------
    list of dict
        One dict per vocabulary term with keys ``term`` (the text), ``idx``
        (its column index), ``tf`` (total occurrences across all documents)
        and ``df`` (number of documents containing the term).
    """
    from sklearn.feature_extraction.text import CountVectorizer

    # use count vectorizer to find TF and DF of each term
    count_vec = CountVectorizer(tokenizer=cab_tokenizer, ngram_range=(1,2))
    # fit on the argument, not on a global DataFrame
    X_count = count_vec.fit_transform(document_col)

    # reduce over documents once instead of slicing one sparse column per term
    term_freq = np.asarray(X_count.sum(axis=0)).ravel()
    doc_freq = np.asarray((X_count != 0).sum(axis=0)).ravel()

    # create list of terms and their tf and df
    terms = [{'term': t, 'idx': idx,
              'tf': term_freq[idx],
              'df': int(doc_freq[idx])}
             for t, idx in count_vec.vocabulary_.items()]

    return terms

# visualisation of ZIPF law
def visualise_zipf(terms, itr_step = 50):
    """Plot document frequency against term-frequency rank.

    Parameters
    ----------
    terms : collection of term dictionaries (keys ``term``, ``tf``, ``df``)
        from the calculate_tf_idf_terms function. It is not modified.
    itr_step : used to control how many terms that you want to plot.
        Num of terms to plot = N terms / itr_step.

    Returns ``None``. With no terms there is nothing to plot and the function
    returns immediately.
    """
    from math import sqrt

    if len(terms) == 0:
        return

    # rank terms by frequency on a copy so the caller's list keeps its order
    ranked = sorted(terms, key=lambda x: (x['tf'], x['df']), reverse=True)

    # select a few of the terms for plotting purpose
    sel_terms = [ranked[i] for i in range(0, len(ranked), itr_step)]
    labels = [term['term'] for term in sel_terms]
    doc_freqs = [term['df'] for term in sel_terms]
    ranks = range(len(sel_terms))

    # plot term frequency ranking vs its DF
    plt.plot(ranks, doc_freqs)
    plt.xlabel('Term frequency ranking')
    plt.ylabel('Document frequency')

    max_x = len(sel_terms)
    max_y = max(doc_freqs)

    # annotate the points
    prev_x, prev_y = 0, 0
    for label, x, y in zip(labels, ranks, doc_freqs):
        # calculate the relative distance between labels to increase visibility
        x_dist = (abs(x - prev_x) / float(max_x)) ** 2
        y_dist = (abs(y - prev_y) / float(max_y)) ** 2
        scaled_dist = sqrt(x_dist + y_dist)

        # only label points far enough from the last labelled one
        if (scaled_dist > 0.1):
            plt.text(x+2, y+2, label, {'ha': 'left', 'va': 'bottom'}, rotation=30)
            prev_x, prev_y = x, y

    plt.show()

In [4]:
### PRE-PROCESSING ###
def _PerpData (df):
    """Replace empty-string cells with ``np.nan`` in every column of ``df``.

    The frame is updated column by column in place and the same object is
    returned.
    """
    for column in df.columns.tolist():
        cleaned = df[column].replace('', np.nan)
        df[column] = cleaned

    return df

In [None]:
### MAIN ###
# Load the census CSV with NA detection switched off, then let _PerpData
# turn the resulting empty strings into np.nan.
data = _PerpData(pd.read_csv('census2000.csv', na_filter=False))

<h1>Clustering</h1>
<h3>K-Means</h3>

In [None]:
### KMeans ###
# take 3 variables and drop the rest; copy so that adding columns to
# data_kmeans later does not write into a slice of `data`
data_kmeans = data[['MedHHInc', 'MeanHHSz', 'RegDens']].copy()

# convert the frame to a NumPy matrix
# (DataFrame.as_matrix() was removed in pandas 1.0; .values works on all versions)
X = data_kmeans.values

# scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

# random state, we will use 42 instead of 10 for a change
rs = 42

# fit with two different n_clusters values and report each model.
# set the random state. different random state seeds might result in different centroids locations
for k in (3, 8):
    model = KMeans(n_clusters=k, random_state=rs)
    model.fit(X)

    # sum of intra-cluster distances
    print("Sum of intra-cluster distance:", model.inertia_)

    print("Centroid locations:")
    for centroid in model.cluster_centers_:
        print(centroid)
    


<h4>Agglomerative Clustering | (Alternative to Kmeans Clustering)</h4>

In [None]:
# Fit agglomerative clustering on a subset of X (only the first 50 data points)
# and draw the merge tree, labelling each leaf with its cluster assignment.
agg_model = AgglomerativeClustering(n_clusters=3).fit(X[:50])

plot_dendrogram(agg_model, labels=agg_model.labels_)
plt.show()

<h3>Understanding and Visualising a Clustering Model</h3>
<h4>Method 1: Pair Plots</h4>

In [None]:
model = KMeans(n_clusters=3, random_state=rs).fit(X)

# assign cluster ID to each record in X
y = model.predict(X)
# assign() builds a new frame, so no SettingWithCopyWarning is raised and
# re-running the cell is safe
data_kmeans = data_kmeans.assign(Cluster_ID=y)

# how many records are in each cluster
print("Cluster membership")
print(data_kmeans['Cluster_ID'].value_counts())

# pairplot the cluster distribution.
cluster_g = sns.pairplot(data_kmeans, hue='Cluster_ID')
plt.show()

<h4>Method 2: Distribution Plots</h4>

In [None]:
# prepare the columns and the number of bins. Increase n_bins for finer detail, but 20 is more than enough
cols = ['MedHHInc', 'MeanHHSz', 'RegDens']
n_bins = 20

# inspecting cluster 0 and 1
clusters_to_inspect = [0,1]

for cluster in clusters_to_inspect:
    print("Distribution for cluster {}".format(cluster))

    # records that belong to the cluster being inspected
    cluster_rows = data_kmeans[data_kmeans['Cluster_ID'] == cluster]

    # create subplots, one row per column
    fig, ax = plt.subplots(nrows=len(cols))
    ax[0].set_title("Cluster {}".format(cluster))

    for j, col in enumerate(cols):
        # n_bins bins need n_bins + 1 edges; Series.min()/max() skip NaN
        bins = np.linspace(data_kmeans[col].min(), data_kmeans[col].max(), n_bins + 1)
        # plot distribution of the cluster using histogram
        sns.distplot(cluster_rows[col], bins=bins, ax=ax[j], norm_hist=True)
        # overlay the distribution of the whole dataset as a black line
        sns.distplot(data_kmeans[col], bins=bins, ax=ax[j], hist=False, color="k")

    plt.tight_layout()
    plt.show()

<h3>Determine K</h3>
<h4>Method 1: Elbow method </h4>

In [None]:
# K values to try; shared by the training loop and the plot
k_values = range(2, 15, 2)

# list to save the clusters and cost
clusters = []
inertia_vals = []

# this whole process should take a while
for k in k_values:
    # train clustering with the specified K
    # (the n_jobs argument was removed from KMeans in scikit-learn 1.0)
    model = KMeans(n_clusters=k, random_state=rs)
    model.fit(X)

    # append model to cluster list
    clusters.append(model)
    inertia_vals.append(model.inertia_)

# plot the inertia vs K values
plt.plot(k_values, inertia_vals, marker='*')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Inertia')
plt.show()

<h4>Method 2: Silhouette Score </h4>

In [None]:
# Score the 2nd and 3rd models fitted in the elbow cell. The K in the label is
# read from the model itself, so it cannot disagree with what was scored.
for fitted in clusters[1:3]:
    print(fitted)
    print("Silhouette score for k={}".format(fitted.n_clusters),
          silhouette_score(X, fitted.predict(X)))

<h2>Performing Association Mining</h2>
<h3>Apriori algorithm</h3>
<h4>Create Transactions</h4>

In [None]:
# Build one transaction (a list of items) per group.
# 'GROUP_BY_FIELD' and 'PROPERTY_FIELD' look like placeholders to be replaced
# with real column names.
# NOTE(review): no cell above this one assigns `df` — confirm which frame is
# meant before running top to bottom.
transactions = (
    df.groupby(['GROUP_BY_FIELD'])['PROPERTY_FIELD']
      .apply(list)
)

print(transactions.head(5))

<b><u>Possible Parameters</u></b>

1. `transactions`: list of list of items in transactions (eg. [['A', 'B'], ['B', 'C']]).
2. `min_support`: Minimum support of relations in float percentage. It specifies a minimum level of support to claim that items are associated (i.e. they occur together in the dataset). Default 0.1.
3. `min_confidence`: Minimum confidence of relations in float percentage. Default 0.0.
4. `min_lift`: Minimum lift of relations in float percentage. Default 0.0.
5. `max_length`: Max length of the relations. Default None.

In [None]:
# apriori wants plain Python lists, so unwrap the pandas Series first
transaction_list = transactions.tolist()
results = list(apriori(transaction_list, min_support=0.05))

# peek at the first 5 raw results
print(results[:5])

# flatten the results into a DataFrame with one row per rule
result_df = convert_apriori_results_to_pandas_df(results)

print(result_df.head(20))

## Get the most appropriate variables ##
# strongest rules first: order by lift, highest to lowest
result_df = result_df.sort_values('Lift', ascending=False)
print(result_df.head(10))

<h2>Text Mining</h2>
<h3>Data reading</h3>
<h4>Loading Text</h4>

In [None]:
# seed reused by the models in the text-mining cells below
rs = 42

# read the documents into a DataFrame
df = pd.read_json('datasets/Federalistpapers.json')

<h4>Data Exploration</h4>

In [None]:
# as usual, explore the dataset
df.info()

# print out the first 200 characters of the first row of text column
# (DataFrame.get_value was removed in pandas 1.0; .at is the label-based scalar accessor)
print(df.at[0, 'Text'][:200])

# average length of text column
print(df['Text'].apply(len).mean())

<h3>Data Preprocessing</h3>
<h4>Bag Of Words</h4>

1. **Lowercase**: cast each word into its lowercased version. Ensure differently capitalised words are treated as the same word.
2. **Punctuation** removal: remove all punctuation marks. We are only interested in the words.
3. **Part of speech** filtering: keep tokens with certain part-of-speech only. In particular, here we are interested in adjectives, adverbs, nouns and verbs.
4. **Lemmatisation**: reducing words/tokens into their base form. 

> #### Lemmatisation vs stemming
>   There are minor differences between Lemmatisation and **stemming** that was
>   introduced in the lecture. Stemming usually refers to a heuristic process
>   that chops off the ends of words to get the base forms of the words.
>   Lemmatisation usually refers to the process of doing similar things, but using
>   additional information of how the word is used in the context (semantic
>   information). Lemmatisation returns the dictionary form of a word, also
>   known as lemma. Given the word "ponies" for example, stemming will return
>   "poni" (incorrect), while lemmatisation will return "pony" (correct). The
>   drawback of lemmatisation is it requires additional analysis to obtain the
>   required semantic information.
>   [Source](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)

In [None]:
# initialise WordNet lemmatizer and punctuation filter
lemmatizer = WordNetLemmatizer()
punct = set(string.punctuation)

# load the provided stopwords
df_stop = pd.read_json('datasets/Federaliststop.json')

# join provided stopwords with the default NLTK English stopwords
stopwords = set(df_stop['Term']).union(set(sw.words('english')))

# tf idf vectoriser
tfidf_vec = TfidfVectorizer(tokenizer=cab_tokenizer, ngram_range=(1,2))
X = tfidf_vec.fit_transform(df['Text'])

# see the number of unique tokens produced by the vectorizer. Lots of them...
# (vocabulary_ has one entry per token and exists in every scikit-learn
# version; get_feature_names() was removed in 1.2)
print(len(tfidf_vec.vocabulary_))

<h3> Document Analysis</h3>
<h4>Kmeans</h4>

In [None]:
# Use the elbow and silhouette methods to get the optimal K
# {Not Included}
# K means clustering using the term vector
kmeans = KMeans(n_clusters=7, random_state=rs).fit(X)

# terms ordered by column index. Built from vocabulary_ because
# get_feature_names() was removed in scikit-learn 1.2 and
# get_feature_names_out() does not exist before 1.0.
tfidf_terms = sorted(tfidf_vec.vocabulary_, key=tfidf_vec.vocabulary_.get)

# visualize it
visualise_text_cluster(kmeans.n_clusters, kmeans.cluster_centers_, tfidf_terms)

<h3>Feature Selection and Transformation</h3>
<h4>Zipf's Law and Document Frequency Filtering</h4>

In [None]:
terms = calculate_tf_idf_terms(df['Text'])

visualise_zipf(terms)

# another tf idf vectoriser
# limit the terms produced to terms that occurred in min of 2 documents and max 80% of all documents
filter_vec = TfidfVectorizer(tokenizer=cab_tokenizer, ngram_range=(1,2), min_df=2, max_df=0.8)
X_filter = filter_vec.fit_transform(df['Text'])

# see the number of unique tokens produced by the vectorizer. Reduced!
# (vocabulary_ has one entry per token; get_feature_names() was removed in scikit-learn 1.2)
print(len(filter_vec.vocabulary_))

## TESTING THE FILTER ##
# K means clustering using the new term vector, time it for comparison to SVD
kmeans_fil = KMeans(n_clusters=7, random_state=rs).fit(X_filter)

# terms ordered by column index, built in a scikit-learn-version-independent way
filter_terms = sorted(filter_vec.vocabulary_, key=filter_vec.vocabulary_.get)

# visualisation
visualise_text_cluster(kmeans_fil.n_clusters, kmeans_fil.cluster_centers_, filter_terms)

<h4>Singular Value Decomposition</h4>

In [None]:
# use the shared seed `rs` rather than a second hard-coded 42
svd = TruncatedSVD(n_components=100, random_state=rs)
X_trans = svd.fit_transform(X_filter)

# sort the components by largest weighted word
sorted_comp = svd.components_.argsort()[:, ::-1]
# terms ordered by column index. Built from vocabulary_ because
# get_feature_names() was removed in scikit-learn 1.2 and
# get_feature_names_out() does not exist before 1.0.
terms = sorted(filter_vec.vocabulary_, key=filter_vec.vocabulary_.get)

# visualise word - concept/component relationships
for comp_num in range(10):
    print("Top terms in component #{}".format(comp_num), end=" ")
    for i in sorted_comp[comp_num, :5]:
        print(terms[i], end=", ")
    print()

# K-means clustering using LSA-transformed X
svd_kmeans = KMeans(n_clusters=7, random_state=rs).fit(X_trans)

# transform cluster centers back to original feature space for visualisation
original_space_centroids = svd.inverse_transform(svd_kmeans.cluster_centers_)

# visualisation
visualise_text_cluster(svd_kmeans.n_clusters, original_space_centroids, terms)