# K-means clustering

## Imports

In [11]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
from nltk.corpus import stopwords



# Load the question data set and keep only its "Question" column.
df = pd.read_csv("questions1.csv")
df1 = df["Question"]
# The parenthesised single-argument form prints the same under Python 2 and 3.
print(df1[0:10])

0                   Is the person wearing glasses?
1                      Does the person have bangs?
2                      Does the person have bangs?
3                Does the person have blonde hair?
4                 Does the person have short hair?
5            Does the person ave her hair tied up?
6    Does the person have her hair in a ponytail? 
7          Does the person have her hair in a bun?
8                    Does the person have glasses?
9                      Does the person have bangs?
Name: Question, dtype: object


## Manual preprocessing

In [13]:
# Normalise every question in `df1` in four steps, keeping each intermediate
# result under its own name:
#   allsentences - lower-cased tokens per question, "[comma]" marker removed
#   lista        - those tokens re-joined into one string per question
#   listb        - tokens per question with edge punctuation and stop words removed
#   listc        - the cleaned tokens re-joined into one string per question

# Matches a token containing the literal text "[comma]"; group 1 is whatever
# precedes its last occurrence, so the marker and anything after it inside
# the same token are discarded.
comma_marker = re.compile(r'(.*)(\[comma])')

# Build the stop-word set once instead of re-reading the corpus for every
# single word.  The corpus is reached through `nltk.corpus` rather than the
# bare name `stopwords`, because a later cell rebinds that name to a plain
# list, which would make this cell fail when it is run again.
english_stopwords = set(nltk.corpus.stopwords.words("english"))

allsentences = []
for question in df1:
    sentence = []
    for word in question.split():
        match = comma_marker.search(word)
        sentence.append((match.group(1) if match else word).lower())
    allsentences.append(sentence)

lista = [" ".join(sentence) for sentence in allsentences]

listb = []
for question in lista:
    # Only quotes, commas and full stops are trimmed from the token edges;
    # other punctuation such as "?" is kept at this stage.
    trimmed = [word.strip("\',.") for word in question.split()]
    listb.append([word for word in trimmed if word not in english_stopwords])

listc = [" ".join(tokens) for tokens in listb]

print(listc[0:10])

['person wearing glasses?', 'person bangs?', 'person bangs?', 'person blonde hair?', 'person short hair?', 'person ave hair tied up?', 'person hair ponytail?', 'person hair bun?', 'person glasses?', 'person bangs?']


### Optional: removing some specific words

In [14]:
# Drop every token that contains one of these fragments.  Matching is by
# substring, so "wear" also removes "wearing" and "wears", and "person"
# also removes "persons".
removed_fragments = ("person", "wear")

# listd: the surviving tokens of each question in `listc`.
listd = [
    [word for word in question.split()
     if not any(fragment in word for fragment in removed_fragments)]
    for question in listc
]

# liste: tokens re-joined per question, with leading/trailing "?" stripped.
liste = [" ".join(words).strip("?") for words in listd]

# Slicing (rather than indexing 0..9) also works when fewer than ten
# questions are available.
for question in liste[:10]:
    print(question)

glasses
bangs
bangs
blonde hair
short hair
ave hair tied up
hair ponytail
hair bun
glasses
bangs


## Automatic preprocessing

In [15]:
def tokenize_and_stem(text):
    """Return the stems of all letter-bearing tokens found in *text*.

    The text is split into sentences and then into words, so punctuation
    ends up as its own token.  Tokens without any ASCII letter (numbers,
    raw punctuation) are dropped, and the remaining ones are passed through
    the module-level ``stemmer`` object's ``stem`` method.
    """
    words = [word
             for sentence in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sentence)]
    alphabetic = [word for word in words if re.search('[a-zA-Z]', word)]
    return [stemmer.stem(word) for word in alphabetic]

def tokenize_only(text):
    """Return the lower-cased, letter-bearing tokens found in *text*.

    The text is split into sentences and then into words, so punctuation
    ends up as its own token.  Tokens without any ASCII letter (numbers,
    raw punctuation) are dropped; no stemming is applied.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        lowered.extend(word.lower() for word in nltk.word_tokenize(sentence))
    return [word for word in lowered if re.search('[a-zA-Z]', word)]

In [16]:
# NOTE(review): this rebinds `stopwords` -- imported from nltk.corpus as the
# corpus reader -- to a plain list of English stop words, so any code that
# calls `stopwords.words(...)` after this cell has run will fail.
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

# Flat vocabularies over every entry of `liste`: the stemmed tokens and the
# plain tokens, accumulated in the same order.
totalvocab_stemmed, totalvocab_tokenized = [], []
for question in liste:
    totalvocab_stemmed.extend(tokenize_and_stem(question))
    totalvocab_tokenized.extend(tokenize_only(question))

In [17]:
# One row per token occurrence: the stem is the index and the un-stemmed
# token is the 'words' column, so a stem can be mapped back to a readable word.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
# The parenthesised single-argument form prints the same under Python 2 and 3.
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 5136 items in vocab_frame


## Tf-idf and document similarity

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(listc) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

Wall time: 1.15 s
(3237, 3)


In [19]:
# Feature names of the fitted vectorizer.  `get_feature_names` was removed
# from newer scikit-learn releases in favour of `get_feature_names_out`,
# so use the new accessor when it exists and keep `terms` a plain list.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance between all rows of the tf-idf matrix.
dist = 1 - cosine_similarity(tfidf_matrix)

## K-means

In [20]:
from sklearn.cluster import KMeans

num_clusters = 6

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 330 ms


In [21]:
# `sklearn.externals.joblib` was removed from newer scikit-learn releases;
# prefer the standalone `joblib` package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model and read it straight back.  To reuse a model
# saved by an earlier run instead of overwriting it, comment out the
# `joblib.dump` line.
# NOTE: joblib.load unpickles the file; only load files you trust.
joblib.dump(km,  'doc_cluster.pkl')
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()
#print clusters

In [22]:
# Pair every cleaned question with its cluster label; the cluster labels are
# also used as the row index.
product = dict(Question=listc, Cluster=clusters)
frame = pd.DataFrame(
    data=product,
    index=[clusters],
    columns=["Cluster", 'Question'],
)
# Number of questions assigned to each cluster.
frame.loc[:, 'Cluster'].value_counts()

1    942
0    938
3    404
4    367
2    340
5    246
Name: Cluster, dtype: int64

##### Print a sample of cluster 5

In [25]:
# First ten rows whose cluster label is 5.
frame[frame['Cluster'] == 5].head(10)

Unnamed: 0,Cluster,Question
5,5,see shirt wearing black shirt?
5,5,wearing glasses?
5,5,wearing glasses?
5,5,wearing different shirt everyone else?
5,5,wear glasses?
5,5,wearing two shirts?
5,5,wearing necklace?
5,5,wearing necklace?
5,5,wearing shirt word life impact it?
5,5,wearing collared shirt?
