In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle


class nlp_preprocessor:
   
    def __init__(self, vectorizer=CountVectorizer(), tokenizer=None, cleaning_function=None, 
                 stemmer=None, model=None):
        """
        A class for pipelining our data in NLP problems. The user provides a series of 
        tools, and this class manages all of the training, transforming, and modification
        of the text data.
        ---
        Inputs:
        vectorizer: the model to use for vectorization of text data
        tokenizer: The tokenizer to use, if none defaults to split on spaces
        cleaning_function: how to clean the data, if None, defaults to the in built class
        """
        if not tokenizer:
            tokenizer = self.splitter
        if not cleaning_function:
            cleaning_function = self.clean_text
        self.stemmer = stemmer
        self.tokenizer = tokenizer
        self.model = model
        self.cleaning_function = cleaning_function
        self.vectorizer = vectorizer
        self._is_fit = False
        
    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')
        
    def clean_text(self, text, tokenizer, stemmer):
        """
        A naive function to lowercase all works can clean them quickly.
        This is the default behavior if no other cleaning function is specified
        """
        cleaned_text = []
        for post in text:
            cleaned_words = []
            for word in tokenizer(post):
                low_word = word.lower()
                if stemmer:
                    low_word = stemmer.stem(low_word)
                cleaned_words.append(low_word)
            cleaned_text.append(' '.join(cleaned_words))
        return cleaned_text
    
    def fit(self, text):
        """
        Cleans the data and then fits the vectorizer with
        the user provided text
        """
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        self.vectorizer.fit(clean_text)
        self._is_fit = True
        
    def transform(self, text):
        """
        Cleans any provided data and then transforms the data into
        a vectorized format based on the fit function. Returns the
        vectorized form of the data.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        return self.vectorizer.transform(clean_text)
    
    def save_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        pickle.dump(self.__dict__, open(filename+".mdl",'wb'))
        
    def load_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        if filename[-4:] != '.mdl':
            filename += '.mdl'
        self.__dict__ = pickle.load(open(filename,'rb'))

In [3]:
import pandas as pd

def convert_ids(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('int64')

movies_metadata_df = pd.read_csv('../data/the-movies-dataset/movies_metadata.csv'
                        , converters={'id': lambda x: convert_ids(x), 'imdb_id': lambda x: convert_ids(x)}
                       ,usecols=['id', 'original_title'
                                 , 'genres', 'homepage'
                                 ,'overview', 'popularity', 'poster_path'
                                 , 'release_date', 'revenue', 'runtime'
                                 ,'spoken_languages','tagline', 'title'
                                 ,'vote_average', 'vote_count'])

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
import re
def clean(text, tokenizer, stemmer):
    """
    Cleans Text with Regexes
    :param text:
    :return: text:
    """
    doc = ''.join(text).lower()
    doc = re.sub(r'[<>\{}/;|\[\]-]', ' ', doc)
    doc = re.sub(r'[0-9]', ' ', doc)
    doc = re.sub(r'\'', ' ', doc)
    doc = re.sub(r'=', ' ', doc)
    doc = re.sub(r':', ' ', doc)
    doc = re.sub(r'"', ' ', doc)
    doc = re.sub(r'\s+', ' ', doc)
    doc = re.sub(r'\(', ' ', doc)
    doc = re.sub(r'\)', ' ', doc)
    doc = re.sub(r'\s{2,}', ' ', doc)
    doc = re.sub(r'\.', '', doc)
    doc = re.sub(r',', '', doc)

    return doc

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer

nlp_pp = nlp_preprocessor(TfidfVectorizer(stop_words='english'), cleaning_function=clean)

In [78]:
corpus = ' '
sentences = [str(sentence) for sentence in movies_metadata_df.overview.tolist()]

In [86]:
cv_tfidf = TfidfVectorizer(min_df=3, stop_words='english', ngram_range=(2,3))

In [87]:
X_tfidf = cv_tfidf.fit_transform(sentences).toarray()

In [88]:
#tfidf_model_df = pd.DataFrame(X_tfidf, columns=cv_tfidf.get_feature_names())

In [89]:
from sklearn.decomposition import TruncatedSVD

In [142]:
svd = TruncatedSVD(n_components=30)

In [143]:
tfidf_model_df = pd.DataFrame(X_tfidf, columns=cv_tfidf.get_feature_names())

In [144]:
X = svd.fit_transform(tfidf_model_df)

In [145]:
#print(svd.explained_variance_ratio_.sum())

In [146]:
### Simpler Model, no NGrams, Less Components
for idx, topic in enumerate(svd.components_):
    print("Topic %d:" % (idx + 1))
    #print(topic)
    print([(cv_tfidf.get_feature_names()[i]) for i in topic.argsort()[:-18 - 1:-1]])

Topic 1:
['new york', 'new york city', 'york city', 'year old', 'world war', 'falls love', 'war ii', 'world war ii', 'streets new york', 'streets new', 'high school', 'true story', 'young woman', 'young man', 'tells story', 'small town', 'los angeles', 'york times']
Topic 2:
['world war', 'war ii', 'world war ii', 'true story', 'year old', 'falls love', 'second world war', 'second world', 'based true', 'end world war', 'end world', 'based true story', 'united states', 'small town', 'tells story', 'german army', 'days world war', 'days world']
Topic 3:
['year old', 'old boy', 'year old boy', 'high school', 'year old girl', 'old girl', '12 year old', '12 year', 'falls love', '15 year old', '15 year', 'old son', 'year old son', 'year old daughter', 'old daughter', '17 year', '17 year old', '16 year']
Topic 4:
['falls love', 'high school', 'young woman', 'true story', 'young man', 'based true', 'based true story', 'small town', 'school student', 'high school student', 'school students', 'b

['based true', 'based true story', 'true events', 'based true events', 'film based', 'film directed', 'short film', 'romantic comedy', 'police officer', 'real life', 'life story', 'film based true', 'second world', 'second world war', 'world war', 'old man', 'feature film', 'award winning']
Topic 29:
['san francisco', 'new life', 'years ago', 'start new', 'start new life', 'police officer', 'best friends', 'middle aged', 'life story', 'true love', 'short film', '20 years', 'new year', 'returns home', 'based true', 'single mother', 'car accident', 'old boy']
Topic 30:
['middle aged', 'years ago', 'middle aged man', 'aged man', 'based true', 'kung fu', 'based true story', 'christmas eve', 'film follows', 'single mother', 'life story', 'small time', 'middle aged woman', 'aged woman', 'teenage girl', 'middle aged couple', 'aged couple', 'feature length']
Topic 31:
['film directed', 'documentary film', 'middle aged', 'short film', 'romantic comedy', 'best friends', 'based novel', 'comedy fi

In [76]:
for idx, topic in enumerate(svd.components_):
    print("Topic %d:" % (idx + 1))
    #print(topic)
    print([(cv_tfidf.get_feature_names()[i]) for i in topic.argsort()[:-30 - 1:-1]])

Topic 1:
['nan', 'kid', 'love', 'jay', 'hae', 'myung', 'lin', 'star', 'xiang', 'life', 'xiao', 'set', 'implores', 'hua', 'guy kibbee', 'railroaded', 'women', 'kibbee', 'monastery', 'sylvia', 'walters', 'blackpool', 'gary cooper', 'rackets', 'embark road', 'downhill', 'village', 'cohorts', 'hall', 'racketeer']
Topic 2:
['life', 'young', 'man', 'love', 'new', 'family', 'film', 'story', 'world', 'old', 'woman', 'father', 'year', 'time', 'girl', 'years', 'town', 'war', 'mother', 'wife', 'lives', 'school', 'finds', 'home', 'friends', 'son', 'day', 'year old', 'daughter', 'people']
Topic 3:
['overview', 'overview available', 'available', 'movie overview', 'movie', 'terrence', '2005', 'plot', 'new world', 'accomplishments', 'org', 'rock legends', 'punk', 'sweeping', 'making', 'ongoing', 'space', 'vile', 'add', 'punk rock', 'legends', 'humanity', 'activities', 'shocking', 'russell', 'planet', 'notorious', 'plans', 'rock', 'future']
Topic 4:
['film', 'war', 'documentary', 'world', 'story', 'wor

['los', 'angeles', 'los angeles', 'wife', 'friends', 'world', 'group', 'daughter', 'husband', 'years', 'murder', 'best', 'lives', 'team', 'set', 'teenage', 'death', 'son', 'help', 'young', 'girl', 'past', 'band', 'affair', 'drama', 'relationship', 'true', 'friend', 'men', 'town']
Topic 29:
['home', 'los', 'angeles', 'los angeles', 'friends', 'documentary', 'house', 'war', 'boy', 'story', 'way', 'returns', 'film', 'gets', 'soon', 'trip', 'car', 'true', 'finds', 'dead', 'new', 'girlfriend', 'returns home', 'road', 'make', 'college', 'time', 'night', 'begins', 'brother']
Topic 30:
['time', 'wife', 'father', 'women', 'killer', 'men', 'girl', 'friends', 'documentary', 'serial', 'serial killer', 'house', 'day', 'falls', 'family', 'village', 'small', 'son', 'big', 'falls love', 'small time', 'woman', 'century', 'mysterious', 'evil', 'save', 'takes', 'business', 'beautiful', 'young girl']
Topic 31:
['movie', 'brother', 'gang', 'group', 'home', 'men', 'wife', 'village', 'police', 'friends', 'bo

In [186]:
km = KMeans(n_clusters=300, init='k-means++', max_iter=450, n_jobs=-1,)
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=450,
    n_clusters=300, n_init=10, n_jobs=-1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [195]:
km2 = KMeans(n_clusters=500, init='k-means++', max_iter=450, n_jobs=-1)
km2.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=450,
    n_clusters=500, n_init=10, n_jobs=-1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [201]:
km3 = KMeans(n_clusters=150, init='k-means++', max_iter=450, n_jobs=-1, random_state=51)
km3.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=450,
    n_clusters=150, n_init=10, n_jobs=-1, precompute_distances='auto',
    random_state=51, tol=0.0001, verbose=0)

In [187]:
clustered = zip(km.labels_, movies_metadata_df['id'])#zip(tfidf_model_df.index.values, km.labels_)

In [188]:
#### Needs Functionalization/Abstraction
from collections import defaultdict
movie_summaries_clustered = defaultdict(list)

for cluster, movie_id in clustered:
    movie_summaries_clustered[cluster].append(movie_id)
####

In [112]:
#cluster_distribution = [len(movies) for (clust, movies) in movie_summaries_clustered.items()]

In [113]:
def get_cluster_number(movie, cluster_zip):

    for cluster, movie_id in cluster_zip:
        
        if movie_id == movie:
            return cluster
    
    raise Exception('Movie not found in cluster')

In [114]:
#### Find the cluster of a movie by movie id
clustered = zip(km.labels_, movies_metadata_df['id'])#zip(tfidf_model_df.index.values, km.labels_)
get_cluster_number(862, clustered)
####

18

In [115]:
def get_movie_name(movie_id):
    return movies_metadata_df[movies_metadata_df.id == movie_id]['original_title'].iloc[0]

def get_all_movies_in_cluster(cluster_number, cluster_dict):
    
    movies = cluster_dict[cluster_number]
    return [get_movie_name(mov) for mov in movies]

In [190]:
#### List all the movies in a cluster 
get_all_movies_in_cluster(9, movie_summaries_clustered)
####

['The Journey of August King',
 'Heidi Fleiss: Hollywood Madam',
 'Feast of July',
 'A Simple Twist of Fate',
 'De eso no se habla',
 'Beauty and the Beast',
 'Moll Flanders',
 'Tarantella',
 'Le Hussard sur le toit',
 'That Darn Cat!',
 'Raw Deal',
 'Identificazione di una donna',
 'Lethal Weapon',
 'Shadow of a Doubt',
 'Jamaica Inn',
 'Shattered Image',
 'The Theory of Flight',
 "The Mummy's Ghost",
 'The Bedroom Window',
 'Stigmata',
 "L'ennui",
 'Coma',
 'The Ape',
 'Mansfield Park',
 'The Crow: Salvation',
 'Possession',
 'Daughter of Dr. Jekyll',
 'The In Crowd',
 'The Gift',
 'Bread and Roses',
 'Another Woman',
 'The Vanishing',
 "Don't Say a Word",
 'Keetje Tippel',
 'Lásky jedné plavovlásky',
 'Scarlet Street',
 'Divorcing Jack',
 'Oh Heavenly Dog',
 'Homicidal',
 'The Mystic Masseur',
 'The Sleepy Time Gal',
 'Tmavomodrý svět',
 'Secretary',
 'May',
 'Starstruck',
 'César et Rosalie',
 'Éxtasis',
 'Le Divorce',
 'Dans ma peau',
 'Rabid',
 'The Black Pirate',
 'EuroTrip',
 '

## Trying Agglomorative Clustering

In [123]:
from sklearn.cluster import AgglomerativeClustering

In [177]:
ac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='cosine')

In [178]:
hacked_X = X + .000000000000000000000001
#for val in hacked_X:
#    print(val)

In [179]:
# 0:8:30 Train time with (n_clusters=5, linkage='average', affinity='cosine')
# Less than 0:40: Train time with (n_clusters=25, linkage='average', affinity='cosine')
ac.fit(hacked_X)

AgglomerativeClustering(affinity='cosine', compute_full_tree='auto',
            connectivity=None, linkage='average', memory=None,
            n_clusters=300, pooling_func='deprecated')

In [180]:
clustered_ac = zip(ac.labels_, movies_metadata_df['id'])
#### Needs Functionalization/Abstraction
from collections import defaultdict
movie_summaries_clustered = defaultdict(list)

for cluster, movie_id in clustered_ac:
    movie_summaries_clustered[cluster].append(movie_id)
####

In [185]:
#### List all the movies in a cluster 
get_all_movies_in_cluster(140, movie_summaries_clustered)
####

['Wolves',
 'Americathon',
 'Sweet Karma',
 'Reel Injun',
 'Snake River Desperadoes',
 'Ambush at Cimarron Pass',
 'Roar: Tigers of the Sundarbans',
 'Die Söhne der großen Bärin',
 'A Ghost in Monte Carlo',
 'Texas Rising']

## Comparing KMeans and Agglomoration

In [192]:
from sklearn.metrics import silhouette_score

In [199]:
print(f'Silhouette score for KMeans, n=300 is {silhouette_score(X, km.labels_)}')

Silhouette score for KMeans, n=300 is 0.26716533390124225


In [198]:
print(f'Silhouette score for AGClustering, type average, n=300 is {silhouette_score(X, ac.labels_)}')

Silhouette score for AGClustering, type average, n=300 is -0.3955892904699971


In [200]:
print(f'Silhouette score for KMeans, n=500 is {silhouette_score(X, km2.labels_)}')

Silhouette score for KMeans, n=500 is 0.23794972918128876


In [202]:
print(f'Silhouette score for KMeans, n=150 is {silhouette_score(X, km3.labels_)}')

Silhouette score for KMeans, n=150 is 0.5047943348736282
