In [1]:
from dataframe_handling import create_df
import numpy as np
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [2]:
df= create_df()

In [3]:
df.category_name[:10]

0      outdoors/adventure
1         career/business
2                 fitness
3        health/wellbeing
4                    LGBT
5        health/wellbeing
6                   music
7              book clubs
8    new age/spirituality
9         career/business
Name: category_name, dtype: object

In [4]:
df.description[:10]

0    World Traveling photographers is a group for n...
1    Do you work at or with San Francisco Bay Area ...
2    Voted BEST RUNNING CLUB in the Bay AREA Go WOW...
3    What is supportive of a healthy body, mind, he...
4    The purpose ofSan Francisco Walking Dykes grou...
5    Coaches inspire our clients to be the best the...
6    1) Instant Chorus gets you singing a cappella ...
7    Greetings. There is hardly anything I enjoy mo...
8    Explore your true potential through harnessing...
9    What is Fore Us? Fore Us is a loosely based or...
Name: description, dtype: object

In [5]:
df.sponsor_details[2]

[u'San Francisco A List Voted Go WOW Team Best Running Club in the Bay Area',
 u'Stephanie K Atwood Enterprises is the dream business of Founder Stephanie Atwood, offering lifelong fitness to women everywhere through running, walking, and the outdoors. WOW offers all the pieces to all women, all sizes, all shapes, all abilities, to pursue their "athlete within" and discover their true potential. Through neighborhood groups, live support calls, and the 15 minute learning series, you have access to train and grow wherever you live. Join us soon!',
 u'The See Jane Run Half Marathon and 5K are full of chocolate, champagne, and women who can call themselves athletes.',
 u'Train with WOW, get a registration discount for the ORF.',
 nan,
 u"Use the gymboss for many different workouts and times. It's great!"]

In [6]:
additional_stopwords = {'com','www'}

In [7]:
count_vectorizer = CountVectorizer(analyzer='word',
                                  ngram_range=(1, 3), min_df=2,
        stop_words=text.ENGLISH_STOP_WORDS.union(additional_stopwords),
                                  token_pattern='\\b[a-z][a-z]+\\b')

In [8]:
count_vectorizer.fit(df.description.dropna())

CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 3), preprocessor=None,
        stop_words=frozenset(['all', 'show', 'anyway', 'four', 'go', 'mill', 'find', 'seemed', 'whose', 're', 'herself', 'whoever', 'behind', 'should', 'to', 'only', 'under', 'herein', 'do', 'his', 'get', 'very', 'de', 'myself', 'cannot', 'every', 'yourselves', 'him', 'is', 'cry', 'beforehand', 'these', 'sh...ho', 'most', 'eight', 'but', 'nothing', 'why', 'noone', 'sometimes', 'together', 'serious', 'once']),
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
count_vectorizer.vocabulary_

{u'dance clubs': 17292,
 u'process successful': 67369,
 u'related apache apex': 70902,
 u'vote favorite': 88784,
 u'group dedicated building': 33898,
 u'inevitably': 40856,
 u'physically emotionally': 64537,
 u'today public speaking': 84921,
 u'love create': 50373,
 u'travel club': 85778,
 u'south north': 78444,
 u'need girls night': 57772,
 u'share similar interests': 75960,
 u'fitness goals join': 28471,
 u'think friends': 84040,
 u'socials hiking': 77952,
 u'groups yahoo': 35456,
 u'teach different': 82744,
 u'buddies starting': 8438,
 u'meet general': 52893,
 u'positive good': 65941,
 u'valued key': 87807,
 u'hang soon': 36112,
 u'books random': 7580,
 u'potential future': 66221,
 u'welcomed': 90777,
 u'case haven noticed': 10095,
 u'activating': 564,
 u'ancient spiritual teaching': 2339,
 u'welcomes': 90783,
 u'fit': 28423,
 u'fix': 28511,
 u'french spanish english': 30291,
 u'fin': 28233,
 u'performed singing artists': 63899,
 u'clara san': 11697,
 u'open minded individuals': 604

In [10]:
ng_vecs = count_vectorizer.transform(df.description.dropna()).transpose()
vec_sum =ng_vecs.sum(axis=1)

In [11]:
vec_sum.sort()
vec_sum[100:110]

matrix([[2],
        [6],
        [2],
        [2],
        [2],
        [5],
        [2],
        [3],
        [2],
        [2]])

In [12]:
ng_vecs.shape

(93410, 7943)

In [13]:
corpus = matutils.Sparse2Corpus(ng_vecs)

In [14]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.iteritems())

In [15]:
tfidf = models.TfidfModel(corpus)

In [16]:
tfidf_corpus = tfidf[corpus]


In [17]:
lsi = models.LsiModel(tfidf_corpus, id2word=id2word, num_topics=100)

In [18]:
lsi_corpus = lsi[tfidf_corpus]

In [19]:
doc_vecs = [doc for doc in lsi_corpus]


In [20]:
index = similarities.MatrixSimilarity(doc_vecs, num_features=100)

In [21]:
print index[doc_vecs[110]]
print len(index[doc_vecs[110]])
doc_vecs[110][:10]

[ 0.02679681  0.19204032  0.11004034 ...,  0.06610078  0.02541695
  0.13815252]
7943


[(0, 0.10239127243104255),
 (1, 0.046277767734621049),
 (2, 0.024797415193888277),
 (3, -0.02243261636438711),
 (4, 0.036066024730857862),
 (5, 0.026339351177270605),
 (6, 0.064345097227943682),
 (7, 0.00011767557847207222),
 (8, -0.0193537957382677),
 (9, -0.015357105209784649)]

In [22]:
sims = sorted(enumerate(index[doc_vecs[3540]]), key=lambda item: -item[1])

In [23]:
sims

[(3540, 1.0),
 (5886, 0.74949491),
 (6214, 0.74158317),
 (1902, 0.73562509),
 (4547, 0.70576805),
 (2619, 0.69874811),
 (5941, 0.69072604),
 (3505, 0.66864526),
 (4044, 0.667557),
 (7501, 0.63658208),
 (2248, 0.62593162),
 (2328, 0.62223649),
 (3370, 0.60797292),
 (4820, 0.60276961),
 (294, 0.59987265),
 (5639, 0.58530587),
 (156, 0.57479513),
 (3480, 0.57382393),
 (3258, 0.56912363),
 (2029, 0.56213784),
 (1617, 0.55356002),
 (1984, 0.552688),
 (2995, 0.54551268),
 (1285, 0.53526312),
 (3368, 0.53307885),
 (731, 0.5323602),
 (3416, 0.53090608),
 (5912, 0.52927214),
 (1850, 0.52701712),
 (3274, 0.52430987),
 (5991, 0.51691288),
 (5806, 0.51467198),
 (2493, 0.50105143),
 (5080, 0.50052553),
 (5524, 0.49706683),
 (3314, 0.49123576),
 (596, 0.49008206),
 (5136, 0.48736152),
 (3155, 0.48636693),
 (1253, 0.47671115),
 (1462, 0.4727338),
 (1107, 0.46931195),
 (7635, 0.46741492),
 (441, 0.46501493),
 (61, 0.46277794),
 (7031, 0.45925155),
 (6518, 0.45832905),
 (1648, 0.45832041),
 (1668, 0.45

In [24]:
todas =[]
for sim in sims:
    todas.append(sim[0])


In [25]:
for i in todas: 
    print i,df.category_name[i]

3540 tech
5886 tech
6214 tech
1902 sports/recreation
4547 education/learning
2619 sports/recreation
5941 food/drink
3505 book clubs
4044 tech
7501 fitness
2248 sports/recreation
2328 sports/recreation
3370 sports/recreation
4820 tech
294 socializing
5639 career/business
156 movies/film
3480 parents/family
3258 pets/animals
2029 cars/motorcycles
1617 religion/beliefs
1984 socializing
2995 career/business
1285 sci-fi/fantasy
3368 parents/family
731 sports/recreation
3416 games
5912 career/business
1850 movements/politics
3274 socializing
5991 new age/spirituality
5806 food/drink
2493 health/wellbeing
5080 book clubs
5524 tech
3314 games
596 alternative lifestyle
5136 community/environment
3155 language/ethnic identity
1253 food/drink
1462 tech
1107 sports/recreation
7635 dancing
441 tech
61 fitness
7031 fashion/beauty
6518 career/business
1648 cars/motorcycles
1668 career/business
4582 parents/family
3071 movies/film
3425 socializing
3549 tech
458 outdoors/adventure
5840 music
703 movies

In [26]:
df.category_name[456]

u'movements/politics'

## Checking For Metis

In [27]:
texto=['Metis accelerates the careers of data scientists by providing full-time immersive bootcamps,\
evening professional development courses, online training and corporate programs.Train you to think and act like a data scientist.\
Teach you the most essential skills and technologies. Immerse you in real-world, complex problems.Create opportunities to connect with prospective employers.\
Provide you with excellent student support.Inject continual fun, passion, and excitement into your experience at Metis.',
      'big data startups']

texto_galvanize=['Galvanize is where you can become a developer, data scientist, data engineer, or build your tech startup.']

In [28]:
#len(texto.split(' '))

In [29]:
np.array(texto)

array([ 'Metis accelerates the careers of data scientists by providing full-time immersive bootcamps,evening professional development courses, online training and corporate programs.Train you to think and act like a data scientist.Teach you the most essential skills and technologies. Immerse you in real-world, complex problems.Create opportunities to connect with prospective employers.Provide you with excellent student support.Inject continual fun, passion, and excitement into your experience at Metis.',
       'big data startups'], 
      dtype='|S498')

In [30]:
#metis_counts = count_vectorizer.fit(texto)
metis_vecs = count_vectorizer.transform(np.array(texto)).transpose()

In [None]:
import pickle
pickle.dump(count_vectorizer,open( 'count_vectorizer.pkl', "wb" ))

In [None]:
print type(metis_vecs)
print metis_vecs.shape


In [None]:
metis_corpus= matutils.Sparse2Corpus(metis_vecs)

In [None]:
count = 0 
while count<1:
    for i in metis_corpus:
        print i
        count+=1

In [None]:
metis_tfidf_corpus = tfidf[metis_corpus]

In [None]:
metis_lsi_corpus = lsi[metis_tfidf_corpus]

In [None]:
metis_doc_vecs = [doc for doc in metis_lsi_corpus]

In [None]:
#metis_index = similarities.MatrixSimilarity(metis_doc_vecs, num_features=100)

In [None]:
#sims = sorted(enumerate(index[metis_doc_vecs[0]]), key=lambda item: -item[1])
index.num_best=3000
my_index=index[metis_doc_vecs[0]]

In [None]:
my_index

In [None]:
doc_lengths=[]
for i in my_index:
    try:
        doc = df.description[i[0]].split(' ')
        doc_lengths.append(len(doc))
    except AttributeError:
        continue



In [None]:
plt.plot(doc_lengths)

In [None]:
_indices=[]
for i in my_index: 
    _indices.append(df.description[i[0]])
    print i,df.category_name[i[0]]
    

In [None]:
df.description.describe()

In [None]:
top_docs=pd.Series(_indices)

In [None]:
top_docs.shape

In [None]:
top_docs.dropna().shape

In [None]:
top_docs[top_docs.isnull()]

In [None]:
top_docs[19]