In [None]:
#nltk.download()
#Select stopwords corpora, punkt model, porter_stem model, averaged perceptron tagger
import nltk
import numpy
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
import re 
import pickle
import gensim
import difflib
from difflib import SequenceMatcher as sm
%matplotlib inline

In [None]:
#Read in data and check metadata
# Raw string: the old literal mixed "\\" with unescaped "\s", "\D", "\I" and "\P",
# which are invalid escape sequences (a warning today, an error in later Pythons).
# The resulting path text is exactly the same.
data = pd.read_csv(r"C:\Users\sugac_000\Desktop\Insight Data Science\Dev-setups\Project_Title_Data.csv", encoding='ISO-8859-1')
data.info()

In [None]:
#View first 20 project titles
data['project'].head(20)

In [None]:
#Ensure project titles are strings that can be tokenized
# str(Series) returns a single string holding the Series' printed summary,
# not the titles themselves; cast element-wise so every title becomes a str.
projects = data['project'].astype(str)

In [None]:
#Put lists of strings together into a single list
# tolist() walks the column in positional order, so unlike data.project[i] it
# does not rely on the index labels being exactly 0..n-1.
project_titles = data['project'].tolist()
project_titles[:20]

In [None]:
#Tokenize each title on its own: one token list per title
tokens_1 = list(map(nltk.word_tokenize, map(str, project_titles)))
#Chain the per-title lists into one flat list for the frequency distribution
tokens = [word for title_words in tokens_1 for word in title_words]
tokens[:10]

Now each row is a comma-separated list of the individual words of a single project. We may still need to combine these into a single list covering all projects.

In [None]:
#What are the most common 'words'?
nltk.FreqDist(tokens).plot(20, cumulative=False)

We need to get rid of non-alphabetic characters (e.g., :, !, ., '), as well as stop words (e.g., a, the, and, on).

In [None]:
# Another, better way, to tokenize
def text_preprocess(title): # perform tokenization, select noun, Lemmatization etc on a line text
    """Return the normalized noun and verb tokens of one piece of text.

    The text is lower-cased, tokenized and POS-tagged with NLTK. Only nouns
    and verbs are kept, English stop words are dropped, and the survivors are
    lemmatized (WordNet) and then stemmed (Porter).

    Parameters
    ----------
    title : str
        Text to process.

    Returns
    -------
    list of str
        Normalized tokens in their original order; may be empty.
    """
    # Keep only Nouns(project topics) and Verbs(project purpose)
    keep_tags = {'NN','NNP','NNS','VB','VBD','VBG','VBN','VBP','VBZ'}
    stop_words = set(stopwords.words("english"))

    # Drop stop words while the tokens are still in surface form. Filtering
    # only after lemmatizing/stemming (as before) let stop words through,
    # because e.g. "was" becomes "wa" and "has" becomes "ha", which no longer
    # match the stop list.
    rtext = [w for w, tag in nltk.pos_tag(nltk.word_tokenize(title.lower()))
             if tag in keep_tags and w not in stop_words]

    wordnet_lemmatizer = WordNetLemmatizer()  # Lemmatize
    rtext = [wordnet_lemmatizer.lemmatize(w) for w in rtext]

    stemmer = PorterStemmer()  # Stem
    rtext = [stemmer.stem(w) for w in rtext]

    # Second pass preserves the earlier behaviour of also removing tokens
    # whose normalized form is itself a stop word.
    return [w for w in rtext if w not in stop_words]

# Run the preprocessing over every entry of `tokens`. The previous
# range(0, len(tokens)-1) stopped one short and silently dropped the last entry.
# NOTE(review): `tokens` holds single words here, so each call tags one word in
# isolation; confirm whether whole titles (project_titles) were intended.
texts = [text_preprocess(token) for token in tokens]

texts[:10]

In [None]:
# Flatten the per-entry token lists in `texts` back into one list.
# Caution: this rebinds `tokens` (previously the raw word tokens), so any cell
# that reads `tokens` gives different results depending on run order.
tokens = [item for sublist in texts for item in sublist]
# Plot the 20 most frequent tokens
nltk.FreqDist(tokens).plot(20, cumulative=False)

In [None]:
# top term frequencies
# The top-level pd.value_counts() is deprecated in recent pandas; counting
# through a Series gives the same table.
pd.Series(tokens).value_counts().head(50)
#1704 total terms

In [None]:
from gensim import corpora
from gensim import models

# Token -> integer id mapping built from the preprocessed token lists
dictionary = corpora.Dictionary(texts)
# One bag-of-words vector per entry of `texts`
corpus = list(map(dictionary.doc2bow, texts))
# Fit TF-IDF on the bag-of-words corpus and view the corpus through it
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]


In [None]:
def TFIDFModels(corpus_list):
  """Fit a gensim TF-IDF model on a bag-of-words corpus.

  Returns a ``(model, weighted_corpus)`` pair, where the second item is the
  input corpus viewed through the fitted model.
  """
  fitted = models.TfidfModel(corpus_list)
  return fitted, fitted[corpus_list]

#Create a bag of words from a list of text 
def GetVectors(evt_list,max_features=500):
  """Build a bag-of-words count matrix from a list of texts.

  Parameters
  ----------
  evt_list : iterable of str
      Documents to vectorize.
  max_features : int, optional
      Forwarded to CountVectorizer as ``max_features`` (default 500).

  Returns
  -------
  in_features : numpy.ndarray
      Dense document-term count matrix.
  vocab : numpy.ndarray
      Feature names, aligned with the columns of ``in_features``.
  """
  # CountVectorizer is not imported by any visible cell, so the original
  # raised NameError on first call; import it where it is used.
  from sklearn.feature_extraction.text import CountVectorizer

  vectorizer = CountVectorizer(analyzer="word",
                               tokenizer=None,
                               preprocessor=None,
                               stop_words=None,
                               max_features=max_features)
  in_features = vectorizer.fit_transform(evt_list).toarray()
  # get_feature_names() was removed from newer scikit-learn releases; prefer
  # its replacement and fall back only on installs that predate it.
  if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
  else:
    vocab = vectorizer.get_feature_names()
  return in_features, numpy.array(vocab)

TFIDFModels(corpus)

In [None]:
# Persist the preprocessed token lists with pickle (protocol 2).
# Note the file holds pickle bytes despite its .txt name.
with open('clean_proj.txt','wb') as file1:
    pickle.dump(texts,file1,protocol=2)

# Read the same file straight back
cleaneddoc = pd.read_pickle('clean_proj.txt')

# Train Word2Vec on the reloaded token lists and save the model to disk
model = gensim.models.Word2Vec(cleaneddoc, size=100, window=6, min_count=3, workers=1)
model.save('W2Vmodel')

# Reload the saved model; `model` now refers to the copy read from 'W2Vmodel'
model = gensim.models.Word2Vec.load('W2Vmodel')

In [None]:
# Print sorted vocabulary
sorted(model.wv.vocab)

In [None]:
# cosine similarity within titles
model.wv.similarity('sentiment','analysi')

In [None]:
# cosine similarity using numpy yields same result
numpy.dot(model.wv['sentiment'], model.wv['analysi'])/(numpy.linalg.norm(model.wv['sentiment'])* numpy.linalg.norm(model.wv['analysi']))

In [None]:
model.wv.similarity('get','help')

In [None]:
model.wv.most_similar('find')

In [None]:
model.wv.most_similar('news')

In [None]:
model.wv.most_similar('twitter')

In [None]:
model.wv.most_similar('beer')

In [None]:
model.wv.similar_by_word('nyc')

In [None]:
import numpy as np
import bokeh
from sklearn.manifold import TSNE
vocabulary = sorted(model.wv.vocab)
# Look vectors up through model.wv, as the similarity cells do; indexing the
# model object directly (model[v]) is deprecated in gensim.
emb_tuple = tuple(model.wv[v] for v in vocabulary)
# One row per vocabulary word, in the same (sorted) order as `vocabulary`
X = np.vstack(emb_tuple)

# Project the word vectors to 2-D; random_state is fixed for repeatable layouts
X_embedded = TSNE(n_components=2, init='pca', random_state=0).fit_transform(X)

from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label

# Column data for the plot: 2-D coordinates plus the word belonging to each point
source = ColumnDataSource(data=dict(x=list(X_embedded[:, 0]),
                                    y=list(X_embedded[:, 1]),
                                    words= vocabulary))

# Scatter the points and attach each word as a label offset 5 units from its point
p = figure(title='Word2Vec tSNE')
p.scatter(x='x', y='y', size=2, source=source)
labels = LabelSet(x='x', y='y', text='words', level='glyph',x_offset=5, y_offset=5, source=source, render_mode='canvas')
p.add_layout(labels)

show(p)

In [None]:
import matplotlib.pyplot as plt
# Static version of the same embedding plot: one labelled point per vocabulary word
fig, ax = plt.subplots()
xs, ys = X_embedded[:, 0], X_embedded[:, 1]
ax.scatter(xs, ys)
for txt, x_pos, y_pos in zip(vocabulary, xs, ys):
    ax.annotate(txt, (x_pos, y_pos))
plt.show()

In [None]:
vocabulary

In [None]:
# Dissimilarity score for 'machin' vs 'learn': 100 * (1 - similarity), so 0 means maximally similar
100*(1-model.wv.similarity('machin','learn'))

In [None]:
# Dissimilarity score for 'yelp' vs 'review': 100 * (1 - similarity), so 0 means maximally similar
100*(1-model.wv.similarity('yelp','review'))

In [None]:
# Same pair with the arguments swapped, using |similarity| so the sign of the similarity is ignored
100*(1-abs(model.wv.similarity('review','yelp')))

In [None]:
# 100 * (1 - |similarity|) for 'travel' vs 'delay'; the sign of the similarity is ignored
100*(1-abs(model.wv.similarity('travel','delay')))

In [None]:
# Titles in project_titles[1:100] that difflib rates as close matches to title 3
# (the slice includes index 3 itself, so the query title can show up in its own results)
difflib.get_close_matches(project_titles[3], project_titles[1:100])

In [None]:
# SequenceMatcher's first positional parameter is `isjunk`, so the original
# two-argument call used title 63 as the junk filter and compared title 164
# against an empty string. Pass None for isjunk, as the other cells do, so
# the two titles are actually compared with each other.
sm(None, project_titles[63], project_titles[164]).ratio()

In [None]:
# difflib similarity of "beer" to every token of title 63
for word in tokens_1[63]:
    score = sm(None, "beer", word).ratio()
    print ("score for: " + "beer" + " vs. " + word + " = " + str(score))

In [None]:
# Tokens of title 63 whose difflib similarity to "recommend" exceeds 0.25
for word in tokens_1[63]:
    score = sm(None,"recommend",word).ratio()
    if score > 0.25:
        print ("score for: " + "recommend" + " vs. " + word + " = " + str(score))

In [None]:
# Report every token, across all titles, whose difflib similarity to "find"
# exceeds 0.70. Written as a loop because the old comprehension existed only
# for its print() side effects, computed each ratio twice, and its
# range(0, len(tokens_1)-1) skipped the last title.
# find_match keeps one list per title, now holding (word, score) pairs
# instead of the None values returned by print().
find_match = []
for title_tokens in tokens_1:
    title_hits = []
    for word in title_tokens:
        score = sm(None, "find", word).ratio()
        if score > 0.70:
            print("score for: " + "find" + " vs. " + word + " = " + str(score))
            title_hits.append((word, score))
    find_match.append(title_hits)

In [None]:
# Report every token, across all titles, whose difflib similarity to
# "recommend" exceeds 0.66. Written as a loop because the old comprehension
# existed only for its print() side effects, computed each ratio twice, and
# its range(0, len(tokens_1)-1) skipped the last title.
# rec_match keeps one list per title, now holding (word, score) pairs
# instead of the None values returned by print().
rec_match = []
for title_tokens in tokens_1:
    title_hits = []
    for word in title_tokens:
        score = sm(None, "recommend", word).ratio()
        if score > 0.66:
            print("score for: " + "recommend" + " vs. " + word + " = " + str(score))
            title_hits.append((word, score))
    rec_match.append(title_hits)

In [None]:
# Report every token, across all titles, whose difflib similarity to
# "twitter" exceeds 0.66. Written as a loop because the old comprehension
# existed only for its print() side effects, computed each ratio twice, and
# its range(0, len(tokens_1)-1) skipped the last title.
# twitter_match keeps one list per title, now holding (word, score) pairs
# instead of the None values returned by print().
twitter_match = []
for title_tokens in tokens_1:
    title_hits = []
    for word in title_tokens:
        score = sm(None, "twitter", word).ratio()
        if score > 0.66:
            print("score for: " + "twitter" + " vs. " + word + " = " + str(score))
            title_hits.append((word, score))
    twitter_match.append(title_hits)

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans
def clusterTitlesKmeans(titles, n_clusters=10):
    """Cluster titles via Doc2Vec document vectors + k-means and print each cluster's words.

    Parameters
    ----------
    titles : iterable of str
        Texts to cluster. Entries whose length is 2 or less are skipped.
    n_clusters : int, optional
        Number of k-means clusters (default 10, the previously hard-coded value).

    Returns
    -------
    None
        One line is printed per cluster, listing up to 20 whitespace-separated
        words taken from the titles assigned to it (non-ASCII characters removed).
    """
    def remove_non_ascii(text):
        """Drop every character whose code point is 128 or above."""
        return ''.join(ch for ch in text if ord(ch) < 128)

    taggeddocs   = []
    tag2titlemap = {}
    for index, title in enumerate(titles):
        if len(title) > 2:  # Non empty titles
            tag = u'SENT_{:d}'.format(index)
            taggeddocs.append(TaggedDocument(
                words=gensim.utils.to_unicode(title).split(), tags=[tag]))
            tag2titlemap[tag] = title

    model = Doc2Vec(
        taggeddocs, dm=0, alpha=0.025, size=20, min_alpha=0.025, min_count=0)
    print (" ")
    for epoch in range(60):
        model.train(
            taggeddocs, total_examples=model.corpus_count, epochs=model.iter)
        # Decrease the learning rate, but never below a small positive floor:
        # 60 steps of 0.002 starting at 0.025 would otherwise turn the rate
        # negative from the 13th pass onwards.
        model.alpha = max(model.alpha - 0.002, 0.0001)
        model.min_alpha = model.alpha  # fix the learning rate, no decay

    dataSet = model.docvecs.doctag_syn0
    kmeansClustering = KMeans(n_clusters=n_clusters)
    centroidIndx = kmeansClustering.fit_predict(dataSet)

    topic2wordsmap = {}
    for doc_index, topic in enumerate(centroidIndx):
        tag = model.docvecs.index_to_doctag(doc_index)
        # setdefault + extend records the words of EVERY title in the cluster;
        # the old if/else created the list for a new topic but never added the
        # words of that first title.
        topic2wordsmap.setdefault(topic, []).extend(tag2titlemap[tag].split())

    for topic in topic2wordsmap:
        print("Topic {} has words: {}".format(topic + 1, ' '.join(remove_non_ascii(word) for word in topic2wordsmap[topic][:20])))

In [None]:
# Lower-cased token lists for titles with more than two tokens, stop words removed.
# The original tested `w not in stopwords.words('english')` with w being a whole
# token list, which never equals a stop-word string, so nothing was filtered
# (and the stop list was rebuilt for every title). The check now runs per token
# against a set built once.
english_stop_words = set(stopwords.words('english'))
titles = [[tok.lower() for tok in title_tokens if tok.lower() not in english_stop_words]
          for title_tokens in tokens_1 if len(title_tokens) > 2]
# Same token lists, UTF-8 encoded
titles_1 = [[bytes(str(tok),'utf-8') for tok in title_tokens] for title_tokens in titles]
# NOTE(review): the call below clusters `tokens`, not `titles`/`titles_1` - confirm intent.
clusterTitlesKmeans(tokens)

In [None]:
def noun_preprocess(title): # tokenize a line of text, keep its nouns, drop stop words, stem
    """Return the Porter stems of the non-stop-word nouns found in ``title``.

    The text is lower-cased, tokenized and POS-tagged with NLTK; only tokens
    tagged NN or NNS are kept. English stop words are removed before stemming.
    """
    tagged_words = nltk.pos_tag(nltk.word_tokenize(title.lower()))
    nouns = [word for word, pos in tagged_words if pos in ('NN', 'NNS')]  # project topics

    english_stops = set(stopwords.words("english"))
    content_nouns = [word for word in nouns if word not in english_stops]

    porter = PorterStemmer()
    return [porter.stem(word) for word in content_nouns]

# Apply noun_preprocess to every entry of `tokens`; the previous
# range(0, len(tokens)-1) left out the final entry.
nouns_only = [noun_preprocess(token) for token in tokens]

# Flatten and plot the 20 most frequent noun stems
noun_tokens = [item for sublist in nouns_only for item in sublist]
nltk.FreqDist(noun_tokens).plot(20, cumulative=False)

In [None]:
clusterTitlesKmeans(noun_tokens)

In [None]:
def verb_preprocess(title): # tokenize a line of text, keep its verbs, drop stop words, stem
    """Return the Porter stems of the non-stop-word verbs found in ``title``.

    The text is lower-cased, tokenized and POS-tagged with NLTK; only tokens
    tagged VB, VBD, VBG, VBN, VBP or VBZ are kept. English stop words are
    removed before stemming.
    """
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')  # project actions
    tagged_words = nltk.pos_tag(nltk.word_tokenize(title.lower()))
    verbs = [word for word, pos in tagged_words if pos in verb_tags]

    english_stops = set(stopwords.words("english"))
    content_verbs = [word for word in verbs if word not in english_stops]

    porter = PorterStemmer()
    return [porter.stem(word) for word in content_verbs]

# Apply verb_preprocess to every entry of `tokens`; the previous
# range(0, len(tokens)-1) left out the final entry.
verbs_only = [verb_preprocess(token) for token in tokens]

# Flatten and plot the 20 most frequent verb stems
verb_tokens = [item for sublist in verbs_only for item in sublist]
nltk.FreqDist(verb_tokens).plot(20, cumulative=False)

In [None]:
clusterTitlesKmeans(verb_tokens)

In [None]:
verb_tokens

In [None]:
def tagged_docs(titles):
    """Wrap each usable title in a gensim TaggedDocument.

    Parameters
    ----------
    titles : iterable
        Candidate titles. Entries that are not text, or whose length is 2 or
        less, are skipped.

    Returns
    -------
    list of TaggedDocument
        One document per kept title, tagged ``SENT_<position in titles>``,
        with the title split on whitespace as its words.
    """
    taggeddocs = []
    for index, title in enumerate(titles):
        # Non-text entries (e.g. a NaN from an empty CSV cell) would make
        # len() raise TypeError; skip them instead of aborting the whole run.
        if not isinstance(title, (str, bytes)):
            continue
        if len(title) > 2:  # Non empty titles
            tag = u'SENT_{:d}'.format(index)
            taggeddocs.append(
                TaggedDocument(words=gensim.utils.to_unicode(title).split(), tags=[tag]))
    return taggeddocs
            
tagged_docs(project_titles)

In [None]:
#top bi-grams
# Import only the two names this cell uses; `from nltk.collocations import *`
# dumped the whole module into the notebook namespace and hid where names came from.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
bigram_measures = BigramAssocMeasures()
# Second argument (6) is the window size passed to from_words
finder = BigramCollocationFinder.from_words(tokens, 6)
# Apply a frequency filter of 3, then print the 10 best bigrams by likelihood ratio
finder.apply_freq_filter(3)
print(finder.nbest(bigram_measures.likelihood_ratio, 10))

In [None]:
# LDA
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
# Shared resources for clean(): stop-word set, punctuation set, lemmatizer
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    """Lower-case a title, drop stop words and punctuation, and lemmatize what remains.

    Stop words are removed first (on whitespace-split words), then every
    punctuation character is stripped, then each remaining word is lemmatized.
    Returns the surviving words joined by single spaces.
    """
    kept_words = [word for word in doc.lower().split() if word not in stop]
    without_punct = ''.join(filter(lambda ch: ch not in exclude, " ".join(kept_words)))
    lemmas = map(lemma.lemmatize, without_punct.split())
    return " ".join(lemmas)
# Keep only titles that are real strings longer than two characters; a
# non-string entry (e.g. NaN from an empty CSV cell) would make len() raise.
# Caution: this rebinds `texts` and `dictionary`, which earlier cells use for
# the token-list corpus, so re-running those cells after this one changes their input.
texts = [text for text in project_titles if isinstance(text, str) and len(text) > 2]
doc_clean = [clean(doc).split() for doc in texts]
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
ldamodel = models.ldamodel.LdaModel(doc_term_matrix, num_topics=8,
                                    id2word=dictionary, passes=3)
# Show the 4 top words of each of the 8 topics
for topic in ldamodel.show_topics(num_topics=8, formatted=False, num_words=4):
    print("Topic {}: Words: ".format(topic[0]))
    topicwords = [w for (w, val) in topic[1]]
    print(topicwords)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    """Return a list with the WordNet lemma of every token, in order."""
    return list(map(lemmer.lemmatize, tokens))
# Translation table that deletes every ASCII punctuation character
remove_punct_dict = str.maketrans('', '', string.punctuation)
def LemNormalize(text):
    """Lower-case ``text``, strip punctuation, tokenize, and lemmatize the tokens."""
    stripped = text.lower().translate(remove_punct_dict)
    return LemTokens(nltk.word_tokenize(stripped))
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
def cos_similarity(textlist):
    """Return the dense pairwise similarity matrix of the texts in ``textlist``.

    Each text is TF-IDF weighted with the shared ``TfidfVec`` (which is re-fit
    on every call); the result is the product of that matrix with its own
    transpose, one row and one column per input text.
    """
    weighted = TfidfVec.fit_transform(textlist)
    pairwise = weighted.dot(weighted.T)
    return pairwise.toarray()
cos_similarity(tokens)

In [None]:
cos_similarity(tokens).shape

In [None]:
plt.imshow(cos_similarity(tokens), cmap='hot', interpolation='nearest')
plt.show()

In [None]:
cos_similarity(tokens)[:45]

In [None]:
model.wv.similarity("machin","learn")*model.wv.similarity("recommend","engin")