In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
question_data = pd.read_csv("./data/train.csv")

In [None]:
question_data.head()

In [None]:
q_data_1 = question_data['question1']
q_data_2 = question_data['question2']
q_data = pd.concat([q_data_1, q_data_2], axis=0)

In [None]:
len(q_data)

In [None]:
q_data.dropna(axis=0, inplace=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, max_features = 100, stop_words='english')

X_train = vectorizer.fit_transform(q_data)

In [None]:
X_train.shape

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from time import time

In [None]:
t0 = time()
n_components = 4
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X_train)
print("done in %fs" % (time() - t0))

In [None]:
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))
print()

In [None]:
from sklearn.cluster import KMeans, MiniBatchKMeans
km = KMeans(n_clusters=4, init='k-means++', max_iter=100, n_init=1,
                verbose=1)

In [None]:
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X_train)
print("done in %0.3fs" % (time() - t0))

In [None]:
print("Top terms per cluster:")

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(4):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

In [None]:
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, n_iter=1000)
x_2d = tsne.fit_transform(X)

In [None]:
x_df = pd.DataFrame(x_2d, columns=['x', 'y'])
#x_df['token'] = model.wv.vocab.keys()

subset_df = x_df.sample(n=5000)

In [None]:
p = figure(plot_width=800, plot_height = 800)
_ = p.text(x=subset_df.x, y = subset_df.y, )

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
import string
import re
%matplotlib inline

In [None]:
nltk.download("punkt")

In [None]:
nltk.download("english")

In [None]:
q_data.values

In [None]:
#extract sentences from raw text data
sent_tokens = q_data.values.astype(str)
word_tokens = word_tokenize(sent_tokens[1]) #example code to extract words from sentences
word_tokens

In [None]:
sent_tokens[:10]

In [None]:
#remove stopwords 
stopwords = stopwords.words('english')

In [None]:
stopwords

In [None]:
[w.lower() for w in sent_tokens[4] if w not in stopwords] # convert to lower case for words that are not stopwords

In [None]:
#Caution!! we will not use it
#stemming housing and house are same words 
stemmer = PorterStemmer()
[stemmer.stem(w.lower()) for w in sent_tokens[4]]

In [None]:
#handle bi-gram collections - word that appears next to each other consider them as same and score
phrases = Phrases(sent_tokens)
bigram = Phraser(phrases)

In [None]:
bigram.phrasegrams 

In [None]:
["John lives in New York city".split()]

In [None]:
bigram["John lives in New York city".split()] #New York will appear together

In [None]:
#Pre process the corpus
lower_sents = []
for s in sent_tokens:
    lower_sents.append([w.lower() for w in s if w not in list(string.punctuation)])
lower_bigram = Phraser(Phrases(lower_sents))

In [None]:
lower_bigram = Phraser(Phrases(lower_sents, min_count = 32, threshold=64)) #try dfferent parameters min_count, threshold
lower_bigram.phrasegrams

In [None]:
#Clean up sentences
clean_sents = []
for s in lower_sents:
    clean_sents.append(lower_bigram[s])

In [None]:
len(clean_sents)

In [None]:
clean_sents[:10]

In [None]:
model = Word2Vec(sentences=clean_sents, size=64, sg=1, window=10, min_count=10, seed=42, workers=-1)

In [None]:
model.save("./data/clean_gutenberg_model.w2v")

In [None]:
model = gensim.models.Word2Vec.load("./data/clean_gutenberg_model.w2v")

In [None]:
len(model.wv.vocab)

In [None]:
import gensim
#load word2vec model, here GoogleNews is used
model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)
#two sample sentences 
s1 = 'the first sentence'
s2 = 'the second text'

#calculate distance between two sentences using WMD algorithm
distance = model.wmdistance(s1, s2)

print ('distance = %.3f' % distance)

In [None]:
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
word_vectors = model.wv

In [None]:
word_vectors.save(fname)
word_vectors = KeyedVectors.load(fname)

In [None]:
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)  # C text format
word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True)  # C binary format

In [None]:
word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])

In [None]:
word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

In [None]:
word_vectors.doesnt_match("breakfast cereal dinner lunch".split())

In [None]:
word_vectors.similarity('woman', 'man')

In [None]:
word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))

In [None]:
word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv'))

In [None]:
q_data_1[:10]

In [None]:
q_data_2[:10]

In [None]:
def clean_sents(sent):
    clean_words = []
    words = word_tokenize(sent)
    for word in words:
        if word not in stopwords:
            if word not in list(string.punctuation):
                clean_words.append(word)
        clean_words = [re.sub("[^a-zA-Z]"," ", w) for w in clean_words]
    return clean_words

In [None]:
def clean_sents_1(sent):
    words = word_tokenize(sent)
    clean_words = [w.lower() for w in words if w not in stopwords if w not in list(string.punctuation)]
    clean_words = [re.sub("[^a-zA-Z]"," ", w) for w in clean_words]
    return clean_words

In [None]:
def avg_sentence_vector(words, model, num_features, index2word_set):
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [None]:
num_features = 100

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
question_data = pd.read_csv("./data/train.csv")

In [None]:
q_data_1 = question_data['question1']
q_data_2 = question_data['question2']
q_data = pd.concat([q_data_1, q_data_2], axis=0)

In [None]:
sents_1 = q_data_1.values.astype(str)
sents_2 = q_data_2.values.astype(str)

In [None]:
sents_1 = sents_1.tolist()
sents_2 = sents_2.tolist()

In [None]:
sents_1[:1]

In [None]:
for sent in sents_1:
    words = clean_sents_1(sent)
    featureVec = avg_sentence_vector(words, model, num_features, index2word_set)

In [None]:
#get average vector for sentence 1
sentence_1 = "this is sentence number one"
sentence_1_avg_vector = avg_feature_vector(sentence_1.split(), model=word2vec_model, num_features=100)

In [None]:
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(q_data.values, \
            size=100, min_count = 100, \
            window = 10, sample = 1e-3)

In [None]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
# numpy
import numpy
# random
from random import shuffle

In [None]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

In [None]:
sents = q_data.tolist()

In [None]:
print(len(sents))

In [1]:
def clean_sentences(sent):
    sent = [w.lower() for w in word_tokenize(sent) if w not in stopwords if w not in list(string.punctuation)]
    sent = [re.sub("[^a-zA-Z]"," ", w) for w in sent]
    return sent

In [None]:
clean_sents = []
for sent in sents:
    clean_sent = clean_sentences(sent)
    clean_sents.append(sent)
    print(clean_sent)

In [None]:
sources = {'train.csv': 'questions'}

In [None]:
sentences = LabeledLineSentence(sources)

In [None]:
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)
model.build_vocab(sentences.to_array())

In [None]:
for epoch in range(10):
    model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=model.iter)

In [None]:
model.save('./questions_sim.d2v')

In [None]:
doc_vectors = model.wv

In [None]:
from gensim.models import KeyedVectors
doc_vectors.save("questions_vec.wv")
doc_vectors = KeyedVectors.load("questions_vec.wv")

In [None]:
model.most_similar('good')

In [None]:
doc_vectors.most_similar(positive=['woman', 'king'], negative=['man'])

In [None]:
doc_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

In [None]:
doc_vectors.similarity('woman', 'man')

In [None]:
doc_vectors.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])

In [None]:
model['questions']

In [18]:
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re

def preprocess(sentence):
    sentence = sentence.lower()
    sentence = re.sub("[^a-zA-Z]"," ", sentence)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered_words)

sentence = "At 8 o'clock on Thursday morning Arthur didn't feel very good. French-Fries"
print(preprocess(sentence))

clock thursday morning arthur feel good french fries


In [14]:
import os
import numpy as np
import pandas as pd

In [15]:
question_data = pd.read_csv("./data/train.csv")

In [16]:
q_data_1 = question_data['question1']
q_data_2 = question_data['question2']
q_data = pd.concat([q_data_1, q_data_2], axis=0)

In [17]:
sents = q_data.tolist()

In [20]:
clean_sentences = []
for sent in sents:
    clean_sent = preprocess(sent)
    clean_sentences.append(clean_sent)

AttributeError: 'float' object has no attribute 'lower'