In [88]:
# from: https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_clustering_wikipedia_sections.py

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import sklearn
import time
import spacy
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import gensim
from nltk.corpus import stopwords
from nltk import download
import string
import numpy as np

In [185]:
nlp = spacy.load('en_core_web_md')
#nlp = spacy.load('en_core_web_lg')

In [74]:
#embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')
#embedder = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
embedder = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
#embedder = SentenceTransformer('roberta-base-nli-mean-tokens')

In [8]:
#Sentences and sections are from Wikipeda.
#Source: https://en.wikipedia.org/wiki/Bushnell,_Illinois
corpus = [
("Bushnell is located at 40°33′6″N 90°30′29″W (40.551667, -90.507921).", "Geography"),
("According to the 2010 census, Bushnell has a total area of 2.138 square miles (5.54 km2), of which 2.13 square miles (5.52 km2) (or 99.63%) is land and 0.008 square miles (0.02 km2) (or 0.37%) is water.", "Geography"),

("The town was founded in 1854 when the Northern Cross Railroad built a line through the area.", "History"),
("Nehemiah Bushnell was the President of the Railroad, and townspeople honored him by naming their community after him. ", "History"),
("Bushnell was also served by the Toledo, Peoria and Western Railway, now the Keokuk Junction Railway.", "History"),

("As of the census[6] of 2000, there were 3,221 people, 1,323 households, and 889 families residing in the city. ", "Demographics"),
("The population density was 1,573.9 people per square mile (606.7/km²).", "Demographics"),
("There were 1,446 housing units at an average density of 706.6 per square mile (272.3/km²).", "Demographics"),

("From 1991 to 2012, Bushnell was home to one of the largest Christian Music and Arts festivals in the world, known as the Cornerstone Festival.", "Music"),
("Each year around the 4th of July, 25,000 people from all over the world would descend on the small farm town to watch over 300 bands, authors and artists perform at the Cornerstone Farm Campgrounds.", "Music"),
("The festival was generally well received by locals, and businesses in the area would typically put up signs welcoming festival-goers to their town.", "Music"),
("As a result of the location of the music festival, numerous live albums and videos have been recorded or filmed in Bushnell, including the annual Cornerstone Festival DVD. ", "Music"),
("Cornerstone held its final festival in 2012 and no longer operates.", "Music"),

("Beginning in 1908, the Truman Pioneer Stud Farm in Bushnell was home to one of the largest horse shows in the Midwest.", "Horse show"),
("The show was well known for imported European horses.", "Horse show"),
("The Bushnell Horse Show features some of the best Belgian and Percheron hitches in the country. Teams have come from many different states and Canada to compete.", "Horse show"),
]

sentences = [row[0] for row in corpus]

corpus_embeddings = embedder.encode(sentences)
num_clusters = len(set([row[1] for row in corpus]))

#Sklearn clustering
km = AgglomerativeClustering(n_clusters=num_clusters)
km.fit(corpus_embeddings)

cluster_assignment = km.labels_


clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    for row in cluster:
        print("(Gold label: {}) - {}".format(row[1], row[0]))
    print("")

Cluster  1
(Gold label: History) - Nehemiah Bushnell was the President of the Railroad, and townspeople honored him by naming their community after him. 
(Gold label: History) - Bushnell was also served by the Toledo, Peoria and Western Railway, now the Keokuk Junction Railway.
(Gold label: Music) - From 1991 to 2012, Bushnell was home to one of the largest Christian Music and Arts festivals in the world, known as the Cornerstone Festival.
(Gold label: Music) - As a result of the location of the music festival, numerous live albums and videos have been recorded or filmed in Bushnell, including the annual Cornerstone Festival DVD. 
(Gold label: Horse show) - Beginning in 1908, the Truman Pioneer Stud Farm in Bushnell was home to one of the largest horse shows in the Midwest.
(Gold label: Horse show) - The show was well known for imported European horses.
(Gold label: Horse show) - The Bushnell Horse Show features some of the best Belgian and Percheron hitches in the country. Teams have 

In [35]:
startTime = time.time()
corpus_embeddings_ = [embedder.encode([sent]) for sent in sentences+sentences+sentences+sentences]
print(time.time()-startTime)

6.3147993087768555


In [36]:
startTime = time.time()
corpus_embeddings__ = embedder.encode(sentences+sentences+sentences+sentences)
print(time.time()-startTime)

5.75470495223999


In [21]:
corpus_embeddings_[0]

[array([-0.40179282,  0.03970386, -0.10575875, ...,  0.1947043 ,
        -1.2919564 , -1.3358579 ], dtype=float32)]

In [22]:
corpus_embeddings__[0]

array([-0.40179282,  0.03970386, -0.10575875, ...,  0.1947043 ,
       -1.2919564 , -1.3358579 ], dtype=float32)

In [15]:
sklearn.metrics.pairwise.cosine_similarity(corpus_embeddings__)

array([[ 0.9999999 ,  0.4309681 ,  0.31173187,  0.31466055,  0.4379264 ,
         0.1998738 ,  0.19828294,  0.3371287 ,  0.17922577,  0.10542643,
        -0.06541459,  0.2577943 ,  0.01437714,  0.16060522,  0.06156353,
         0.21282719],
       [ 0.4309681 ,  1.        ,  0.14881545,  0.19983399,  0.13176046,
         0.23285674,  0.24749455,  0.25302416,  0.15744069,  0.11684485,
        -0.01747227,  0.20021658,  0.05293076,  0.19706544,  0.15017726,
         0.13953333],
       [ 0.31173187,  0.14881545,  0.9999999 ,  0.27371246,  0.28655875,
         0.09872115,  0.01789186,  0.17102788,  0.19197434,  0.14834166,
         0.10122602,  0.12906836,  0.04853133,  0.21148062,  0.13546549,
         0.17851762],
       [ 0.31466055,  0.19983399,  0.27371246,  1.0000001 ,  0.5022584 ,
         0.01406725,  0.09569578,  0.05126666,  0.3792192 ,  0.21086393,
         0.27564824,  0.37054804,  0.03803694,  0.39375246,  0.27422008,
         0.30720428],
       [ 0.4379264 ,  0.13176046,  0

In [16]:
sklearn.metrics.pairwise.cosine_similarity(corpus_embeddings_)

ValueError: setting an array element with a sequence.

In [17]:
corpus_embeddings_

[[array([-1.3242445 , -0.64933014,  0.8323693 , ...,  0.48114544,
          0.22014539,  0.24143706], dtype=float32),
  array([-0.05739345, -1.3319354 , -0.16669667, ..., -0.38203692,
          1.0114957 , -0.6626356 ], dtype=float32),
  array([ 0.63747954, -0.4723857 , -0.16881032, ...,  0.2664583 ,
         -0.05793906,  0.62554336], dtype=float32),
  array([-0.84563553, -0.9238784 , -0.46759927, ...,  0.08496933,
          0.4714247 ,  1.0036556 ], dtype=float32),
  array([-1.0634193 , -0.79387444, -0.48994142, ..., -0.6941012 ,
          1.1619682 ,  1.326595  ], dtype=float32),
  array([ 1.0586462 , -0.1528958 , -0.3253127 , ..., -0.53674155,
          0.12935603,  0.46072125], dtype=float32),
  array([-0.37856936,  0.9286274 , -0.4845695 , ...,  0.18869597,
         -0.29873446, -0.11265919], dtype=float32),
  array([-0.37856936,  0.9286274 , -0.4845695 , ...,  0.18869597,
         -0.29873446, -0.11265919], dtype=float32),
  array([-0.4258077 , -0.1729173 , -0.46530622, ..., -0.

In [18]:
corpus_embeddings__

[array([-0.40179282,  0.03970386, -0.10575875, ...,  0.1947043 ,
        -1.2919564 , -1.3358579 ], dtype=float32),
 array([ 0.7080846 , -1.0281892 ,  0.34119478, ...,  0.24207094,
        -0.68438125, -0.95924544], dtype=float32),
 array([ 0.73700804,  0.34182876, -0.22426714, ..., -0.60452324,
        -1.0443708 , -0.1647631 ], dtype=float32),
 array([-0.9437948 , -0.618785  , -0.27663583, ..., -0.6655645 ,
        -0.77782977,  1.1266838 ], dtype=float32),
 array([ 0.54013115, -0.49721235, -0.29326084, ..., -0.81509495,
         0.09365091,  0.5037744 ], dtype=float32),
 array([ 0.8987189 , -0.728792  , -0.79590696, ..., -0.00441063,
        -0.7024551 ,  0.5912858 ], dtype=float32),
 array([ 0.5421814 , -0.05501065, -0.48665968, ...,  0.173589  ,
         0.59103554, -0.7315302 ], dtype=float32),
 array([ 0.9519559 , -0.37827626, -0.4089003 , ...,  0.4124198 ,
        -0.9095802 , -0.08201728], dtype=float32),
 array([ 0.06140995,  1.4088616 ,  0.06943528, ..., -0.29875275,
       

In [58]:
d = "ONONDAGA NATION _ Irving Powless Jr., the 69-year-old chief of the Onondaga Indian Nation, longs to see the day when his people enjoy economic stability and independence _ and not from the revenues of a casino or a bingo hall.    Rather, Powless, whose tribe opposes gambling, hopes that a settlement with the state of New York over ancient land claims will bring his brethren the assets they desperately need to upgrade their 7,300-acre reservation south of Syracuse.    ``We have lived in this area for 1,200 years,'' said Powless, a robust man who recalls with pride when, a generation ago, he competed against football great Jim Brown in a lacrosse match and knocked the powerful All-American and Hall of Fame fullback off his feet. ``There was fresh water, fresh air, no pollution.''    Today, the Onondagas are neither financially wealthy nor land rich. Pollution, from a dump and from a petroleum spill at a cigarette shop, has fouled the Indian territory.    ``We're considered poor,'' Powless said. ``How do you measure that? ... How many homeless people do you have in your city? We have zero.''    Members of the tribe have a right to live on the reservation, but many survive on low-wage jobs and welfare _ the consequence of a people ``living in abject poverty'' for 200 years, according to Robert Coulter, a lawyer with the Indian Law Resource Center, a nonprofit group that is helping the Onondagas on their proposed claim.    The Onondagas haven't yet gone to court to stake their land claim, but the nation is building a case against the state of New York that would follow suits filed by other Iroquois nations _ the Cayuga, Mohawk, Oneida and Seneca _ with the federal government as co-plaintiff. Several Indian law and history experts believe the Onondaga case would be very strong.    The tribe has identified a 10-mile-square section of Onondaga County as an area to which it lays claim. The land includes a mile surrounding Onondaga Lake and most of Syracuse, a city of 163,000 in upstate New York. That represents a fraction of the acreage that the Onondagas controlled more than two centuries ago.    Powless said he has been hearing about land claims since he was a boy growing up on the reservation, where his father, Irving Powless Sr., also served as chief. Powless views the reservation as an island representing the hundreds of thousands of acres wrongfully taken by and poorly used by ``the Europeans'' who obtained Indian properties illegally. He smiles as he talks about his boyhood on the Onondaga territory, where he learned to fish, hunt and play sports.    One of the top three officers among the 14 chiefs of the Onondagas, Powless has proven to have a tough side to him. And some Onondaga members feel Powless is too powerful.    ``No one can have anything unless you go through him,'' says Alice Thompson, 64, who lives on the reservation.    Thompson, according to Powless, was part of an insurgent group that wanted to overthrow his government.    A land-claim settlement would be a way to ease some of the tensions that remain. It could pump up the tribal government's popularity, pay for toxic cleanups and help the Onondaga government afford better educational, cultural and social programs. Powless said the Onondaga people want to work with the community outside the reservation to improve the economy of the region, perhaps creating tourism destinations that might include Indian culture, or setting up a free-trade zone at unused manufacturing sites.    The Onondagas say they don't intend to threaten eviction of property owners in whatever suit they file, unlike the Cayugas and Oneidas. But they have not ruled out governmental and commercial property holders as targets for eviction.    ``Maybe they should be subjected to some of the things we were subjected to _ such as taking your land away from you for 200 years,'' Powless said."
startTime = time.time()
for i in range(50):
    doc = nlp(d)
    print(len(doc))
print(time.time() - startTime)

779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
5.294466257095337


In [59]:
ds = [d for i in range(50)]
startTime = time.time()
for doc in nlp.pipe(ds, batch_size=100):
    print(len(doc))
print(time.time() - startTime)

779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
779
4.4055187702178955


In [76]:
startTime = time.time()
sentences = []
for i in range(10):
    d = "ETHETE, Wyo. _ In the face of cutbacks brought about by the overhaul of welfare programs, members of the Northern Arapaho Tribe on the Wind River Reservation have started a seven-acre community garden with donated land, seeds and equipment to grow vegetables for themselves and for the elderly and disabled who cannot work.    ``We were concerned that when time runs out, when they are no longer eligible for government assistance, what are they going to do for food?'' said Glen Revere, a nutritionist with the Indian Health Services on the 2.8 million-acre Wind River Reservation, about 100 miles east of Jackson, Wyo. ``Then we came up with the idea for this community garden, and it's been bigger than we ever expected in so many ways.''    At the all-volunteer garden, this is the time of year when potatoes grow larger than a man's fist, popping out of the soil, and five-foot-high cornstalks develop ears with red, yellow and white kernels ripening inside. The conditions on these wide plains near the Wind River Range of the Rocky Mountains can be harsh, with high winds, late springs and early autumns limit the growing season to 90 days.    Revere's partner in this effort is Irene Houser, director of the Northern Arapaho Tribe Community Services, who has distributed produce around the Wind River Reservation and to the Pine Ridge Reservation in southwestern South Dakota.    Ms. Houser, a member of the Northern Arapaho Tribe, said people were stunned when she brought them bushels of potatoes, onions, squash, zucchini, tomatoes, beets, radishes and other vegetables.    ``They asked, `How much are you going to charge me for this?''' she recalled. ``I said, `It's free,' and they were so happy. Many of them didn't have food in their house when I showed up.''    Mark Soldierwolf, a 70-year-old father of nine and grandfather of nine, said: ``That food lasted us about six weeks. We dried some things, boiled some. Just ate all of it.''    Soldierwolf, a Northern Arapaho who served with the Marines in World War II and the Korean War, lives with his wife, Florita, one daughter and one grandchild. They receive food stamps for a $300-a-month budget for the four of them, but they said they often fed members of their family who stopped by. Soldierwolf estimated they saved $70 with the free vegetables.    Soldierwolf's family represents the problems that plague many of the 1.3 million American Indians who live on reservations, of whom 49 percent are unemployed. The Bureau of Indian Affairs estimates that at least half the American Indian population lives in poverty. Soldierwolf said all but one of his adult children were unemployed.    ``If you can make it to the end of the month, you're all right,'' Soldierwolf said, referring to the monthly food stamps. ``It's worse than in the Depression.'' Mrs. Soldierwolf nodded, with tears in her eyes.    The Wyoming Department of Family Services says the Northern Arapaho on the reservation account for 18.9 percent of all welfare cases in Wyoming. The reservation population of 12,000 _ including 6,000 Northern Arapaho tribal members, about 3,000 Shoshone tribal members and 3,000 from other tribes _ represents about 2.5 percent of the state's population of 480,000.    Revere and Ms. Houser say the community garden can help reduce dependence on welfare by enabling residents to produce cash crops, traditional Indian plants to be sold on or off the reservation.   "
    doc = nlp(d)
    sentences += [sent.text for sent in doc.sents]
    print(time.time() - startTime)
embeddings = embedder.encode(sentences)
print(time.time() - startTime)

0.10372424125671387
0.18318796157836914
0.27687501907348633
0.35820913314819336
0.4519639015197754
0.530048131942749
0.6248779296875
0.718634843826294
0.876420259475708
0.9560348987579346
21.26010513305664


In [67]:
print(len(embeddings))
print(len(sentences))

28
28


In [78]:
startTime = time.time()
sentences = []
for i in range(10):
    d = "ETHETE, Wyo. _ In the face of cutbacks brought about by the overhaul of welfare programs, members of the Northern Arapaho Tribe on the Wind River Reservation have started a seven-acre community garden with donated land, seeds and equipment to grow vegetables for themselves and for the elderly and disabled who cannot work.    ``We were concerned that when time runs out, when they are no longer eligible for government assistance, what are they going to do for food?'' said Glen Revere, a nutritionist with the Indian Health Services on the 2.8 million-acre Wind River Reservation, about 100 miles east of Jackson, Wyo. ``Then we came up with the idea for this community garden, and it's been bigger than we ever expected in so many ways.''    At the all-volunteer garden, this is the time of year when potatoes grow larger than a man's fist, popping out of the soil, and five-foot-high cornstalks develop ears with red, yellow and white kernels ripening inside. The conditions on these wide plains near the Wind River Range of the Rocky Mountains can be harsh, with high winds, late springs and early autumns limit the growing season to 90 days.    Revere's partner in this effort is Irene Houser, director of the Northern Arapaho Tribe Community Services, who has distributed produce around the Wind River Reservation and to the Pine Ridge Reservation in southwestern South Dakota.    Ms. Houser, a member of the Northern Arapaho Tribe, said people were stunned when she brought them bushels of potatoes, onions, squash, zucchini, tomatoes, beets, radishes and other vegetables.    ``They asked, `How much are you going to charge me for this?''' she recalled. ``I said, `It's free,' and they were so happy. Many of them didn't have food in their house when I showed up.''    Mark Soldierwolf, a 70-year-old father of nine and grandfather of nine, said: ``That food lasted us about six weeks. We dried some things, boiled some. Just ate all of it.''    Soldierwolf, a Northern Arapaho who served with the Marines in World War II and the Korean War, lives with his wife, Florita, one daughter and one grandchild. They receive food stamps for a $300-a-month budget for the four of them, but they said they often fed members of their family who stopped by. Soldierwolf estimated they saved $70 with the free vegetables.    Soldierwolf's family represents the problems that plague many of the 1.3 million American Indians who live on reservations, of whom 49 percent are unemployed. The Bureau of Indian Affairs estimates that at least half the American Indian population lives in poverty. Soldierwolf said all but one of his adult children were unemployed.    ``If you can make it to the end of the month, you're all right,'' Soldierwolf said, referring to the monthly food stamps. ``It's worse than in the Depression.'' Mrs. Soldierwolf nodded, with tears in her eyes.    The Wyoming Department of Family Services says the Northern Arapaho on the reservation account for 18.9 percent of all welfare cases in Wyoming. The reservation population of 12,000 _ including 6,000 Northern Arapaho tribal members, about 3,000 Shoshone tribal members and 3,000 from other tribes _ represents about 2.5 percent of the state's population of 480,000.    Revere and Ms. Houser say the community garden can help reduce dependence on welfare by enabling residents to produce cash crops, traditional Indian plants to be sold on or off the reservation.   "
    doc = nlp(d)
    sentences += [sent for sent in doc.sents]
    print(time.time() - startTime)
print(time.time() - startTime)

0.1017310619354248
0.19051337242126465
0.278278112411499
0.3710050582885742
0.4607667922973633
0.5448489189147949
0.6385982036590576
0.7283575534820557
0.8181192874908447
0.9158577919006348
0.9168541431427002


In [81]:
startTime = time.time()
sentences = []
d = "ETHETE, Wyo. _ In the face of cutbacks brought about by the overhaul of welfare programs, members of the Northern Arapaho Tribe on the Wind River Reservation have started a seven-acre community garden with donated land, seeds and equipment to grow vegetables for themselves and for the elderly and disabled who cannot work.    ``We were concerned that when time runs out, when they are no longer eligible for government assistance, what are they going to do for food?'' said Glen Revere, a nutritionist with the Indian Health Services on the 2.8 million-acre Wind River Reservation, about 100 miles east of Jackson, Wyo. ``Then we came up with the idea for this community garden, and it's been bigger than we ever expected in so many ways.''    At the all-volunteer garden, this is the time of year when potatoes grow larger than a man's fist, popping out of the soil, and five-foot-high cornstalks develop ears with red, yellow and white kernels ripening inside. The conditions on these wide plains near the Wind River Range of the Rocky Mountains can be harsh, with high winds, late springs and early autumns limit the growing season to 90 days.    Revere's partner in this effort is Irene Houser, director of the Northern Arapaho Tribe Community Services, who has distributed produce around the Wind River Reservation and to the Pine Ridge Reservation in southwestern South Dakota.    Ms. Houser, a member of the Northern Arapaho Tribe, said people were stunned when she brought them bushels of potatoes, onions, squash, zucchini, tomatoes, beets, radishes and other vegetables.    ``They asked, `How much are you going to charge me for this?''' she recalled. ``I said, `It's free,' and they were so happy. Many of them didn't have food in their house when I showed up.''    Mark Soldierwolf, a 70-year-old father of nine and grandfather of nine, said: ``That food lasted us about six weeks. We dried some things, boiled some. Just ate all of it.''    Soldierwolf, a Northern Arapaho who served with the Marines in World War II and the Korean War, lives with his wife, Florita, one daughter and one grandchild. They receive food stamps for a $300-a-month budget for the four of them, but they said they often fed members of their family who stopped by. Soldierwolf estimated they saved $70 with the free vegetables.    Soldierwolf's family represents the problems that plague many of the 1.3 million American Indians who live on reservations, of whom 49 percent are unemployed. The Bureau of Indian Affairs estimates that at least half the American Indian population lives in poverty. Soldierwolf said all but one of his adult children were unemployed.    ``If you can make it to the end of the month, you're all right,'' Soldierwolf said, referring to the monthly food stamps. ``It's worse than in the Depression.'' Mrs. Soldierwolf nodded, with tears in her eyes.    The Wyoming Department of Family Services says the Northern Arapaho on the reservation account for 18.9 percent of all welfare cases in Wyoming. The reservation population of 12,000 _ including 6,000 Northern Arapaho tribal members, about 3,000 Shoshone tribal members and 3,000 from other tribes _ represents about 2.5 percent of the state's population of 480,000.    Revere and Ms. Houser say the community garden can help reduce dependence on welfare by enabling residents to produce cash crops, traditional Indian plants to be sold on or off the reservation.   "
sents = sent_tokenize(d)
for i in range(10):
    sentences += [nlp(sent) for sent in sents]
    print(time.time() - startTime)
print(time.time() - startTime)

0.24507784843444824
0.503169059753418
0.7461459636688232
0.9937424659729004
1.2681310176849365
1.5123393535614014
1.7539172172546387
1.9970405101776123
2.243406295776367
2.5046820640563965
2.5046820640563965


In [141]:
model = gensim.models.KeyedVectors.load_word2vec_format('C:/Users/user/Code/W2V/GoogleNews-vectors-negative300.bin.gz', binary=True)
#download('punkt') #tokenizer, run once
#download('stopwords') #stopwords dictionary, run once
stop_words = stopwords.words('english')

KeyboardInterrupt: 

In [229]:
def preprocess(text):
    text = text.lower()
    sentences = sent_tokenize(text)
    docTokens = [[t for t in word_tokenize(s) if t not in stop_words and t[-1] not in string.punctuation] for s in sentences]
    #docTokens = [[t for t in word_tokenize(s) if t[-1] not in string.punctuation] for s in sentences]
    
    #doc = [word for word in doc if word not in stop_words]
    #doc = [word for word in doc if word.isalpha()] #restricts string to alphabetic characters only
    return docTokens

def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    if len(doc) > 0:
        return np.mean(word2vec_model[doc], axis=0)
    return None

#d = "ETHETE, Wyo. _ In the face of cutbacks brought about by the overhaul of welfare programs, members of the Northern Arapaho Tribe on the Wind River Reservation have started a seven-acre community garden with donated land, seeds and equipment to grow vegetables for themselves and for the elderly and disabled who cannot work.    ``We were concerned that when time runs out, when they are no longer eligible for government assistance, what are they going to do for food?'' said Glen Revere, a nutritionist with the Indian Health Services on the 2.8 million-acre Wind River Reservation, about 100 miles east of Jackson, Wyo. ``Then we came up with the idea for this community garden, and it's been bigger than we ever expected in so many ways.''    At the all-volunteer garden, this is the time of year when potatoes grow larger than a man's fist, popping out of the soil, and five-foot-high cornstalks develop ears with red, yellow and white kernels ripening inside. The conditions on these wide plains near the Wind River Range of the Rocky Mountains can be harsh, with high winds, late springs and early autumns limit the growing season to 90 days.    Revere's partner in this effort is Irene Houser, director of the Northern Arapaho Tribe Community Services, who has distributed produce around the Wind River Reservation and to the Pine Ridge Reservation in southwestern South Dakota.    Ms. Houser, a member of the Northern Arapaho Tribe, said people were stunned when she brought them bushels of potatoes, onions, squash, zucchini, tomatoes, beets, radishes and other vegetables.    ``They asked, `How much are you going to charge me for this?''' she recalled. ``I said, `It's free,' and they were so happy. Many of them didn't have food in their house when I showed up.''    Mark Soldierwolf, a 70-year-old father of nine and grandfather of nine, said: ``That food lasted us about six weeks. We dried some things, boiled some. Just ate all of it.''    Soldierwolf, a Northern Arapaho who served with the Marines in World War II and the Korean War, lives with his wife, Florita, one daughter and one grandchild. They receive food stamps for a $300-a-month budget for the four of them, but they said they often fed members of their family who stopped by. Soldierwolf estimated they saved $70 with the free vegetables.    Soldierwolf's family represents the problems that plague many of the 1.3 million American Indians who live on reservations, of whom 49 percent are unemployed. The Bureau of Indian Affairs estimates that at least half the American Indian population lives in poverty. Soldierwolf said all but one of his adult children were unemployed.    ``If you can make it to the end of the month, you're all right,'' Soldierwolf said, referring to the monthly food stamps. ``It's worse than in the Depression.'' Mrs. Soldierwolf nodded, with tears in her eyes.    The Wyoming Department of Family Services says the Northern Arapaho on the reservation account for 18.9 percent of all welfare cases in Wyoming. The reservation population of 12,000 _ including 6,000 Northern Arapaho tribal members, about 3,000 Shoshone tribal members and 3,000 from other tribes _ represents about 2.5 percent of the state's population of 480,000.    Revere and Ms. Houser say the community garden can help reduce dependence on welfare by enabling residents to produce cash crops, traditional Indian plants to be sold on or off the reservation.   "
d1 = "I don't have a dog. The cat outside is black and fluffy. The dog I have is not so big. The cat I have is small. The cat I have is big. The cats I have are big. The black cats I have are big."
corpus = [preprocess(d1) for i in range(10)]
sentenceEmbeddings = []
for docIdx, docTokens in enumerate(corpus):
    sentenceEmbeddings.append([])
    for sent in docTokens:
        sentenceEmbeddings[docIdx].append(document_vector(model, sent))

In [230]:
d_spacy = nlp(d1)
sents_spacy = [s for s in d_spacy.sents]
for i in range(len(corpus[0])):
    for j in range(i+1, len(corpus[0])):
        print(corpus[0][i])
        print(corpus[0][j])
        print(sklearn.metrics.pairwise.cosine_similarity([sentenceEmbeddings[0][i], sentenceEmbeddings[0][j]])[0][1])
        print(sents_spacy[i].similarity(sents_spacy[j]))
        print(sklearn.metrics.pairwise.cosine_similarity([sents_spacy[i].vector, sents_spacy[j].vector])[0][1])
        print('---')

["n't", 'dog']
['cat', 'outside', 'black', 'fluffy']
0.48444045
0.80493516
0.8049351
---
["n't", 'dog']
['dog', 'big']
0.54372936
0.9495517
0.9495516
---
["n't", 'dog']
['cat', 'small']
0.4669138
0.92193633
0.9219362
---
["n't", 'dog']
['cat', 'big']
0.4360654
0.9304207
0.9304208
---
["n't", 'dog']
['cats', 'big']
0.40971175
0.92046446
0.9204642
---
["n't", 'dog']
['black', 'cats', 'big']
0.41645414
0.9025238
0.902524
---
['cat', 'outside', 'black', 'fluffy']
['dog', 'big']
0.6123663
0.8694334
0.8694329
---
['cat', 'outside', 'black', 'fluffy']
['cat', 'small']
0.73846847
0.8926117
0.89261144
---
['cat', 'outside', 'black', 'fluffy']
['cat', 'big']
0.69621253
0.8831175
0.88311756
---
['cat', 'outside', 'black', 'fluffy']
['cats', 'big']
0.61127144
0.84373266
0.8437325
---
['cat', 'outside', 'black', 'fluffy']
['black', 'cats', 'big']
0.7786544
0.88210356
0.8821036
---
['dog', 'big']
['cat', 'small']
0.7053906
0.9619369
0.96193695
---
['dog', 'big']
['cat', 'big']
0.8590902
0.97700256
0

In [228]:
sentenceEmbeddings

[[None,
  array([ 2.91900635e-02,  6.72828332e-02,  4.68800850e-02,  9.51004028e-02,
         -4.73480225e-02, -5.74289970e-02, -2.12402344e-02, -1.55614214e-02,
          3.65702319e-03,  2.53747310e-02,  7.86888599e-02, -6.73053265e-02,
         -7.14403763e-02,  5.43187447e-02, -8.24604034e-02, -2.90171299e-02,
         -2.54096985e-02,  1.14121757e-01,  6.98254928e-02,  4.23812866e-02,
          2.06486392e-03,  7.57783279e-02,  7.46968612e-02,  3.84305306e-02,
          4.83309440e-02, -7.80560151e-02, -1.61763504e-01, -1.86258946e-02,
         -1.97690334e-02, -1.14730835e-01, -7.00429305e-02, -1.84122715e-02,
         -8.74628648e-02, -6.11572266e-02,  5.22778817e-02, -8.08741227e-02,
          1.67280827e-02, -5.84513359e-02,  2.63214111e-02,  6.35223389e-02,
          5.71187353e-03, -9.27352905e-03,  6.51156083e-02, -8.86789989e-03,
         -7.05057755e-02, -1.93483993e-01, -1.32672623e-01, -2.29066219e-02,
         -7.31519088e-02, -1.03060408e-02,  4.14811783e-02, -4.30984

In [221]:
#spacyVectors = {}
#def document_vector_spacy(doc):
#    tokenVectors = []
#    for word in doc:
#        if word not in spacyVectors:
#            spacyVectors[word] = nlp(word).vector
#        tokenVectors.append(spacyVectors[word])
#    return np.mean(tokenVectors)

def document_vector_spacy(doc):
    #for w in doc:
    #    if nlp.vocab.has_vector(w):
    #       print(nlp.vocab.get_vector(w))
    
    return np.mean([nlp.vocab.get_vector(w) for w in doc if nlp.vocab.has_vector(w)], axis=0)

    
start = time.time()
sentenceEmbeddings = []
for docIdx, docTokens in enumerate(corpus):
    sentenceEmbeddings.append([])
    for sent in docTokens:
        sentenceEmbeddings[docIdx].append(document_vector_spacy(sent))
    #print(sentenceEmbeddings[docIdx])
    print(time.time() - start)
print(time.time() - start)

#print(sentenceEmbeddings[0])

for i in range(len(corpus[0])):
    for j in range(i+1, len(corpus[0])):
        #print(sentenceEmbeddings[0][i])
        #print(sentenceEmbeddings[0][j])
        #if sentenceEmbeddings[0][i] != 'nan' and sentenceEmbeddings[0][j] != 'nan':
        sim = sklearn.metrics.pairwise.cosine_similarity([sentenceEmbeddings[0][i], sentenceEmbeddings[0][j]])[0][1]
        if sim > 0.8:
            print(corpus[0][i])
            print(corpus[0][j])
            print(sim)
            print('---')

0.006979942321777344
0.013961553573608398
0.018947362899780273
0.02396249771118164
0.02894425392150879
0.033907413482666016
0.039914846420288086
0.04385018348693848
0.04883766174316406
0.052850961685180664
0.052850961685180664
['face', 'cutbacks', 'brought', 'overhaul', 'welfare', 'programs', 'members', 'northern', 'arapaho', 'tribe', 'wind', 'river', 'reservation', 'started', 'seven-acre', 'community', 'garden', 'donated', 'land', 'seeds', 'equipment', 'grow', 'vegetables', 'elderly', 'disabled', 'work']
['concerned', 'time', 'runs', 'longer', 'eligible', 'government', 'assistance', 'going', 'food']
0.82152593
---
['face', 'cutbacks', 'brought', 'overhaul', 'welfare', 'programs', 'members', 'northern', 'arapaho', 'tribe', 'wind', 'river', 'reservation', 'started', 'seven-acre', 'community', 'garden', 'donated', 'land', 'seeds', 'equipment', 'grow', 'vegetables', 'elderly', 'disabled', 'work']
['came', 'idea', 'community', 'garden', "'s", 'bigger', 'ever', 'expected', 'many', 'ways']
0

In [222]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [235]:
d_spacy

I don't have a dog. The cat outside is black and fluffy. The dog I have is not so big. The cat I have is small. The cat I have is big. The cats I have are big. The black cats I have are big.

In [239]:
print(len(word_tokenize(d_spacy.text)))
print(len(d_spacy))

53
53
