In [2]:
#Here is a program to test vector search. It is currently set up to read in a .txt file of the 1000 most common English words and perform sentiment analysis on
#each of these words. It assigns each word 4 coordinates based on this sentiment analysis and gives each word a position within an array. After forming the array,
#the program prompts the user to enter a word, performs sentiment analysis on that word, and then finds its 5 nearest neighbors using a KDTree.

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np
from numpy import array

#set up sentiment analysis
#a single analyzer instance is created here and shared by vectorize() and sent_analysis() below
#NOTE(review): presumably requires NLTK's 'vader_lexicon' resource to have been downloaded already - confirm
sia = SentimentIntensityAnalyzer()

#vectorize a long list of words. Returns dict where key = word and value = coordinates.
def vectorize(list):
    """Map every word in ``list`` to its sentiment coordinates.

    Each word is scored through ``sent_analysis`` - the same helper used for
    the user's query word - so the vectors stored in the index and the vectors
    used to query it always share one coordinate layout:
    [positive, neutral, negative, compound].

    Args:
        list: iterable of words to score. (The name shadows the builtin
            inside this function; it is kept so existing callers still work.)

    Returns:
        dict mapping each word to its list of 4 coordinates. A word that
        appears more than once yields a single entry.
    """
    return {item: sent_analysis(item) for item in list}

#vectorize a single word (user input). Returns list of coordinates.
def sent_analysis(str):
    """Score one piece of text and return its 4 sentiment coordinates.

    The text is passed to ``sia.polarity_scores`` and the resulting scores are
    returned as a list ordered [positive, neutral, negative, compound].

    Args:
        str: the text to score. (The name shadows the builtin inside this
            function; it is kept so existing callers still work.)

    Returns:
        list of the four scores in the order given above.
    """
    scores = sia.polarity_scores(str)
    #coordinate order: positive, neutral, negative, compound
    return [scores[key] for key in ('pos', 'neu', 'neg', 'compound')]

In [3]:
from sklearn.neighbors import KDTree

#specify filename
file_name = "common_words.txt"

#open file, read contents, put them in a list (one word per line).
#The encoding is given explicitly so the result does not depend on the platform default,
#and blank lines are skipped so that the empty string is never indexed as a "word".
with open(file_name, 'r', encoding='utf-8') as file:
    file_contents = [word for word in (line.strip() for line in file) if word]

#fail early with a clear message instead of an obscure error when building the tree
if not file_contents:
    raise ValueError(f"{file_name} contains no words")

#vectorize the list of items in the file
element_dict = vectorize(file_contents)

#this list contains only the keys (i.e. the words themselves)
words = list(element_dict.keys())

#this array contains only the values (i.e. the coordinates);
#row i of vectors belongs to words[i] because both come from the same dict
vectors = np.array(list(element_dict.values()))

#create a KDTree based on the array of vectors
tree = KDTree(vectors, leaf_size=40)
    

In [29]:
#how many neighbors to report for the word the user enters
K_NEAREST = 5

#prompt user for a new word
new_word = input("New Word: ")

#perform sentiment analysis on new word to get coordinates
#(wrapped in an outer list so the query is a 2-D array holding a single point)
new_word_coordinates = np.array([sent_analysis(new_word)])
print(f"the coordinates of {new_word} are {new_word_coordinates}")


# Find the indices of the nearest vectors.
# The tree cannot return more neighbors than it holds points, so clamp k to the index size.
query_k = min(K_NEAREST, len(words))
distances, indices = tree.query(new_word_coordinates, k=query_k)
nearest_word_indices = indices[0]
nearest_words = [words[i] for i in nearest_word_indices]

#print the nearest vectors
print(f"The {query_k} nearest words to {new_word} and their distances:")
for rank, (word, distance) in enumerate(zip(nearest_words, distances[0]), start=1):
    print(f"{rank}: {word} (Distance: {distance})")

the coordinates of odd are [[ 0.      0.      1.     -0.3182]]
The 5 nearest words to odd and their distances:
1: lost (Distance: 0.0)
2: fire (Distance: 0.02180000000000004)
3: gun (Distance: 0.02180000000000004)
4: stop (Distance: 0.022199999999999998)
5: no (Distance: 0.022199999999999998)


In [55]:
import random


import time
from imsg import *

#NOTE(review): the recipient number is hard-coded; consider loading it from configuration before sharing this notebook
NUMBER = '2103050549'
SPACER = ''
#convert, stacker and send_message are presumably provided by the wildcard import from imsg - confirm
STACK = [convert]
REPEATS = 6
#seconds to wait between two consecutive messages
INTERVAL = 3600
#sent_analysis returns [positive, neutral, negative, compound]; index 2 is the negative score
NEGATIVE_INDEX = 2
NEGATIVE_THRESHOLD = 0.5
#number of neighboring words placed in each message
NEIGHBORS_PER_MESSAGE = 47

#words whose negative score exceeds the threshold; each message starts from one of these
bigwords = [
    word for word in file_contents
    if sent_analysis(word)[NEGATIVE_INDEX] > NEGATIVE_THRESHOLD
]

#random.choice fails on an empty list with an unhelpful message; report the real cause instead
if not bigwords:
    raise ValueError("no word exceeds the sentiment threshold, nothing to send")

#the tree cannot return more neighbors than it holds points, so clamp k to the index size
message_k = min(NEIGHBORS_PER_MESSAGE, len(words))

for i in range(REPEATS):
    startword = random.choice(bigwords)
    startword_coords = np.array([sent_analysis(startword)])
    dist, idx = tree.query(startword_coords, k=message_k)
    nearest_word_indices = idx[0]
    nearest_words = [words[j] for j in nearest_word_indices]
    msg = ' '.join(nearest_words)
    #joining a string iterates over its characters, so this puts SPACER between every character
    send_message(NUMBER, stacker(SPACER.join(msg), *STACK))
    #only wait between messages; sleeping after the last one would just block the kernel
    if i < REPEATS - 1:
        time.sleep(INTERVAL)

KeyboardInterrupt: 

In [47]:
#display the current contents of bigwords
#NOTE(review): this cell's execution count (In [47]) is lower than that of the cell that builds bigwords (In [55]),
#so the saved output below may come from an earlier definition of bigwords - re-run to confirm
bigwords


['like',
 'good',
 'great',
 'help',
 'well',
 'play',
 'hand',
 'kind',
 'ease',
 'care',
 'friend',
 'sure',
 'ready',
 'best',
 'better',
 'true',
 'interest',
 'love',
 'certain',
 'beauty',
 'free',
 'strong',
 'special',
 'clear',
 'laugh',
 'yes',
 'grand',
 'energy',
 'wish',
 'joy',
 'fun',
 'bright',
 'happy',
 'hope',
 'safe',
 'value',
 'excite',
 'natural',
 'surprise',
 'cool',
 'smile',
 'join',
 'clean',
 'fit',
 'fair',
 'save',
 'gentle',
 'please',
 'protect',
 'party',
 'agree',
 'rich',
 'create',
 'fresh',
 'success',
 'pretty',
 'solution',
 'thank',
 'huge',
 'win',
 'favor',
 'glad',
 'original',
 'share',
 'dear',
 'support']

In [56]:
#spot-check: score a single word; the result is ordered [positive, neutral, negative, compound]
sent_analysis('acetal')

[0.0, 1.0, 0.0, 0.0]