In [52]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Shared Treebank detokenizer; the main script calls its detokenize() to join
# each token list back into a single string.
detokenizer = TreebankWordDetokenizer()
# Shared English Snowball stemmer; punnify_sentence calls its stem() to detect
# word pairs that reduce to the same stem.
stemmer = SnowballStemmer("english")

def get_synonyms(word):
    """Return the WordNet synonyms of ``word``, excluding ``word`` itself.

    Collects the lemma names of every synset WordNet returns for ``word``,
    drops duplicates, and filters out the exact string ``word``.

    Args:
        word: The word to look up.

    Returns:
        A list of unique lemma-name strings in first-seen order. The list is
        empty when WordNet returns no synsets for ``word``.
    """
    lemma_names = (
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is stable from run to run (iterating a set of strings is not). Filtering
    # by inequality replaces the old remove()/bare-except pair without hiding
    # unrelated errors.
    return [name for name in dict.fromkeys(lemma_names) if name != word]

def punnify_sentence(sentence, use_synonyms=False, is_causal=False, destroy_punless=False, explain=False):
    """Replace words of a tokenized sentence with puns built from its other words.

    Every token is tried as a target; every other token of at least four
    characters (optionally with its synonyms) is tried as a source to embed
    into that target via ``embed_word``. When several sources fit the same
    target, the last successful one is kept. Progress is printed to stdout.

    Args:
        sentence: Sequence of token strings. It is not modified.
        use_synonyms: Also try the ``get_synonyms`` results of each source word.
        is_causal: Only use source words at or before the target's position.
        destroy_punless: Return ``[]`` when no pun was produced.
        explain: Append an explanation marker to every punned token.

    Returns:
        A new list of tokens with punned replacements, or ``[]`` when
        ``destroy_punless`` is set and nothing was replaced.
    """
    # Read from an untouched snapshot and write into a separate copy; otherwise
    # already-punned tokens (and their explanation text) would be picked up as
    # source words for later targets.
    original = list(sentence)
    punned = list(original)
    # Stem each token once instead of re-stemming both words for every pair.
    stems = [stemmer.stem(token) for token in original]
    has_pun = False

    for i, tgt_word in enumerate(original):
        for j, src_word in enumerate(original):
            # If causal is true, only punnify with source words already read
            if is_causal and j > i:
                continue
            # Don't punnify when the source word is too short
            if len(src_word) <= 3:
                continue
            # Don't punnify when the words come from same root words
            if stems[j] == stems[i]:
                continue
            ## TODO: Use part of speech tagging to skip certain parts of speech

            # Only pay for the WordNet lookup when synonyms are actually wanted.
            candidate_sources = [src_word]
            if use_synonyms:
                candidate_sources += get_synonyms(src_word)

            for candidate_source in candidate_sources:
                embeddable, result = embed_word(candidate_source, tgt_word)
                if embeddable:
                    print(src_word, tgt_word)
                    # NOTE(review): "Orig" carries the embedded word and "Source"
                    # the replaced word, which looks swapped -- confirm the
                    # intended wording before changing these strings.
                    if candidate_source == src_word:
                        explanation = f" ((Orig={src_word}, Source={tgt_word})) "
                    else:
                        explanation = f" ((Orig={src_word}, Source={tgt_word}, Embedded={candidate_source})) "
                    punned[i] = result + (explanation if explain else "")
                    has_pun = True

            # The target itself is one of the source word's candidates (only
            # reachable through synonyms, since equal stems were skipped above).
            if tgt_word in candidate_sources:
                embeddable, result = embed_word(src_word, tgt_word)
                if embeddable:
                    print(result)
                print("Intrasentence Synonym found:", src_word, tgt_word)

    if not has_pun and destroy_punless:
        return []
    return punned

def embed_word(src, tgt, min_src_len=4):
    """Try to splice ``src`` into ``tgt`` over a near-identical run of letters.

    Slides ``src`` across ``tgt`` from left to right and stops at the first
    offset where at most one character differs (comparison is case-sensitive).
    The overlapped stretch of ``tgt`` is replaced by ``src`` in upper case, and
    the match is printed to stdout.

    Args:
        src: Word to embed.
        tgt: Word to embed into; must be strictly longer than ``src``.
        min_src_len: Minimum length ``src`` needs in order to be considered.

    Returns:
        ``(True, punned_word)`` for the first matching offset, otherwise
        ``(False, None)``.
    """
    src_len = len(src)
    # An empty src has nothing to embed. The original code hit a NameError here
    # (with min_src_len=0) because it sliced with a loop variable that was never
    # bound.
    if src_len == 0 or src_len < min_src_len or len(tgt) <= src_len:
        return False, None

    for start in range(len(tgt) - src_len + 1):
        end = start + src_len
        matches = sum(a == b for a, b in zip(src, tgt[start:end]))
        # Allow at most one mismatching character inside the window.
        if matches >= src_len - 1:
            punned = tgt[:start] + src.upper() + tgt[end:]
            print("Embed:", src, tgt, punned)
            return True, punned

    return False, None

def flatten(listt):
    """Concatenate the items of every sub-iterable in ``listt`` into one flat list."""
    flat = []
    for group in listt:
        flat.extend(group)
    return flat

# sentence = ["The", "lad", "was", "a", "happy", "grad", "student", "who", "ate", "potatoes", "glad", "gladly"]

### Main Code
# Read the whole input up front. The context manager closes the handle, which
# the original left open and then lost when `f` was rebound for the output file.
with open("input.txt", "r", encoding="utf8") as f:
    string = f.read()

# One list of word tokens per sentence.
tokenized = [word_tokenize(sent) for sent in sent_tokenize(string)]
print(tokenized)

# Options forwarded to punnify_sentence for every sentence.
use_synonyms = False
is_causal = False
destroy_punless = False
explain = False

punnified = [punnify_sentence(sent,
                              use_synonyms=use_synonyms,
                              is_causal=is_causal,
                              destroy_punless=destroy_punless,
                              explain=explain) for sent in tokenized]

# Sentences are separated by a blank line in the output text.
detokenized = "\n\n".join(detokenizer.detokenize(tokens) for tokens in punnified)

# The context manager closes the output file even if the write raises.
with open("output.txt", "w", encoding="utf8") as f:
    f.write(detokenized)

print(detokenized)


[['or', 'decades', ',', 'a', 'family', 'of', 'crystals', 'has', 'stumped', 'physicists', 'with', 'its', 'baffling', 'ability', 'to', 'superconduct', '—', 'that', 'is', ',', 'carry', 'an', 'electric', 'current', 'without', 'any', 'resistance', '—', 'at', 'far', 'warmer', 'temperatures', 'than', 'other', 'materials', '.'], ['Now', ',', 'an', 'experiment', 'years', 'in', 'the', 'making', 'has', 'directly', 'visualized', 'superconductivity', 'on', 'the', 'atomic', 'scale', 'in', 'one', 'of', 'these', 'crystals', ',', 'finally', 'revealing', 'the', 'cause', 'of', 'the', 'phenomenon', 'to', 'nearly', 'everyone', '’', 's', 'satisfaction', '.'], ['Electrons', 'appear', 'to', 'nudge', 'each', 'other', 'into', 'a', 'frictionless', 'flow', 'in', 'a', 'manner', 'first', 'suggested', 'by', 'a', 'venerable', 'theory', 'nearly', 'as', 'old', 'as', 'the', 'mystery', 'itself', '.'], ['“', 'This', 'evidence', 'is', 'really', 'beautiful', 'and', 'direct', ',', '”', 'said', 'Subir', 'Sachdev', ',', 'a', '

Embed: they their THEYr
they their
Embed: with without WITHout
with without
Embed: 1957 mid-1950s mid-1957s
1957 mid-1950s
Embed: their theory THEIRy
their theory
Embed: rows electrons electROWS
rows electrons
Embed: come becomes beCOMEs
come becomes
Embed: also absolute ALSOlute
also absolute
Embed: they others oTHEYs
they others
Embed: then others oTHENs
then others
Embed: late laureate laurLATE
late laureate
Embed: after matter mAFTER
after matter
Embed: moment momentum MOMENTum
moment momentum
Embed: lower lower-energy LOWER-energy
lower lower-energy
Embed: that situations siTHATions
that situations
Embed: when between betWHEN
when between
Embed: stay distance diSTAYce
stay distance
Embed: they theories THEYries
they theories
Embed: Cork Cornell CORKell
Cork Cornell
Embed: level developed LEVELoped
level developed
Embed: they rather raTHEY
they rather
Embed: their theorists THEIRists
their theorists
Embed: they theorists THEYrists
they theorists
Embed: they their THEYr
they their
E

In [15]:
get_synonyms("lack")

['miss', 'deficiency', 'want']

In [21]:
tokenized

[['The',
  'passage',
  'was',
  'a',
  'long',
  'one',
  ',',
  'and',
  'seemed',
  'to',
  'pervade',
  'the',
  'whole',
  'square',
  'basement',
  'of',
  'the',
  'Manor',
  'House',
  '.'],
 ['We',
  'traversed',
  'but',
  'one',
  'side',
  'of',
  'the',
  'square',
  ',',
  'however',
  ',',
  'and',
  'at',
  'the',
  'end',
  'of',
  'it',
  'she',
  'stopped',
  ',',
  'and',
  'put',
  'her',
  'candle',
  'down',
  'and',
  'opened',
  'a',
  'door',
  '.'],
 ['Here',
  ',',
  'the',
  'daylight',
  'reappeared',
  ',',
  'and',
  'I',
  'found',
  'myself',
  'in',
  'a',
  'small',
  'paved',
  'courtyard',
  ',',
  'the',
  'opposite',
  'side',
  'of',
  'which',
  'was',
  'formed',
  'by',
  'a',
  'detached',
  'dwelling-house',
  ',',
  'that',
  'looked',
  'as',
  'if',
  'it',
  'had',
  'once',
  'belonged',
  'to',
  'the',
  'manager',
  'or',
  'head',
  'clerk',
  'of',
  'the',
  'extinct',
  'brewery',
  '.'],
 ['There',
  'was',
  'a',
  'clock',
  

In [None]:

# from nltk.corpus import reuters
# from gensim.test.utils import common_texts, get_tmpfile
# from gensim.models import Word2Vec

# sentences = reuters.sents()
# model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)
# model.save('reuters_model')

# model = Word2Vec.load('reuters_model')
#
# result = model.most_similar(positive=['boat'], topn=10)
# print(result)