Skip to content

Commit 6e977db

Browse files
authored
Add files via upload
1 parent 67de212 commit 6e977db

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+572549
-0
lines changed

more/nlp_class/article_spinner.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# Very basic article spinner for NLP class, which can be found at:
2+
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
3+
# https://www.udemy.com/data-science-natural-language-processing-in-python
4+
5+
# Author: http://lazyprogrammer.me
6+
7+
# A very bad article spinner using trigrams.
8+
from __future__ import print_function, division
9+
from future.utils import iteritems
10+
from builtins import range
11+
# Note: you may need to update your version of future
12+
# sudo pip install -U future
13+
14+
15+
import nltk
16+
import random
17+
import numpy as np
18+
19+
from bs4 import BeautifulSoup
20+
21+
22+
# load the reviews
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
# Use a context manager so the file handle is closed promptly instead of
# leaking until interpreter exit; find_all is the modern bs4 spelling of
# the deprecated findAll alias (identical behavior).
with open('electronics/positive.review') as f:
    positive_reviews = BeautifulSoup(f.read())
positive_reviews = positive_reviews.find_all('review_text')
26+
27+
28+
# Collect candidate middle words for every (left, right) context pair.
# Key: (w1, w3); value: list of every w2 observed between them, with
# repeats kept so frequencies can be derived later.
trigrams = {}
for review in positive_reviews:
    words = nltk.tokenize.word_tokenize(review.text.lower())
    for left, mid, right in zip(words, words[1:], words[2:]):
        trigrams.setdefault((left, right), []).append(mid)
39+
40+
# Turn each list of middle words into a word -> probability mapping.
# Contexts with a single distinct middle word are skipped: spinning
# there could never change anything.
trigram_probabilities = {}
for context, middles in iteritems(trigrams):
    if len(set(middles)) <= 1:
        continue
    # count occurrences of each middle word
    counts = {}
    for w in middles:
        counts[w] = counts.get(w, 0) + 1
    total = len(middles)
    # normalize counts into probabilities
    trigram_probabilities[context] = {
        w: float(c) / total for w, c in iteritems(counts)
    }
56+
57+
58+
def random_sample(d):
    """Draw a random key from d, where the values are probabilities.

    d maps word -> probability; the probabilities are expected to sum
    to (approximately) 1. Returns the sampled key.
    """
    r = random.random()
    cumulative = 0.0
    last = None
    for w, p in d.items():
        cumulative += p
        last = w
        if r < cumulative:
            return w
    # Float rounding can leave the cumulative sum fractionally below r
    # even when the probabilities "sum to 1"; the original code then fell
    # off the loop and implicitly returned None. Fall back to the last
    # key so a valid sample is always produced.
    return last
66+
67+
68+
def test_spinner():
    """Spin one random positive review and print before/after.

    Each interior token has a 20% chance of being replaced by a middle
    word sampled from trigram_probabilities for its (left, right) context.
    """
    review = random.choice(positive_reviews)
    s = review.text.lower()
    print("Original:", s)
    tokens = nltk.tokenize.word_tokenize(s)
    for i in range(len(tokens) - 2):
        if random.random() < 0.2: # 20% chance of replacement
            k = (tokens[i], tokens[i+2])
            if k in trigram_probabilities:
                w = random_sample(trigram_probabilities[k])
                # random_sample can return None when float rounding leaves
                # the cumulative probability just below the drawn value;
                # keep the original token rather than inserting None, which
                # would crash " ".join(tokens) below.
                if w is not None:
                    tokens[i+1] = w
    print("Spun:")
    print(" ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!"))
81+
82+
83+
# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    test_spinner()
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
2+
# https://www.udemy.com/data-science-natural-language-processing-in-python
3+
4+
# Author: http://lazyprogrammer.me
5+
6+
# Get the data from here:
7+
# https://lazyprogrammer.me/course_files/moby_dick.txt
8+
9+
### encode a message
10+
11+
# this is a random excerpt from Project Gutenberg's
12+
# The Adventures of Sherlock Holmes, by Arthur Conan Doyle
13+
# https://www.gutenberg.org/ebooks/1661
14+
15+
original_message = '''I then lounged down the street and found,
16+
as I expected, that there was a mews in a lane which runs down
17+
by one wall of the garden. I lent the ostlers a hand in rubbing
18+
down their horses, and received in exchange twopence, a glass of
19+
half-and-half, two fills of shag tobacco, and as much information
20+
as I could desire about Miss Adler, to say nothing of half a dozen
21+
other people in the neighbourhood in whom I was not in the least
22+
interested, but whose biographies I was compelled to listen to.
23+
'''

0 commit comments

Comments
 (0)