In [1]:
import numpy as np
import guidedlda
from sklearn.feature_extraction.text import CountVectorizer
import operator

# X = guidedlda.datasets.load_data(guidedlda.datasets.NYT)
# vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)

# Load the corpus: one document per line, surrounding whitespace removed.
with open('forevermissed_story_final.txt', 'r') as rows:
    data = [line.strip() for line in rows]

# Build the document-term count matrix.
# X has dimensions (num documents, length of vocab); X[i, j] is the number of
# times word j appears in document i.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)
print(X[0:50])

# freq[j] is the number of times word j appears across all documents.
freq = np.ravel(X.sum(axis=0))

# vocabulary_ maps word -> column index. Ordering the words by that index makes
# vocab[j] the word that belongs to column j of X (and to freq[j]).
vocab = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
# print(vocab[0:50])

# word -> corpus-wide count (noted by the author as the same format nltk returns)
fdist = dict(zip(vocab, freq))

# word2id = dict((v, idx) for idx, v in enumerate(vocab))

def get_top_n_words(corpus, n=None):
    """
    Rank the vocabulary of a text corpus by total number of occurrences.

    A CountVectorizer is fitted on `corpus`, the per-word counts are summed
    over all documents, and the (word, count) pairs are sorted by count in
    descending order. Words with equal counts keep the order in which
    `vocabulary_` yields them (the sort is stable).

    corpus -- iterable of document strings
    n      -- how many leading pairs to return; None returns every pair

    Returns a list of (word, count) tuples, e.g. for
    ["I love Python", "Python is a language programming", "Hello world", "I love the world"]
    the words 'love', 'python' and 'world' each come back with a count of 2,
    ahead of the words that occur once.
    """
    counter = CountVectorizer()
    counter.fit(corpus)
    # Summing over axis 0 collapses the documents, leaving one total per column.
    totals = counter.transform(corpus).sum(axis=0)
    ranked = [(term, totals[0, col]) for term, col in counter.vocabulary_.items()]
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked[:n]


# print(get_top_n_words(data, 25))

# Corpus-wide word frequencies, computed at module level so that `vec`,
# `bag_of_words`, `sum_words` and `words_freq` stay available to later cells.
# The vectorizer is fitted on `data`, the list of documents loaded above.
# The earlier version referenced `corpus`, which only exists as a parameter of
# get_top_n_words, so this cell raised NameError on a fresh kernel.
vec = CountVectorizer().fit(data)
bag_of_words = vec.transform(data)
# Sum over documents: one total per vocabulary column.
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
# Most frequent words first.
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)





  (0, 7027)	1
  (0, 6364)	1
  (0, 3677)	1
  (0, 12238)	1
  (0, 19145)	1
  (0, 17572)	1
  (0, 11366)	1
  (0, 7492)	1
  (0, 1181)	1
  (0, 15496)	1
  (0, 17146)	1
  (0, 9042)	1
  (0, 9321)	1
  (1, 8944)	1
  (1, 13448)	1
  (1, 9799)	1
  (1, 10176)	1
  (1, 14168)	1
  (1, 8478)	1
  (1, 1893)	1
  (1, 11027)	1
  (1, 16245)	1
  (1, 13468)	1
  (1, 15454)	1
  (1, 1670)	1
  :	:
  (49, 1071)	1
  (49, 19581)	1
  (49, 13111)	1
  (49, 13929)	1
  (49, 18365)	1
  (49, 15002)	1
  (49, 12046)	2
  (49, 16101)	1
  (49, 16118)	1
  (49, 6999)	1
  (49, 2763)	1
  (49, 2661)	1
  (49, 152)	1
  (49, 4599)	1
  (49, 15054)	1
  (49, 2893)	1
  (49, 7985)	1
  (49, 14809)	1
  (49, 11084)	1
  (49, 19784)	1
  (49, 6475)	2
  (49, 16697)	1
  (49, 10176)	1
  (49, 7027)	1
  (49, 6364)	1
[('love', 2260), ('time', 1700), ('know', 1529), ('life', 1347), ('miss', 1122), ('family', 1088), ('like', 993), ('loved', 948), ('remember', 858), ('years', 847), ('said', 714), ('good', 706), ('heart', 643), ('home', 590), ('little', 570), 

In [32]:
import csv

# import the necessary modules
import re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

import nltk
nltk.download('words')
nltk.download('wordnet')

# Load the sample texts, one entry per CSV record.
# The reader splits each record on single spaces (with '|' as the quote
# character) and the pieces are re-joined with spaces straight away, so each
# entry is the record's text as one string.
#
# encoding='utf-8-sig' drops a leading byte-order mark if the file has one
# (the recorded output of this cell started with a BOM in front of the first
# quote); newline='' is how the csv module expects its input file to be opened.
with open('lifeweb_sampletext.csv', 'r', encoding='utf-8-sig', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
    # Strip the wrapping double quotes from every entry. The earlier loop
    # (`item = item.strip('"')`) only rebound its loop variable, so the list
    # contents were never actually changed.
    lifeweb_text = [' '.join(row).strip('"') for row in spamreader]

# Show the first entry as a sanity check (skipped when the file is empty).
if lifeweb_text:
    print(lifeweb_text[0])


# Vocabulary of the nltk 'words' corpus, as a set for fast membership tests.
words = set(nltk.corpus.words.words())

# Re-tokenize every entry and keep a token when it is not purely alphabetic
# (punctuation, numbers, ...) or when its lowercase form is in `words`.
# The kept tokens are joined back together with single spaces.
english_text = [
    " ".join(
        tok
        for tok in nltk.wordpunct_tokenize(entry)
        if not tok.isalpha() or tok.lower() in words
    )
    for entry in lifeweb_text
]

# Shared Snowball stemmer for English, used by lemmatize_stemming below.
stemmer = SnowballStemmer('english')


def lemmatize_stemming(text):
    """Lemmatize `text` as a verb with WordNet, then stem the result.

    `text` is passed to WordNetLemmatizer.lemmatize with pos='v'; the lemma is
    then run through the module-level Snowball `stemmer` and returned.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize `text` and drop stopwords and short tokens.

    The text is tokenized with gensim's simple_preprocess; a token is kept
    only if it is longer than 3 characters and is not in gensim's STOPWORDS.
    Returns the kept tokens as a list, in their original order.
    """
    return [
        tok
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in gensim.parsing.preprocessing.STOPWORDS
        # lemmatize_stemming(tok) could be applied here; it is currently disabled
    ]

# Preprocess every filtered entry and keep only the ones that still contain
# at least one token afterwards.
final_text = []
for item in english_text:
    output_content = " ".join(preprocess(item))
    # An empty string means every token was filtered out; skip that entry.
    # (The earlier version used a bare `next` here, which is a no-op
    # expression rather than a loop `continue`.)
    if output_content:
        final_text.append(output_content)

print(final_text)

# Write one preprocessed entry per line. The `with` block closes the file, so
# no explicit close() is needed. The encoding is fixed to UTF-8 so the output
# does not depend on the platform's default encoding.
with open('lemma_lifeweb_text.txt', 'w', encoding='utf-8') as f:
    for item in final_text:
        f.write(item + '\n')


﻿"Thank you so much for sharing this sad news with us.  I understand your ability to see the silver lining, but all those whose lives she touched so profoundly will surely miss her more than we can imagine. Tony, Marney, Julian, Governor, Michael,  our hearts go out to all of you, as we know she was so much more than a mother to you, but a dear friend. Your parents were such an important part of our lives as young newlyweds starting a new life in Alexandria - and here we are, nearly 50 years later, times filled with joy, mirth and sadness as we lose the generation who made our young lives so special through their many acts of kindness.  We will never forget that, ever. Creole and i look forward to catching up with all of you in April to celebrate the life of a lovely person who made the world a better place. Love, Bo"
['thank news understand ability silver lining touched profoundly surely miss imagine tony governor hearts know mother dear friend important young starting life nearly lat

[nltk_data] Downloading package words to /Users/aea/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/aea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
