In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import string
from collections import Counter
from pprint import pprint

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 

In [24]:
children = pd.read_csv('children_stories.Csv', encoding= 'unicode_escape')
children

Unnamed: 0,names,cats,desc
0,HIDE AND SEEK,Age 2-9,Was it just another game of hide and seek? No....
1,GINGER THE GIRAFFE,Age 2-9,Read this warm tale of camaraderie and affecti...
2,DOING MY CHORES,Age 2-9,Love shines through this great illustrated kid...
3,ABE THE SERVICE DOG,Age 2-9,Abe was a real Service Dog who dedicated his l...
4,SUNNY MEADOWS WOODLAND SCHOOL,Age 2-9,The class took a little train and went deep in...
...,...,...,...
425,Carrying the Elephant: A Memoir of Love and Loss,Age 11+,In the 72 prose poems that make up this unusua...
426,War and Peas,Age 8+,Nearly forty years after its original appearan...
427,Love that Dog,Age 9-12,"Jack has a great sadness in his life, but he i..."
428,A Pilgrim's Progress,Age 9+,'I had a dream last night ... large enough to ...


In [25]:
children.duplicated().sum()

8

# Stopwords and Stemming

In [26]:
stopWords = set(stopwords.words('english'))
children['desc'] = children['desc'].apply(lambda x: " ".join([item for item in (x.lower()).split(" ") if item not in stopWords]))
print(children['desc'])

0      another game hide seek? no. not. first fell de...
1      read warm tale camaraderie affection set wild ...
2      love shines great illustrated kidsâ book . r...
3      abe real service dog dedicated life assisting ...
4      class took little train went deep woods first ...
                             ...                        
425    72 prose poems make unusual moving collection,...
426    nearly forty years original appearance, shamef...
427    jack great sadness life, share feelings anyone...
428    'i dream last night ... large enough fill rest...
429    new school, rafaella suffers daily name-callin...
Name: desc, Length: 430, dtype: object


In [27]:
pstemmer = PorterStemmer()
children['desc'] = children['desc'].apply(lambda x: pstemmer.stem(x))
print(children['desc'])

0      another game hide seek? no. not. first fell de...
1      read warm tale camaraderie affection set wild ...
2      love shines great illustrated kidsâ book . r...
3      abe real service dog dedicated life assisting ...
4      class took little train went deep woods first ...
                             ...                        
425    72 prose poems make unusual moving collection,...
426    nearly forty years original appearance, shamef...
427    jack great sadness life, share feelings anyone...
428    'i dream last night ... large enough fill rest...
429    new school, rafaella suffers daily name-callin...
Name: desc, Length: 430, dtype: object


# Bag of words and tfIdf

In [28]:
wordsbag = CountVectorizer()
Xwordsbag = wordsbag.fit_transform(children['desc'])
print("Bag of Words:", Xwordsbag.shape)

Bag of Words: (430, 6920)


In [29]:
vectorizer = TfidfVectorizer()
XtfIdf = vectorizer.fit_transform(children['desc'])
print("Words Vectorizer:", XtfIdf.shape)

Words Vectorizer: (430, 6920)


# Tokenization

In [30]:
text = children['desc'][0]
text

'another game hide seek? no. not. first fell deep, dark hole ground found treasure. end there? no! not. read thrilling adventure sally friends free illustrated kidsâ\x80\x99 book. fun never ends sallyâ\x80\x99s around! '

In [31]:
ListWords = []
for word in text.split():
    while word[0] in string.punctuation:  word = word[1:]
    while word[-1] in string.punctuation: word = word[:-1]
    ListWords.append(word.lower())        

In [32]:
dictionaryWords = {}
vocabulary_size = 0

for word in ListWords:
    if word not in dictionaryWords:
        dictionaryWords[word] = vocabulary_size
        vocabulary_size += 1
        
pprint(dictionaryWords)

{'adventure': 18,
 'another': 0,
 'around': 29,
 'book': 24,
 'dark': 9,
 'deep': 8,
 'end': 14,
 'ends': 27,
 'fell': 7,
 'first': 6,
 'found': 12,
 'free': 21,
 'friends': 20,
 'fun': 25,
 'game': 1,
 'ground': 11,
 'hide': 2,
 'hole': 10,
 'illustrated': 22,
 'kidsâ\x80\x99': 23,
 'never': 26,
 'no': 4,
 'not': 5,
 'read': 16,
 'sally': 19,
 'sallyâ\x80\x99s': 28,
 'seek': 3,
 'there': 15,
 'thrilling': 17,
 'treasure': 13}


# One-hot encoding

In [33]:
def one_hot(word, dictionaryWords):
    vector = np.zeros(len(dictionaryWords))
    vector[dictionaryWords[word]] = 1
    return vector
print(one_hot("adventure", dictionaryWords))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]


# Bag of words

In [20]:
vectorText = np.zeros(vocabulary_size)

for word in ListWords: vectorText[dictionaryWords[word]] += 1
    
vectorText[dictionaryWords["another"]]

word_counts = Counter(ListWords)
pprint(word_counts)

Counter({'no': 2,
         'not': 2,
         'another': 1,
         'game': 1,
         'hide': 1,
         'seek': 1,
         'first': 1,
         'fell': 1,
         'deep': 1,
         'dark': 1,
         'hole': 1,
         'ground': 1,
         'found': 1,
         'treasure': 1,
         'end': 1,
         'there': 1,
         'read': 1,
         'thrilling': 1,
         'adventure': 1,
         'sally': 1,
         'friends': 1,
         'free': 1,
         'illustrated': 1,
         'kidsâ\x80\x99': 1,
         'book': 1,
         'fun': 1,
         'never': 1,
         'ends': 1,
         'sallyâ\x80\x99s': 1,
         'around': 1})


In [34]:
items = list(word_counts.items())
dictionaryWords2 = dict([[items[i][0], i] for i in range(len(items))])
pprint(dictionaryWords2)

{'adventure': 18,
 'another': 0,
 'around': 29,
 'book': 24,
 'dark': 9,
 'deep': 8,
 'end': 14,
 'ends': 27,
 'fell': 7,
 'first': 6,
 'found': 12,
 'free': 21,
 'friends': 20,
 'fun': 25,
 'game': 1,
 'ground': 11,
 'hide': 2,
 'hole': 10,
 'illustrated': 22,
 'kidsâ\x80\x99': 23,
 'never': 26,
 'no': 4,
 'not': 5,
 'read': 16,
 'sally': 19,
 'sallyâ\x80\x99s': 28,
 'seek': 3,
 'there': 15,
 'thrilling': 17,
 'treasure': 13}
