Problem: Text preprocessing and analysis     
Steps:    


1.   Uploading txt file and reading it into a list
2.   Text Processing      
    Removed Blank lines           
    Took only text and Converted it to lower case         
    Removed leading spaces and new line character               
    Splitting the sentence in words         
    Applied porter stemmer        
    Removed stopwords            
    Joined words into sentences and made cleaned corpus     
3. Tokenize sentences into words
4. Word Embedding: Using BOW model
    1. Countvectorizer : Made vector representation of a corpus
    2. word2vec: Generated word Embedding of a corpus


In [1]:
from google.colab import files
uploaded = files.upload()

Saving got1.txt to got1.txt


In [2]:
f = open("got1.txt","r+")
lines = [line for line in f.readlines()]
f.close()

In [3]:
lines

['This edition contains the complete text of the original hardcover edition.\n',
 '\n',
 'NOT ONE WORD HAS BEEN OMITTED.\n',
 '\n',
 'A CLASH OF KINGS\n',
 '\n',
 'A Bantam Spectra Book\n',
 '\n',
 'PUBLISHING HISTORY\n',
 '\n',
 'Bantam Spectra hardcover edition published February 1999\n',
 '\n',
 'Bantam Spectra paperback edition / September 2000\n',
 '\n',
 'SPECTRA and the portrayal of a boxed “s” are trademarks of Bantam Books, a division of Random House, Inc.\n',
 '\n',
 'All rights reserved.\n',
 '\n',
 'Copyright © 1999 by George R. R. Martin.\n',
 '\n',
 'Maps by James Sinclair.\n',
 '\n',
 'Heraldic crest by Virginia Norey.\n',
 '\n',
 'Library of Congress Catalog Card Number: 98-37954.\n',
 '\n',
 'No part of this book may be reproduced or transmitted in any form or by any means, electronic or mechanical, including photocopying, recording, or by any information storage and retrieval system, without permission in writing from the publisher.\n',
 '\n',
 'Visit our website at w

In [4]:
import re   
import nltk  
  
nltk.download('stopwords') 
  
from nltk.corpus import stopwords  
from nltk.stem.porter import PorterStemmer 
 
 
corpus = []

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
for i in range(0, len(lines)):  
    if lines[i] != "\n":

      # column : "text", row ith 
      text = re.sub('[^a-zA-Z]', ' ', lines[i])  
      
      # Remove the leading spaces and newline character
      text = text.strip() 

      # convert all cases to lower cases 
      text = text.lower()  
        
      # split to array(default delimiter is " ") 
      text = text.split()  
        
      # creating PorterStemmer object to 
      # take main stem of each word 
      ps = PorterStemmer()  
        
      # loop for stemming each word 
      # in string array at ith row     
      text = [ps.stem(word) for word in text 
                  if not word in set(stopwords.words('english'))]  
                    
      # rejoin all string array elements 
      # to create back into a string 
      text = ' '.join(text)   
        
      # append each string to create 
      # array of clean text  
      corpus.append(text)  
    

In [6]:
corpus

['edit contain complet text origin hardcov edit',
 'one word omit',
 'clash king',
 'bantam spectra book',
 'publish histori',
 'bantam spectra hardcov edit publish februari',
 'bantam spectra paperback edit septemb',
 'spectra portray box trademark bantam book divis random hous inc',
 'right reserv',
 'copyright georg r r martin',
 'map jame sinclair',
 'herald crest virginia norey',
 'librari congress catalog card number',
 'part book may reproduc transmit form mean electron mechan includ photocopi record inform storag retriev system without permiss write publish',
 'visit websit www bantamdel com',
 'bantam book rooster colophon spectra portray box regist trademark random hous inc',
 'eisbn',
 'v r',
 'content',
 'cover',
 'titl page',
 'copyright',
 'dedic',
 'map',
 'prologu',
 'arya',
 'sansa',
 'tyrion',
 'bran',
 'arya',
 'jon',
 'catelyn',
 'tyrion',
 'arya',
 'davo',
 'theon',
 'daeneri',
 'jon',
 'arya',
 'tyrion',
 'bran',
 'tyrion',
 'sansa',
 'arya',
 'tyrion',
 'bran',
 

In [7]:
# Creating the Bag of Words model 
from sklearn.feature_extraction.text import CountVectorizer 
  
# To extract max 150 feature. 
# "max_features" is attribute to 
# experiment with to get better results 
cv = CountVectorizer(max_features = 150)  
  
# txt contains corpus 
txt = cv.fit_transform(corpus).toarray()  
txt  


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
pip install -U gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/44/52/f1417772965652d4ca6f901515debcd9d6c5430969e8c02ee7737e6de61c/gensim-4.0.1-cp37-cp37m-manylinux1_x86_64.whl (23.9MB)
[K     |████████████████████████████████| 23.9MB 1.9MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.1


In [9]:
pip install -U sklearn

Requirement already up-to-date: sklearn in /usr/local/lib/python3.7/dist-packages (0.0)


In [13]:
# import the existing word and sentence tokenizing  
# libraries 
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
#Defining training data
sentences = []
for i in range(0,len(corpus)):
  sentences.append(word_tokenize(corpus[i]))
sentences

[['edit', 'contain', 'complet', 'text', 'origin', 'hardcov', 'edit'],
 ['one', 'word', 'omit'],
 ['clash', 'king'],
 ['bantam', 'spectra', 'book'],
 ['publish', 'histori'],
 ['bantam', 'spectra', 'hardcov', 'edit', 'publish', 'februari'],
 ['bantam', 'spectra', 'paperback', 'edit', 'septemb'],
 ['spectra',
  'portray',
  'box',
  'trademark',
  'bantam',
  'book',
  'divis',
  'random',
  'hous',
  'inc'],
 ['right', 'reserv'],
 ['copyright', 'georg', 'r', 'r', 'martin'],
 ['map', 'jame', 'sinclair'],
 ['herald', 'crest', 'virginia', 'norey'],
 ['librari', 'congress', 'catalog', 'card', 'number'],
 ['part',
  'book',
  'may',
  'reproduc',
  'transmit',
  'form',
  'mean',
  'electron',
  'mechan',
  'includ',
  'photocopi',
  'record',
  'inform',
  'storag',
  'retriev',
  'system',
  'without',
  'permiss',
  'write',
  'publish'],
 ['visit', 'websit', 'www', 'bantamdel', 'com'],
 ['bantam',
  'book',
  'rooster',
  'colophon',
  'spectra',
  'portray',
  'box',
  'regist',
  'trade

In [22]:
from gensim.models import Word2Vec
# train model
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.index_to_key)
print(words)
# access vector for one word
print(model.wv['king'])
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec(vocab=8233, vector_size=100, alpha=0.025)
['lord', 'said', 'would', 'one', 'ser', 'could', 'man', 'king', 'like', 'men', 'hand', 'tyrion', 'back', 'well', 'look', 'see', 'brother', 'come', 'never', 'even', 'eye', 'know', 'black', 'thought', 'father', 'time', 'want', 'jon', 'old', 'made', 'arya', 'face', 'ladi', 'boy', 'told', 'head', 'long', 'wall', 'say', 'way', 'bran', 'go', 'theon', 'make', 'call', 'son', 'must', 'take', 'stanni', 'red', 'us', 'day', 'god', 'night', 'might', 'great', 'hors', 'came', 'still', 'turn', 'maester', 'good', 'sword', 'knight', 'seem', 'ask', 'saw', 'stark', 'dead', 'two', 'away', 'littl', 'need', 'tell', 'yet', 'took', 'sansa', 'name', 'lannist', 'think', 'command', 'girl', 'catelyn', 'though', 'renli', 'watch', 'three', 'fire', 'cloak', 'keep', 'castl', 'joffrey', 'water', 'half', 'queen', 'blood', 'die', 'enough', 'kill', 'knew', 'heard', 'stone', 'left', 'gave', 'white', 'end', 'much', 'went', 'around', 'arm', 'first', 'everi', 'across', 'let',