## Analyzing a corpus of books using natural language processing (NLP)

### Books about Istanbul are scraped and is then further processed to get sentences based on the object list

In [2]:
import pprint
import nltk.corpus
import random
import os
import re
import gensim

In [3]:
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import Word2Vec as w2v
from sklearn.decomposition import PCA

In [4]:
from gensim import corpora

In [5]:
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [6]:
folder_path = "/content/drive/MyDrive/Istanbul_Books"

In [7]:
def break_sentences(text):
    # replace newline characters with spaces
    text = re.sub(r'\n', ' ', text)

    # split the text into sentences based on .?! characters
    sentences = re.split(r'[.?!]+', text)

    # remove empty strings and leading/trailing spaces from sentences
    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

In [8]:
def clean_string(s):
    # remove any sequence of more than two spaces with a single space
    s = re.sub(r' {2,}', ' ', s)

    return s.strip()

In [9]:
collection = []
read_books = []

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        # read the file contents
        with open(os.path.join(folder_path, filename), 'r', encoding="utf-8") as file:
            text = file.read()
        
        # break the text into sentences
        sentences = break_sentences(text)
        
        # add the sentences to the collection list
        collection += sentences
        
        # add the file name to the read_books list
        read_books.append(filename)
        
# print the names of the files that were read
print(f"Read {len(read_books)} files:")

# print the sentences
for i, sentence in enumerate(collection):
    print(f"Index {i}: {sentence}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Index 234739: ” That was how we’d put it when we were young, and blacked out at parties from too much drink, referring to those maddening interruptions at the cinema, when the projectionist’s life was in danger
Index 234740: How I left the bathroom, how I regained my seat, with what excuse Çetin had come upstairs and coaxed me through the door, of these things I have no recollection
Index 234741: There was also a silence at the table; I so remember that, but whether it was owing to the rain having eased up, or to my embarrassment, which could no longer be hidden or ignored, or simply to the defeat that was fast destroying me, with the pain that had become tangible-this I cannot say
Index 234742: Far from being unnerved by the silence, the son-in-law was enthusing about the film business-perhaps I’d actually said my reel had snapped and he’d taken his cue from this-with a mixture of love and loathing, saying how bad Turkis

In [10]:
final_collection = []

for sentence in collection:
    cleaned_sentence = clean_string(sentence)
    final_collection.append(cleaned_sentence)

for i, sentence in enumerate(final_collection):
    print(f"Index {i}: {sentence}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Index 234739: ” That was how we’d put it when we were young, and blacked out at parties from too much drink, referring to those maddening interruptions at the cinema, when the projectionist’s life was in danger
Index 234740: How I left the bathroom, how I regained my seat, with what excuse Çetin had come upstairs and coaxed me through the door, of these things I have no recollection
Index 234741: There was also a silence at the table; I so remember that, but whether it was owing to the rain having eased up, or to my embarrassment, which could no longer be hidden or ignored, or simply to the defeat that was fast destroying me, with the pain that had become tangible-this I cannot say
Index 234742: Far from being unnerved by the silence, the son-in-law was enthusing about the film business-perhaps I’d actually said my reel had snapped and he’d taken his cue from this-with a mixture of love and loathing, saying how bad Turkis

In [11]:
len(final_collection)

239739

In [12]:
stoplist = set("and the or but nor yet so been now will are were would should did dont ever with was had have has".split(' '))

texts = [[word.replace(".","").replace(",","").replace("'","").replace(":","") for word in document.lower().split()] 
         for document in final_collection]

texts = [[word for word in text if (word not in stoplist and len(word)>2)] 
         for text in texts]

to_delete = []
for i in range(len(texts)):
    t = texts[i]
    test = [w for w in t if w.isalpha()]
    if len(test) < 5:
        to_delete.append(i)
    else:
        texts[i] = test

for i in sorted(to_delete, reverse = True):
    del texts[i]
    del final_collection[i]
        
# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  'house',
  'attic',
  'domain',
  'mice',
  'spiders',
  'cockroaches',
  'dark',
  'mildewy',
  'home',
  'water',
  'become',
  'clean',
  'bright',
  'room',
  'open',
  'stars',
  'skylight'],
 ['wanted',
  'sleep',
  'surrounded',
  'all',
  'things',
  'that',
  'reminded',
  'füsun',
  'made',
  'feel',
  'her',
  'presence',
  'that',
  'spring',
  'evening',
  'used',
  'key',
  'new',
  'door',
  'dalgıç',
  'street',
  'enter',
  'house',
  'that',
  'metamorphosed',
  'into',
  'museum',
  'like',
  'ghost',
  'climbed',
  'long',
  'straight',
  'staircase',
  'throwing',
  'myself',
  'upon',
  'bed',
  'attic',
  'fell',
  'asleep'],
 ['some',
  'fill',
  'their',
  'dwellings',
  'objects',
  'time',
  'their',
  'lives',
  'coming',
  'end',
  'turn',
  'their',
  'houses',
  'into',
  'museums'],
 ['having',
  'turned',
  'another',
  'house',
  'into',
  'museum',
  'presence',
  'bed',
  'room',
  'v

In [13]:
len(processed_corpus)

166551

In [14]:
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary<43345 unique tokens: ['architecture', 'art', 'certain', 'city', 'culture']...>


In [15]:
print(dictionary.token2id)



In [16]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [17]:
from gensim import models

tfidf = models.TfidfModel(bow_corpus)

In [18]:
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile

In [19]:
index_tmpfile = get_tmpfile("index")

index = Similarity(index_tmpfile, bow_corpus, num_features = len(dictionary))

In [20]:
query_document = "balcony".lower().split()
query_bow = dictionary.doc2bow(query_document)

In [21]:
sims = index[query_bow]

In [22]:
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
161530 0.0
161531 0.0
161532 0.0
161533 0.0
161534 0.0
161535 0.0
161536 0.0
161537 0.0
161538 0.0
161539 0.0
161540 0.0
161541 0.0
161542 0.0
161543 0.0
161544 0.0
161545 0.0
161546 0.0
161547 0.0
161548 0.0
161549 0.0
161550 0.0
161551 0.0
161552 0.0
161553 0.0
161554 0.0
161555 0.0
161556 0.0
161557 0.0
161558 0.0
161559 0.0
161560 0.0
161561 0.0
161562 0.0
161563 0.0
161564 0.0
161565 0.0
161566 0.0
161567 0.0
161568 0.0
161569 0.0
161570 0.0
161571 0.0
161572 0.0
161573 0.0
161574 0.0
161575 0.0
161576 0.0
161577 0.0
161578 0.0
161579 0.0
161580 0.0
161581 0.0
161582 0.0
161583 0.0
161584 0.0
161585 0.0
161586 0.0
161587 0.0
161588 0.0
161589 0.0
161590 0.0
161591 0.0
161592 0.0
161593 0.0
161594 0.0
161595 0.0
161596 0.0
161597 0.0
161598 0.0
161599 0.0
161600 0.0
161601 0.0
161602 0.0
161603 0.0
161604 0.0
161605 0.0
161606 0.0
161607 0.0
161608 0.0
161609 0.0
161610 0.0
161611 0.0
161612 0.0
161613 0.0
161614 0.0


In [23]:
top_scores = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:10]

In [24]:
top_sentences = [final_collection[idx] for idx, score in top_scores]

# combine the top paragraphs and their corresponding books into a single string
output = ""
for i in range(len(top_sentences)):
    output += f"Sentence {i+1}: {top_sentences[i]}\n"
        
# write the output to a file named "top_paragraphs.txt"
with open("./balcony.txt", "w", encoding="utf-8") as f:
    f.write(output)

print(output)

Sentence 1: Nalan leaned over the balcony railing
Sentence 2: ‘You can’t stand on the balcony and throw things at strangers
Sentence 3: Women often use a screened-off area or a balcony
Sentence 4: The views from its upper-storey balcony are stunning
Sentence 5: “Come out here,” Joy calls from the balcony
Sentence 6: Macit went out onto the balcony
Sentence 7: No, they did not swing from the balcony
Sentence 8: No, they did not hang off the balcony
Sentence 9: Flat Number 7: Me Ants raided my balcony today – or perhaps it was just today that I noticed ants had raided my balcony
Sentence 10: The dried aubergines were hanging from the kitchen balcony

