In [15]:
from collections import Counter
import os
import re
import pandas as pd
from ipywidgets import IntProgress
from IPython.display import display
# Stanford NLP library 
# https://stanfordnlp.github.io/stanfordnlp/installation_usage.html
import stanfordnlp
# Build the pipeline once; get_lemma and get_part_of_speech further down both call this shared `nlp` object.
# Processors requested: tokenize, mwt (multi-word token expansion), pos (tagging) and lemma (lemmatization).
nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma')

## Data preparation

A function that extracts only the words (lower-cased runs of ASCII letters) from the text of a joined file.

In [143]:
def words(text):
    """Return every run of ASCII letters found in ``text``, lower-cased.

    The text is lower-cased first, then split into maximal letter runs;
    digits, punctuation, whitespace and non-ASCII characters act as
    separators and are discarded. The result is a list of strings in
    order of appearance (empty when ``text`` contains no ASCII letters).
    """
    lowered = text.lower()
    letter_runs = re.compile(r'[a-zA-Z]+')
    return letter_runs.findall(lowered)

Build a dictionary of `Counter` objects, one per joined file: each `Counter` maps a word to the total number of times it occurs in that file.

In [144]:
# Directory holding one joined text file per century.
# NOTE(review): this is an absolute, machine-specific path; it has to be adjusted to run elsewhere.
base_path = r"C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files"

# Maps each file name found in base_path to a Counter of the words in that file.
counter_dict = {}
for century in os.listdir(base_path):
    # os.path.join replaces the former "\{}" literal, whose "\{" is an invalid
    # escape sequence (it only worked because Python keeps unknown escapes as-is).
    century_path = os.path.join(base_path, century)
    # The context manager closes the file handle as soon as the text is read.
    with open(century_path, encoding='utf-8') as joined_file:
        # pl_books stays bound to the most recently built Counter, as before,
        # in case cells outside this view still read it.
        pl_books = Counter(words(joined_file.read()))
    counter_dict[century] = pl_books

C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files\17th_joined_file.txt
C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files\18th_joined_file.txt
C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files\19th_joined_file.txt
C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files\20th_joined_file.txt


## Defining functions for the analysis of century-specific files

In [145]:
# Pick the word-count Counter built from the 17th-century joined file; the dataframe cells that follow are built from it.
century_text = counter_dict["17th_joined_file.txt"]

In [147]:
## Creating dataframe out of dictionary 

In [148]:
# Materialise the Counter as a list of (word, count) tuples ...
data = list(century_text.items())

# ... and load those pairs into a two-column frame.
df = pd.DataFrame(
    data,
    columns=["Word", "Number of occurences"],
)

In [86]:
## Sorting df

In [102]:
# Order rows from most to least frequent, then renumber them from 0.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an "index" column and dropping that column afterwards.
df = (
    df.sort_values(by="Number of occurences", ascending=False)
      .reset_index(drop=True)
)

In [88]:
## Extracting stopwords to simplify analysis. It is unnecessary to perform later actions like POS-tagging or sentiment analysis on stopwords

In [89]:
# import nltk
# ## First you have to download stopwords with the code commented below
# # nltk.download("stopwords")
# from nltk.corpus import stopwords
# stopwords = set(stopwords.words('english'))

In [90]:
# def find_stopwords(text, stopwords):
#     if text in stopwords:
#         return 1
#     else:
#         return 0

In [91]:
# df["Stopwords"] = df["Word"].apply(lambda text:
#                                      find_stopwords(text, stopwords))

## Lemmatization

We introduce lemmatization to reduce the number of distinct words handled in later steps such as POS tagging or sentiment analysis. Different inflected forms of the same word do not add any interesting information for our study, so it is better to cut down this unnecessary diversity.

In [113]:
# f is for progress bar
def get_lemma(text, f):
    """Return the lemma of the first word the pipeline finds in ``text``.

    Parameters
    ----------
    text
        Input handed to the module-level ``nlp`` pipeline.
    f
        Progress indicator; its ``value`` attribute is increased by one on
        every call, before the pipeline runs.

    Returns
    -------
    The ``lemma`` attribute of the first word of the first sentence that
    contains any words, or ``None`` when the parsed document has no words.
    """
    f.value += 1
    parsed = nlp(text)
    # Flatten sentences -> words and stop at the very first word.
    first_word = next(
        (token for sentence in parsed.sentences for token in sentence.words),
        None,
    )
    if first_word is None:
        return None
    return first_word.lemma
        

In [114]:
# Widget progress bar with one tick per row of df; get_lemma advances it on every call.
f = IntProgress(min=0, max=len(df))
display(f)

# Lemmatize each entry of the "Word" column; the bar is forwarded to
# get_lemma as its second positional argument.
df["Lemma"] = df["Word"].apply(get_lemma, args=(f,))

IntProgress(value=0, max=35192)

## Creating lemmatized dataframe

After lemmatization we can group our dataframe so that further analysis focuses only on the lemmatized versions of the words.

In [158]:
# Display the current frame (the notebook renders a cell's bare last expression).
# NOTE(review): the saved output of this cell lists columns `Unnamed: 0`, `Stopwords` and `Word2`
# that no visible cell creates — presumably the frame was reloaded from a saved CSV in another
# session; confirm before relying on this output.
df

Unnamed: 0,Word,Number of occurences,Stopwords,Lemma,Word2
0,the,47214,1,the,the
1,and,35721,1,and,and
2,of,28815,1,of,of
3,to,28572,1,to,to
4,in,16513,1,in,in
5,i,16106,1,i,i
6,that,15865,1,that,that
7,a,15799,1,a,a
8,it,11123,1,it,it
9,is,10921,1,be,is


In [92]:
## Getting parts of speech

In [44]:
def get_part_of_speech(text):
    """Return the universal POS tag of the first word the pipeline finds in ``text``.

    ``text`` is handed to the module-level ``nlp`` pipeline. The result is
    the ``upos`` attribute of the first word of the first sentence that
    contains any words, or ``None`` when the parsed document has no words.
    """
    parsed = nlp(text)
    # Flatten sentences -> words and stop at the very first word.
    first_word = next(
        (token for sentence in parsed.sentences for token in sentence.words),
        None,
    )
    if first_word is None:
        return None
    return first_word.upos