In [15]:
from collections import Counter
import os
import re
import pandas as pd
from ipywidgets import IntProgress
from IPython.display import display
# Stanford NLP library 
# https://stanfordnlp.github.io/stanfordnlp/installation_usage.html
import stanfordnlp
# Shared pipeline instance: the lemmatization and POS-tagging helpers below call it as nlp(text).
nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma')

## Data preparation

A function that extracts only the words (lowercased runs of ASCII letters) from the joined files.

In [143]:
def words(text):
    """Return every run of ASCII letters found in ``text``, lowercased, in order of appearance."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'[a-zA-Z]+', lowered)]

Create a dictionary of `Counter` objects: for each joined file, it stores how many times every word occurs in that file.

In [144]:
base_path = r"C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files"
# Maps each joined file's name to a Counter of the words that file contains.
counter_dict = {}
for century in os.listdir(base_path):
    # os.path.join replaces the hand-written "\{}" separator, which relied on an
    # invalid escape sequence.
    century_path = os.path.join(base_path, century)
    # The context manager closes the file handle as soon as the text has been read.
    with open(century_path, encoding='utf-8') as joined_file:
        counter_dict[century] = Counter(words(joined_file.read()))

C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files\17th_joined_file.txt
C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files\18th_joined_file.txt
C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files\19th_joined_file.txt
C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files\20th_joined_file.txt


## Defining functions for analysis of century specific files

In [145]:
# Work on one file first: the word counts stored under the 17th-century joined file's name.
century_text = counter_dict["17th_joined_file.txt"]

In [147]:
## Creating dataframe out of dictionary 

In [148]:
# Materialise the (word, count) pairs and load them into a two-column frame.
data = list(century_text.items())

df = pd.DataFrame(data, columns=["Word", "Number of occurences"])

In [86]:
## Sorting df

In [102]:
# Most frequent words first, renumbered 0..n-1 (the pre-sort index is discarded).
df = (
    df.sort_values(by="Number of occurences", ascending=False)
    .reset_index(drop=True)
)

## Lemmatization

We introduce lemmatization to reduce the number of distinct words for later steps such as POS-tagging and sentiment analysis. Different inflected forms of the same word do not add any interesting information for our study, so it is better to cut down this unnecessary diversity.

In [113]:
# f is for progress bar
def get_lemma(text, f):
    """Tick the progress bar ``f`` once and return the lemma of the first word in ``text``.

    ``text`` is run through the module-level ``nlp`` pipeline; only the first word of
    the first non-empty sentence is used. Returns None when no word is found.
    """
    f.value += 1
    parsed = nlp(text)
    lemmas = (token.lemma for sentence in parsed.sentences for token in sentence.words)
    return next(lemmas, None)
        

In [114]:
# Progress bar with one tick per row of df; get_lemma advances it on every call.
f = IntProgress(min=0, max=len(df))
display(f)


# args=(f,) hands the bar to get_lemma as its second positional argument.
df["Lemma"] = df["Word"].apply(get_lemma, args=(f,))

IntProgress(value=0, max=35192)

## Creating lemmatized dataframe

After lemmatization we can group the dataframe by lemma, so that further analysis focuses only on the lemmatized versions of the words.

In [169]:
# Sum only the occurrence counts per lemma. Selecting the column explicitly keeps the
# string "Word" column out of the aggregation: newer pandas versions (>= 2.0) no longer
# drop non-numeric columns here and would concatenate the word strings instead.
# The result stays indexed by "Lemma" and remains a DataFrame (double brackets).
df_lemmatized = df.groupby("Lemma")[["Number of occurences"]].sum()

In [170]:
# Most frequent lemmas first; reset_index turns the current index back into a regular column.
df_lemmatized = (
    df_lemmatized
    .sort_values(by="Number of occurences", ascending=False)
    .reset_index()
)

Now that our dataset has been reduced, we can carry out further steps (e.g. POS-tagging and sentiment analysis).

## POS-Tagging

In [173]:
# f is for progress bar
def get_part_of_speech(text, f):
    """Tick the progress bar ``f`` once and return the ``upos`` tag of the first word in ``text``.

    ``text`` is run through the module-level ``nlp`` pipeline; only the first word of
    the first non-empty sentence is used. Returns None when no word is found.
    """
    f.value += 1
    parsed = nlp(text)
    tags = (token.upos for sentence in parsed.sentences for token in sentence.words)
    return next(tags, None)

In [176]:
# Progress bar with one tick per lemma; get_part_of_speech advances it on every call.
f = IntProgress(min=0, max=len(df_lemmatized))
display(f)
# args=(f,) hands the bar to get_part_of_speech as its second positional argument.
df_lemmatized["Part_of_speech"] = df_lemmatized["Lemma"].apply(get_part_of_speech, args=(f,))

IntProgress(value=0, max=27327)

In [178]:
## Sentiment Analysis

In [179]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [182]:
# Analyzer instance used below to score each lemma via sid.polarity_scores.
sid = SentimentIntensityAnalyzer()

In [200]:
# One dict of scores per lemma, in row order; the frame gets one column per dict key.
sentiment_score = pd.DataFrame(
    [sid.polarity_scores(lemma) for lemma in df_lemmatized["Lemma"]]
)

In [203]:
# Index-aligned join: sentiment_score was built row by row from df_lemmatized["Lemma"],
# so row i of the scores belongs to row i of df_lemmatized.
# NOTE(review): presumably not idempotent — re-running the cell would hit overlapping score columns; confirm.
df_lemmatized = df_lemmatized.join(sentiment_score)

# The operations above, generalized to run on all century files

In [211]:
from collections import Counter
import os
import re
import pandas as pd
from ipywidgets import IntProgress
from IPython.display import display
# Stanford NLP library 
# https://stanfordnlp.github.io/stanfordnlp/installation_usage.html
import stanfordnlp
# Shared pipeline instance used by get_lemma and get_part_of_speech (called as nlp(text));
# constructing it prints the processor-loading log shown in this cell's output.
nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\grzeg\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': 'C:\\Users\\grzeg\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tagger.pt', 'pretrain_path': 'C:\\Users\\grzeg\\stanfordnlp_resources\\en_ewt_models\\en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': 'C:\\Users\\grzeg\\stanfordnlp_resources\\en_ewt_models\\en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
Done loading processors!
---


In [212]:
# Read data 

def words(text):
    """Lowercase ``text`` and return its runs of ASCII letters as a list, in order."""
    letter_runs = re.compile(r'[a-zA-Z]+')
    return letter_runs.findall(text.lower())

base_path = r"C:\Users\grzeg\Desktop\studia\Data Science\2 rok\semestr 1\Advanced_VisualisationR\projekt\Adv_Vis_R_Project\Joined_files"
# Maps each joined file's name to a Counter of the words that file contains.
counter_dict = {}
for century in os.listdir(base_path):
    # os.path.join replaces the hand-written "\{}" separator, which relied on an
    # invalid escape sequence.
    century_path = os.path.join(base_path, century)
    # The context manager closes the file handle as soon as the text has been read.
    with open(century_path, encoding='utf-8') as joined_file:
        counter_dict[century] = Counter(words(joined_file.read()))

In [213]:
# Lemmatization

#### f is for progress bar
def get_lemma(text, f):
    """Tick the progress bar ``f`` once and return the lemma of the first word in ``text``.

    ``text`` is run through the module-level ``nlp`` pipeline; only the first word of
    the first non-empty sentence is used. Returns None when no word is found.
    """
    f.value += 1
    parsed = nlp(text)
    lemmas = (token.lemma for sentence in parsed.sentences for token in sentence.words)
    return next(lemmas, None)

# Part Of Speech tagging

#### f is for progress bar
def get_part_of_speech(text, f):
    """Tick the progress bar ``f`` once and return the ``upos`` tag of the first word in ``text``.

    ``text`` is run through the module-level ``nlp`` pipeline; only the first word of
    the first non-empty sentence is used. Returns None when no word is found.
    """
    f.value += 1
    parsed = nlp(text)
    tags = (token.upos for sentence in parsed.sentences for token in sentence.words)
    return next(tags, None)


In [214]:
def prepare_dataset(key):
    """Build the lemmatized word table for one joined file and save it as an Excel file.

    Pipeline: word counts -> lemma per word -> counts summed per lemma ->
    part-of-speech tag and sentiment scores per lemma -> ``<key without extension>.xlsx``
    written to the current working directory.

    Parameters
    ----------
    key : str
        A key of ``counter_dict`` (a joined-file name such as "17th_joined_file.txt").

    Raises
    ------
    KeyError
        If ``key`` is not present in ``counter_dict``.
    """
    print(key)
    century_text = counter_dict[key]

    ## Creating dataframe out of counter dictionary, most frequent words first
    df = (
        pd.DataFrame(list(century_text.items()), columns=["Word", "Number of occurences"])
        .sort_values(by="Number of occurences", ascending=False)
        .reset_index(drop=True)
    )

    ## Lemmatization (get_lemma advances the bar once per word)
    lemma_bar = IntProgress(min=0, max=len(df))
    display(lemma_bar)
    df["Lemma"] = df["Word"].apply(lambda text: get_lemma(text, lemma_bar))

    ## Creating Lemmatized Dataset
    # Only the count column is summed. Aggregating every column would also hit the
    # string "Word" column, which newer pandas versions (>= 2.0) concatenate instead
    # of dropping, so the exported sheet would carry a column of glued-together words.
    df_lemmatized = (
        df.groupby("Lemma")[["Number of occurences"]]
        .sum()
        .sort_values(by="Number of occurences", ascending=False)
        .reset_index()
    )

    ## POS-tagging (get_part_of_speech advances the bar once per lemma)
    pos_bar = IntProgress(min=0, max=len(df_lemmatized))
    display(pos_bar)
    df_lemmatized["Part_of_speech"] = df_lemmatized["Lemma"].apply(
        lambda text: get_part_of_speech(text, pos_bar)
    )

    ## Sentiment Analysis
    sid = SentimentIntensityAnalyzer()
    # One dict of scores per lemma; the frame shares df_lemmatized's 0..n-1 index,
    # so the join below lines the scores up row by row.
    sentiment_score = pd.DataFrame(
        list(df_lemmatized["Lemma"].apply(lambda text: sid.polarity_scores(text)))
    )
    df_lemmatized = df_lemmatized.join(sentiment_score)

    ## Saving data
    # splitext swaps only the trailing extension; a plain replace would also rewrite
    # any ".txt" occurring in the middle of the name.
    file_name = os.path.splitext(key)[0] + ".xlsx"
    df_lemmatized.to_excel(file_name, index=False)

In [216]:
# Build and export the dataset for every joined file that was loaded.
for key in counter_dict:
    prepare_dataset(key)

17th_joined_file.txt


IntProgress(value=0, max=30564)

IntProgress(value=0, max=23010)

18th_joined_file.txt


IntProgress(value=0, max=58719)

IntProgress(value=0, max=47978)

19th_joined_file.txt


IntProgress(value=0, max=40645)

IntProgress(value=0, max=29909)

20th_joined_file.txt


IntProgress(value=0, max=25010)

IntProgress(value=0, max=18246)