## Prepare Dependencies

In [2]:
import pandas as pd
import nltk
import spacy
import gensim

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize 

from os import listdir
from os.path import isfile, join

#Stopwords dictionary
stop = nltk.corpus.stopwords.words('english')

In [3]:
#Global Variables
nlp = spacy.load('en')

## Helper Functions

In [16]:
def string_maker(file_path):
    with open(str(file_path), 'r') as f_open:
        results = f_open.read()
    return results

def word_parser(series):
    texts = []
    
    for entry in series:
        
        #Lowercase
        raw = entry.lower()
        
        #tokenize
        tokens = w_tokenizer.tokenize(raw)
        
        #Removing stopwords and lemmatization
        
        stemmed_tokens = []
        for i in tokens:
            if i not in stop:
                stemmed_tokens.append(lemmatizer.lemmatize(i))
        
        #Add to list
        texts.append(stemmed_tokens)
        
    return texts
    
def tokenizer(text):
    
    cleaned_text = text.replace('\n', ' ')
    
    tokens = []
    for i in sent_tokenize(cleaned_text):
        
        words = []
        for j in word_tokenize(i):
            words.append(j.lower())
        
        tokens.append(words)
    
    return tokens

In [6]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

## Wrangle Data

In [7]:
file_names = []
for file in listdir('saga/english/'):
    file_names.append(str(file))

print(file_names)

['laxdaela_saga.en2.txt', 'kormaks_saga.en.txt', 'haensna-thoris_saga.en2.txt', 'havardar_saga_isfirdings.en.txt', 'haensna-thoris_saga.en.txt', 'viga-glums_saga.en.txt', 'hrafnkels_saga_freysgoda.en.txt', 'viglundar_saga.en.txt', 'faereyinga_saga.en.txt', 'grettis_saga.en2.txt', 'laxdaela_saga.en.txt', 'egils_saga.en.txt', 'bandamanna_saga.en2.txt', 'heidarviga_saga.en.txt', 'grettis_saga.en.txt', 'eyrbyggja_saga.en.txt', 'thordar_saga_hredu.en.txt', 'eiriks_saga_rauda.en.txt', 'gunnlaugs_saga_ormstungu.en.txt', 'bandamanna_saga.en.txt', 'brennu-njals_saga.en.txt', 'gisla_saga_surssonar.en.txt']


In [8]:
#Import the text from every document as an entry in a dataframe
corpus = []
raw = pd.DataFrame()

for book in file_names:
    corpus.append(string_maker('saga/english/{}'.format(str(book))))

raw['saga_name'] = file_names
raw['text'] = corpus

In [9]:
raw.head(20)

Unnamed: 0,saga_name,text
0,laxdaela_saga.en2.txt,\nThe Story of the Laxdalers\n1903 translation...
1,kormaks_saga.en.txt,\nThe Saga of Cormac the Skald\n1901 translati...
2,haensna-thoris_saga.en2.txt,\nHænsa-Thori's Saga\n2002 translation into En...
3,havardar_saga_isfirdings.en.txt,\nThe Story of Howard the Halt\n1891 translati...
4,haensna-thoris_saga.en.txt,\nThe Story of Hen-Thorir\n1891 translation in...
5,viga-glums_saga.en.txt,\nThe Saga of Viga-Glum\n1866 translation into...
6,hrafnkels_saga_freysgoda.en.txt,"\nThe Story of Hrafnkell, Frey's Priest\n1882 ..."
7,viglundar_saga.en.txt,\nThe Saga of Viglund the Fair\n1901 translati...
8,faereyinga_saga.en.txt,\nThe Saga of Thrond of Gate\n1896 translation...
9,grettis_saga.en2.txt,\nGrettir's Saga\n1914 translation into Englis...


## Clean Text

In [10]:
def book_lines(raw, num):
    
    lines = []
    
    for i, line in enumerate(raw.loc[num, 'text'].split('\n')):
        
        if len(line) > 0 and 'translation into English' not in line and not line.isspace():
            lines.append(line)
        
    return lines

def chapter_by_lines(lines):
    
    book_title = lines[0]
    chapter_titles = []
    chapter_texts = []
    
    book = pd.DataFrame()
    
    ch = -1
    
    for i, line in enumerate(lines):
        
        if i == 0:
            print(line)
            continue
        
        if 'Chapter' in line and any(char.isdigit() for char in line):
            chapter_titles.append(line)
            chapter_texts.append('')
            ch += 1
            
            continue
            print('Unreachable print statement')
        
        if len(line) > 0 and 'translation into English' not in line and not line.isspace():
            chapter_texts[ch] += line
        
    book['chapter_title'] = chapter_titles
    book['chapter_text'] = chapter_texts
    
    return book



In [11]:
lines = book_lines(raw, 11)
Egil = chapter_by_lines(lines)
Egil.head()

Egil's Saga


Unnamed: 0,chapter_title,chapter_text
0,Chapter 1 - Of Kveldulf and his sons.,"There was a man named Ulf, son of Bjalf, and H..."
1,Chapter 2 - Of Aulvir Hnuf.,Audbjorn was then king over the Firthfolk; the...
2,Chapter 3 - The beginning of the rule of Harol...,"Harold, son of Halfdan Swarthy, was heir after..."
3,Chapter 4 - Battle of king Harold and Audbjorn.,King Audbjorn went with his forces northwards ...
4,Chapter 5 - The king's message to Kveldulf.,"King Harold lay with his fleet in the Firths, ..."


In [12]:
tfidf = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english',
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

## Topic Extraction

In [13]:
def tfidf_this(series, tfidf):
    x = tfidf.fit_transform(series)
    tfidf_results = pd.DataFrame(index = tfidf.vocabulary_)
    tfidf_results['score'] = tfidf.idf_
    return tfidf_results

In [14]:
series = Egil['chapter_text'].values

res = tfidf_this(series,tfidf)

In [15]:
res.sort_values('score', ascending = False).head(20)

Unnamed: 0,score
church,4.433987
gift,4.433987
bit,4.433987
wit,4.433987
hroaldsson,4.433987
thanked,4.433987
adding,4.433987
bjorn,4.433987
naming,4.433987
poetry,4.433987


In [None]:
#LDA Model
texts = word_parser(Egil['chapter_text'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics= 10, id2word = dictionary, passes=20)


In [None]:
print(ldamodel.print_topics(num_topics = 10, num_words=20)[0])

print(type(corpus))