## Prepare Dependencies

In [1]:
import pandas as pd
import nltk
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer

from os import listdir
from os.path import isfile, join

from gensim import corpora, models

In [2]:
# Global variables
# Load spaCy's small English pipeline by its full package name. The short
# 'en' alias is a shortcut link that only exists in spaCy 2.x; spaCy 3
# removed shortcut links, while the package name works in both.
nlp = spacy.load('en_core_web_sm')

## Helper Functions

In [3]:
def string_maker(file_path, encoding='utf-8'):
    """Return the whole content of a text file as one string.

    Args:
        file_path: Location of the file. It is converted with ``str()``
            before being opened.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so non-ASCII characters decode the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        str: Everything read from the file.
    """
    with open(str(file_path), 'r', encoding=encoding) as f_open:
        return f_open.read()

In [None]:
# Shared NLTK helpers, created once at module level:
# a WordNet-based lemmatizer and a tokenizer that splits on whitespace.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

## Wrangle Data

In [4]:
# Collect the names of the saga text files. Entries that are not regular
# files (for example sub-directories) are skipped, because every name is
# later opened and read as text. The order is whatever listdir returns.
file_names = [
    str(entry)
    for entry in listdir('saga/english/')
    if isfile(join('saga/english/', entry))
]

print(file_names)

['laxdaela_saga.en2.txt', 'kormaks_saga.en.txt', 'haensna-thoris_saga.en2.txt', 'havardar_saga_isfirdings.en.txt', 'haensna-thoris_saga.en.txt', 'viga-glums_saga.en.txt', 'hrafnkels_saga_freysgoda.en.txt', 'viglundar_saga.en.txt', 'faereyinga_saga.en.txt', 'grettis_saga.en2.txt', 'laxdaela_saga.en.txt', 'egils_saga.en.txt', 'bandamanna_saga.en2.txt', 'heidarviga_saga.en.txt', 'grettis_saga.en.txt', 'eyrbyggja_saga.en.txt', 'thordar_saga_hredu.en.txt', 'eiriks_saga_rauda.en.txt', 'gunnlaugs_saga_ormstungu.en.txt', 'bandamanna_saga.en.txt', 'brennu-njals_saga.en.txt', 'gisla_saga_surssonar.en.txt']


In [5]:
# Read every saga into memory, then pair each file name with its full text
# so that the dataframe holds one row per document.
corpus = [
    string_maker('saga/english/{}'.format(str(name)))
    for name in file_names
]

raw = pd.DataFrame({'saga_name': file_names, 'text': corpus})

In [6]:
# Preview the first five rows to check that names and texts line up.
raw.head(n=5)

Unnamed: 0,saga_name,text
0,laxdaela_saga.en2.txt,\nThe Story of the Laxdalers\n1903 translation...
1,kormaks_saga.en.txt,\nThe Saga of Cormac the Skald\n1901 translati...
2,haensna-thoris_saga.en2.txt,\nHænsa-Thori's Saga\n2002 translation into En...
3,havardar_saga_isfirdings.en.txt,\nThe Story of Howard the Halt\n1891 translati...
4,haensna-thoris_saga.en.txt,\nThe Story of Hen-Thorir\n1891 translation in...


## Clean Text

In [7]:
def book_lines(raw, num):
    """Return the content lines of one book's text.

    Args:
        raw: DataFrame with a 'text' column holding each book's full text.
        num: Index label of the row (book) to read; looked up with ``.loc``.

    Returns:
        list of str: The text split on newlines, without empty or
        whitespace-only lines and without any line that contains
        'translation into English'.
    """
    # Read the requested row rather than always the first one.
    text = raw.loc[num, 'text']

    return [
        line
        for line in text.split('\n')
        if line and not line.isspace() and 'translation into English' not in line
    ]

# Extract the content lines of the book stored in row 0 of `raw`.
lines = book_lines(raw, num=0)

def chapter_by_lines(lines):
    """Group a book's lines into one row per chapter.

    The first line is treated as the book title and is printed. Every later
    line that contains the word 'Chapter' and at least one digit starts a
    new chapter; the content lines that follow it make up that chapter's
    text.

    Args:
        lines: Sequence of text lines for one book, title first.

    Returns:
        pandas.DataFrame: Columns 'chapter_title' and 'chapter_text', one
        row per chapter in reading order. The frame has both columns but no
        rows when ``lines`` is empty or contains no chapter heading.
    """
    chapter_titles = []
    # One list of lines per chapter; joined into a single string at the end.
    chapter_parts = []

    if lines:
        # The title is only reported, it is not stored in the result.
        print(lines[0])

    for line in lines[1:]:
        if 'Chapter' in line and any(char.isdigit() for char in line):
            chapter_titles.append(line)
            chapter_parts.append([])
            continue

        if not line or line.isspace() or 'translation into English' in line:
            continue

        if not chapter_parts:
            # Text ahead of the first heading has no chapter to belong to;
            # skip it instead of failing on an empty list.
            continue

        chapter_parts[-1].append(line)

    return pd.DataFrame({
        'chapter_title': chapter_titles,
        # Join with a space so the last word of one line does not fuse with
        # the first word of the next.
        'chapter_text': [' '.join(parts) for parts in chapter_parts],
    })

# Build the per-chapter table from the extracted lines and preview the
# first five chapters.
Laxdalers = chapter_by_lines(lines)
Laxdalers.head(n=5)

The Story of the Laxdalers


Unnamed: 0,chapter_title,chapter_text
0,Chapter 1,"Ketil Flatneb hight a man, the son of Bjorn Ro..."
1,Chapter 2,In the latter days of Ketil came to pass the r...
2,Chapter 3,After that had Ketil a very great guesting. Th...
3,Chapter 4,Ketil Flatneb came in his ship to Scotland and...
4,Chapter 5,Now Unn makes ready to depart from the Sheep-i...


In [8]:
# TF-IDF vectorizer configuration. "Document" below means one entry of
# whatever collection the vectorizer is fitted on.
tfidf = TfidfVectorizer(
    # Ignore terms that occur in more than half of the documents.
    max_df=0.5,
    # Ignore terms that occur in fewer than two documents.
    min_df=2,
    # Remove scikit-learn's built-in English stop words.
    stop_words='english',
    # Convert all text to lower case before tokenizing.
    lowercase=True,
    # Weight term counts by inverse document frequency.
    use_idf=True,
    # Scale each document vector to unit L2 length so that long and short
    # documents are treated alike.
    norm=u'l2',
    # Add one to every document frequency, as if one extra document held
    # each term once; this prevents division by zero.
    smooth_idf=True,
)

## Topic Extraction

In [None]:
def tfidf_this(series, tfidf):
    """Fit ``tfidf`` on ``series`` and return the IDF weight of every term.

    Args:
        series: Iterable of text documents to fit the vectorizer on.
        tfidf: Vectorizer object exposing ``fit_transform``, ``vocabulary_``
            (a term -> feature index mapping) and ``idf_`` (one weight per
            feature index). It is fitted in place.

    Returns:
        pandas.DataFrame: Indexed by term, with a single 'score' column
        holding that term's ``idf_`` value. Note this is the corpus-level
        IDF weight, not a per-document TF-IDF value.
    """
    tfidf.fit_transform(series)

    vocabulary = tfidf.vocabulary_
    # idf_ is ordered by feature index, whereas iterating the vocabulary_
    # dict yields terms in insertion order. List the terms by their feature
    # index so every weight is paired with its own term.
    terms = sorted(vocabulary, key=vocabulary.get)

    return pd.DataFrame({'score': tfidf.idf_}, index=terms)

