# Data Analysis phrase_functions No Order

In this first analysis, we will take a look a the data without looking at the order of the words. 
This means that we will only consider the frequency of words, sentences length and the frequency of certain clauses.

In [2]:
import glob
import pandas as pd
import re 
import numpy as np
import operator
import collections


## Loading the data

To make life easier, we can load all phrase functions in one move. This makes life easier because we do not want to do this process for each book seperately, because the phrase_functions of the books are in abstract all the same. All phrase_files are in one folder maned phrase_functions.

In [3]:
phrase_functions_books_direction = glob.glob("Data/phrase_functions/*.txt")

#Sample
phrase_functions_books_direction[:3]

['Data/phrase_functions\\phrase_functions_Chronica_I.txt',
 'Data/phrase_functions\\phrase_functions_Chronica_II.txt',
 'Data/phrase_functions\\phrase_functions_Deuteronomium.txt']

Now, we can create a list with the clauses for each book. The books list contains the clauses for each book seperately. The book name indicate which book we are talking about.

In [4]:
books = []
book_names = []
for book in phrase_functions_books_direction:
    bookFunctions = []
    with open(book) as inputfile:
        for line in inputfile:
            bookFunctions.append(line.strip().split(' '))
    books.append(bookFunctions) 
    bookname = re.search('phrase_functions_(.*?).txt', book).group(1)
    book_names.append(bookname)

#Sample
bookindex = 0
print(book_names[bookindex])
print(books[bookindex][:3])

Chronica_I
[['Conj', 'Pred', 'Subj', 'Objc'], ['Subj', 'PreC'], ['Subj', 'PreC']]


Now, we can extract the unique function names. This is done in a set because a set does not contain duplicates. Hereafter, we sort these function names and put it in a list for a pandas df.

In [5]:
uniqueFunctions_book = []
for book in books:
    setFunctions = set()
    for sentence in book:
        for function in sentence:
            setFunctions.add(function)
    uniqueFunctions = sorted(list(setFunctions))
    uniqueFunctions_book.append(uniqueFunctions)

#Sample
bookindex = 0
print(book_names[bookindex])
print(uniqueFunctions_book[bookindex])  

Chronica_I
['Adju', 'Cmpl', 'Conj', 'Loca', 'Modi', 'Nega', 'Objc', 'PreC', 'PreO', 'PreS', 'Pred', 'Rela', 'Subj', 'Time']


To create a pandas dataframe, we need to know in which clause which function name occurs. Therefore, each row represents a clause and a number in a column represents the occurence of a function in a clause.

In [6]:
df_frequency = []
for index,book in enumerate(books):
    unique_functions = uniqueFunctions_book[index]
    number_of_clauses = len(book)
    number_of_unique_function = len(unique_functions)
    
    zeromatrix= np.zeros((number_of_clauses, number_of_unique_function))
    for row_index, sentence in enumerate(book):
        for function in sentence:
            column_index = uniqueFunctions_book[index].index(function) 
            zeromatrix[row_index][column_index]+=1
    
    df = pd.DataFrame(zeromatrix, columns = unique_functions)
    df.index.name = "Clause"
    df_frequency.append(df)
    
#Sample 
bookindex = 0
print(book_names[bookindex])
df_frequency[bookindex][:5]

Chronica_I


Unnamed: 0_level_0,Adju,Cmpl,Conj,Loca,Modi,Nega,Objc,PreC,PreO,PreS,Pred,Rela,Subj,Time
Clause,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Frequency of functions

In [7]:
df_frequency_functions = []
for df in df_frequency:
    #The frequency of functions in the book = ffd
    ffd = df.sum(0).to_dict()
    #Sorted frequency of functions in the book = sffd
    sffd = sorted(ffd.items(), key=operator.itemgetter(1), reverse=True)
    df1 = pd.DataFrame(sffd, columns = ["Function","Frequency"])
    df1["Relative_Frequency"] = df1["Frequency"]/df1["Frequency"].sum(0)
    df_frequency_functions.append(df1)
    
#Sample top 10
bookindex = 2
print(book_names[bookindex])
df_frequency_functions[bookindex][:10]

Deuteronomium


Unnamed: 0,Function,Frequency,Relative_Frequency
0,Pred,374.0,0.23959
1,Conj,335.0,0.214606
2,Cmpl,220.0,0.140935
3,Objc,157.0,0.100577
4,Subj,147.0,0.09417
5,Adju,58.0,0.037156
6,PreO,56.0,0.035874
7,PreC,48.0,0.03075
8,Rela,48.0,0.03075
9,Time,38.0,0.024343


## Frequency of sentence length

In [8]:
df_frequency_sentence_length = []
for df in df_frequency:
    #The frequency of certain sentences length in the book = fsl
    fsl=collections.Counter(df.sum(1))
    #Sorted frequency of certain sentences length in the book = sfsl
    sfsl = sorted(fsl.items(), key=operator.itemgetter(1), reverse=True)
    df1 = pd.DataFrame(sfsl, columns = ["Function","Frequency"])
    df1["Relative_Frequency"] = df1["Frequency"]/df1["Frequency"].sum(0)
    df_frequency_sentence_length.append(df1)

#Sample
bookindex = 2
print(book_names[bookindex])
df_frequency_sentence_length[bookindex]

Deuteronomium


Unnamed: 0,Function,Frequency,Relative_Frequency
0,3.0,174,0.343874
1,4.0,130,0.256917
2,2.0,101,0.199605
3,1.0,51,0.100791
4,5.0,37,0.073123
5,6.0,11,0.021739
6,7.0,1,0.001976
7,8.0,1,0.001976


## Frequency of sentences

Next step is finding patterns in the clauses. There are a lot of clauses which have the same functions in it. Therefore, we want to know how often some combination of functions (not looking at the order, only if it occurs or not) occurs in the data.

In [9]:
df_senctences = []
for df in df_frequency:
    dfCount = df.groupby(df.columns.tolist()).size().reset_index().rename(columns={0:'count'})
    dfCount = dfCount.sort_values(["count"], ascending=False)
    dfCount = dfCount.reset_index(drop=True)
    dfCount['Relative_count'] = dfCount['count']/len(df)
    df_senctences.append(dfCount)

#Sample top 10
bookindex = 0
print(book_names[bookindex])
df_senctences[bookindex][:10]  

Chronica_I


Unnamed: 0,Adju,Cmpl,Conj,Loca,Modi,Nega,Objc,PreC,PreO,PreS,Pred,Rela,Subj,Time,count,Relative_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,97,0.0776
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,89,0.0712
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,76,0.0608
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,70,0.056
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,56,0.0448
5,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,56,0.0448
6,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,38,0.0304
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,37,0.0296
8,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,37,0.0296
9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,31,0.0248
