# Stats for eHRAF Scraper
The current stats are limited to:
- Cleaning and reorganizing the dataframe
- Finding the most common OCM codes
- Finding association rules (when one OCM appears, which other OCMs are likely to appear with it)

More work will be done.


## Clean the Dataframe


In [5]:
import pandas as pd                 # dataframe storing
import numpy as np
import re                           # regex for searching through strings

# Name of the scraped spreadsheet to analyse; it is read from ../Data/
file = r'subjects~(%22sickness%22)|PSF_web_data.xlsx'
# file = r'cultures-(azande)_AND_subjects-(magical_and_mental_therapy)_FILTERS-culture_level_samples(PSF)_web_data.xlsx'

df = pd.read_excel('../Data/' + file)

# The OCM column is read back as text. Strip the spaces, drop the first and
# last character (the enclosing brackets) and split on commas, so every cell
# becomes a list of OCM code strings.
df['OCM'] = df['OCM'].apply(lambda cell: cell.replace(" ", "")[1:-1].split(','))

# Sanity check: the first code of the first row should be a single OCM string
df.loc[0, 'OCM'][0]

'428'

In [9]:
# Report the longest culture name in the data together with its length
x = set(df["Culture"])
largest_cult = max(x, key=len, default='')
print(f'{largest_cult}       length:{len(largest_cult)}')

Bahia Brazilians       length:16


In [1047]:
# Discard every row whose passage is missing, printing the row count on both sides
print(f'Before: {df.shape[0]}')
df = df.dropna(subset=["Passage"])
print(f'After: {df.shape[0]}')

Before: 42555
After: 42458


In [1048]:
# Searching eHRAF with a higher-order code (750) makes it try to acquire ALL OCMs
# related to that input, so keep only the passages that carry at least one of the
# codes we originally meant to search for (750-754).
lst = ["750","751","752","753","754"]
# lst = ["751", "752"]
msk = df['OCM'].apply(lambda codes: any(code in lst for code in codes))
df = df[msk]
len(df)

14866

### Remove Duplicates
Currently, a passage is removed only if it duplicates another passage that has the same OCMs. Duplicate passages with different OCMs will remain.

In [1049]:
# (exploratory)
# Look at duplicated passages that do NOT come from the same document.
# dup1 flags every row whose passage text occurs more than once.
dup1 = df["Passage"].duplicated(keep=False)
# Within those rows, dup2 flags the ones that share both passage and document title.
dup2 = df[dup1].duplicated(subset=["Passage", "DocTitle"], keep=False)
# What is left are duplicated passages whose copies sit in different documents
cross_doc_dups = df[dup1][~dup2].sort_values(by="Passage")
print(f'Number of passages whose duplicates come from different documents: {len(cross_doc_dups)}')
# only the top 4 are shown
cross_doc_dups.head(4)

Number of passages whose duplicates come from different documents: 24


Unnamed: 0,Region,SubRegion,Culture,DocTitle,Year,OCM,OWC,Passage,run_Info
12016,Asia,South Asia,Santal,"The hill of flutes: life, love, and poetry in ...",1974,"[754, 755, 782]",aw42,"If a bonga has been buried by a witch, the fir...",
11724,Asia,South Asia,Santal,Tribal law and justice: a report on the Santal,1984,"[754, 755, 782]",aw42,"If a bonga has been buried by a witch, the fir...",
11660,Asia,South Asia,Santal,Tribal law and justice: a report on the Santal,1984,[754],aw42,If no member of the inner family or household ...,
11987,Asia,South Asia,Santal,"The hill of flutes: life, love, and poetry in ...",1974,[754],aw42,If no member of the inner family or household ...,


In [1050]:
# (exploratory)
# Look at duplicated passages whose copies carry different OCM codes.
df = df.copy()
# Temporarily store each OCM list as a tuple to allow for comparisons
df["OCM"] = df["OCM"].apply(lambda codes: tuple(codes))

# Among the duplicated passages, flag those that share passage AND OCMs
dup3 = df[dup1].duplicated(subset=["Passage", "OCM"], keep=False)

# Restore the OCM column to lists before anything is displayed
df["OCM"] = df["OCM"].apply(lambda codes: list(codes))

# Duplicated passages whose OCMs do NOT match (only the top 4 are shown)
ocm_mismatch_dups = df[dup1][~dup3].sort_values(by="Passage")
print(f'Number of passages whose duplicates do not share OCMs:  {len(ocm_mismatch_dups)}')
ocm_mismatch_dups.head(4)

Number of passages whose duplicates do not share OCMs:  59


Unnamed: 0,Region,SubRegion,Culture,DocTitle,Year,OCM,OWC,Passage,run_Info
16708,Asia,Southeast Asia,Ifugao,The Mayawyaw ritual: VI. illness and its ritual,1955,"[539, 752, 755, 775, 776, 782, 787, 793]",oa19,"1. Are living Bugan and Wigan at Dukligan,/ we...",
15875,Asia,Southeast Asia,Ifugao,The Mayawyaw ritual: VI. illness and its ritual,1955,"[539, 751, 755, 776, 782, 787, 793]",oa19,"1. Are living Bugan and Wigan at Dukligan,/ we...",
15871,Asia,Southeast Asia,Ifugao,The Mayawyaw ritual: VI. illness and its ritual,1955,"[751, 755, 776, 782, 787]",oa19,"1. Mata’gu cha Bu’gan ya Wi’gan ad Chu-li’gan,...",
16492,Asia,Southeast Asia,Ifugao,The Mayawyaw ritual: VI. illness and its ritual,1955,"[539, 752, 755, 775, 776, 782, 793]",oa19,"1. Mata’gu cha Bu’gan ya Wi’gan ad Chu-li’gan,...",


In [1051]:
# Drop passages that are exact duplicates, i.e. same passage text AND same OCMs.

# Store each OCM list as a tuple to allow for comparisons
df["OCM"] = df["OCM"].apply(lambda codes: tuple(codes))

# Keep the first occurrence of every (Passage, OCM) pair; report the row counts
print(f'Before {df.shape[0]}')
df.drop_duplicates(subset=["Passage", "OCM"], keep='first', inplace=True)
print(f'After {df.shape[0]}')

# Turn the OCM tuples back into lists
df["OCM"] = df["OCM"].apply(lambda codes: list(codes))

Before 14866
After 14705


### Shave OCMs
And make an exploded OCM dataframe

In [1052]:
# Give every OCM code its own row by exploding the list column, then renumber the rows
df_OCM = df.explode('OCM').reset_index(drop=True)
# Collect the codes that do not fit the usual three-character (100-900) OCM scheme.
# NOTE: 0 means the material is not relevant; it is unclear why it sometimes appears
#       next to other OCMs in the same passage.
# NOTE: 5310 and 5311 are believed to be different specifications of 531, and 1710 may
#       be a more specific (and singular) subset of 171. The same is not believed to
#       hold for 77 and 1787.
list_OCM = df_OCM['OCM'].value_counts().index.tolist()
small_OCM = [code for code in list_OCM if len(code) != 3]
small_OCM

['0', '5311', '5310']

In [1053]:
# Remove unwanted OCM codes and "shave" over-long ones in the exploded dataframe.
# Add any code that should be removed entirely to this list.
remove_list = ['1787','77']
print(f'starting list {len(df_OCM)}')
df_OCM = df_OCM[~df_OCM["OCM"].isin(remove_list)]
# Cut every code down to its first three characters, so codes that seem to have a
# parent collapse onto it (5310 and 5311 become 531); shorter codes are unchanged.
df_OCM['OCM'] = df_OCM['OCM'].apply(lambda code: code[:3])
print(f'Ending list {len(df_OCM)}')

starting list 52784
Ending list 52784


In [1054]:
# Apply the same removals to the original (non-exploded) dataframe. This is easier
# than imploding df_OCM again, because duplicates limit that approach.

# For each row's OCM list, in one pass:
#   - skip every code listed in remove_list
#   - cut codes that appear in small_OCM down to their first 3 characters
#     (so 5310 becomes 531) and keep all other codes unchanged
# The result is written back to the column as a new list per row.
df["OCM"] = df["OCM"].apply(
    lambda codes: [
        (code[0:3] if code in small_OCM else code)
        for code in codes
        if code not in remove_list
    ]
)


### Create Dictionary for later count comparisons

In [1055]:
# Map every culture to the number of passages (rows of df) it contributes
culture_set = set(df["Culture"])
culture_dict = {
    culture: int((df["Culture"] == culture).sum())
    for culture in culture_set
}
# Total passages over all cultures
it_count = sum(culture_dict.values())
print(f'Passages: {it_count}')


Passages: 14705


### Optional Exploration

In [1085]:
# (OPTIONAL)
# Quick look-up of passages by OCM code, regardless of culture.
# NOTE: sometimes a higher-order code like 750 appears without lower-order codes.
lst = ["159"] # enter your OCM strings here, separated by commas
# keep the rows whose OCM list contains at least one of the requested codes
msk = df['OCM'].apply(lambda codes: any(code in lst for code in codes))
out = df[msk]
out

Unnamed: 0,Region,SubRegion,Culture,DocTitle,Year,OCM,OWC,Passage,run_Info
118,Africa,Central Africa,Azande,Zande Kings and Princes,1962,"[159, 186, 575, 643, 754, 759, 789, 791]",fo07,"Gbudwe was a stickler for tradition, and in th...",
425,Africa,Central Africa,Azande,"Witchcraft, oracles and magic among the Azande",1937,"[159, 683, 754, 763]",fo07,This single case concerns Bizanga in the count...,
426,Africa,Central Africa,Azande,"Witchcraft, oracles and magic among the Azande",1937,"[159, 683, 754, 763]",fo07,He called upon one of his neighbours to cut op...,
721,Africa,Central Africa,Azande,The Azande: history and political institutions,1971,"[159, 173, 643, 644, 682, 754, 787]",fo07,One of Bazingbi’s children died of sickness an...,
1483,Africa,Central Africa,Mbuti,The forest people,1962,"[159, 177, 439, 563, 754]",fo04,"It was curious that Aberi, the one who feared ...",
...,...,...,...,...,...,...,...,...,...
42217,South-America,Southern South America,Ona,"The Fireland Indians: Vol. 1. The Selk'nam, on...",1931,"[159, 626, 754, 756]",sh04,Many a death and repeated misfortune had been ...,
42218,South-America,Southern South America,Ona,"The Fireland Indians: Vol. 1. The Selk'nam, on...",1931,"[123, 159, 626, 627, 754, 756]",sh04,But suspicion and dislike of him soon found ne...,
42219,South-America,Southern South America,Ona,"The Fireland Indians: Vol. 1. The Selk'nam, on...",1931,"[157, 159, 578, 626, 627, 754, 756]",sh04,Scarcely had Martin's little daughter breathed...,
42221,South-America,Southern South America,Ona,"The Fireland Indians: Vol. 1. The Selk'nam, on...",1931,"[123, 159, 164, 626, 754, 756]",sh04,The night dispelled the general excitement of ...,


In [1082]:
# Quick look-up of passages by OCM code, restricted to the rows of one culture
lst = ["159"] # enter your OCM strings here, separated by commas
culture_rows = df.loc[df["Culture"] == "Akan"]
# flag the rows of that culture whose OCM list contains at least one requested code
msk = culture_rows['OCM'].apply(lambda codes: any(code in lst for code in codes))
out = culture_rows[msk]
out

Unnamed: 0,Region,SubRegion,Culture,DocTitle,Year,OCM,OWC,Passage,run_Info
3681,Africa,Western Africa,Akan,Search for security: an ethno-psychiatric stud...,1970,"[159, 754, 787, 793]",fe12,Nothing further happened for another three mon...,
3691,Africa,Western Africa,Akan,Search for security: an ethno-psychiatric stud...,1970,"[154, 159, 426, 625, 682, 754, 778, 787]",fe12,To complete the picture of the anxious man's r...,
3706,Africa,Western Africa,Akan,Search for security: an ethno-psychiatric stud...,1970,"[159, 164, 743, 753, 778]",fe12,A few examples will illuminate the attitude of...,
3707,Africa,Western Africa,Akan,Search for security: an ethno-psychiatric stud...,1970,"[159, 164, 743, 753, 776, 778]",fe12,"A very sensible and hard-working woman farmer,...",
3708,Africa,Western Africa,Akan,Search for security: an ethno-psychiatric stud...,1970,"[159, 164, 743, 753, 778]",fe12,Again I found at another shrine an exceedingly...,
...,...,...,...,...,...,...,...,...,...
5274,Africa,Western Africa,Akan,Search for security: an ethno-psychiatric stud...,1970,"[159, 753]",fe12,"Onset three weeks ago with pain in ribs, cough...",
5275,Africa,Western Africa,Akan,Search for security: an ethno-psychiatric stud...,1970,"[159, 688, 753]",fe12,At the shrine she confessed that she had a bad...,
5276,Africa,Western Africa,Akan,Search for security: an ethno-psychiatric stud...,1970,"[159, 753]",fe12,No information. The patient gave a propitiator...,
5277,Africa,Western Africa,Akan,Search for security: an ethno-psychiatric stud...,1970,"[159, 753]",fe12,Physical illness attributed to misdeeds which ...,


In [1057]:
# (OPTIONAL)
# Some passages only describe previous passages and carry no information of their
# own, like "Notes", "End" or "Log". They may disrupt the OCM stats because they
# have OCMs attached without any text that refers to those OCMs.
# This cell only reports how many such short passages exist; it removes nothing.
shortPass_list = [passage for passage in df['Passage'] if len(passage) <= 10]
print(f'Number of passages with text with 10 of fewer characters: {len(shortPass_list)}')

Number of passages with text with 10 of fewer characters: 39


## OCM Code Counting
Count every OCM within each culture. Do not count the OCMs specified by the search (e.g., if the search was for 750-754, do not count these).
<!-- - REMOVE all passages which are blank since we can't very well do lexical searches on them -->

In [1058]:
# Build a per-culture table of OCM counts and proportions.
# Work on a copy of df_OCM so the other analyses are not affected.
df_OCM_freq = df_OCM.copy()
# The codes are stored as strings; convert them to integers so they can be range-filtered
df_OCM_freq['OCM'] = df_OCM_freq['OCM'].apply(int)
# Keep only the OCMs outside the search parameters (everything except 750-754)
df_sub_ex = df_OCM_freq.loc[(df_OCM_freq["OCM"] < 750) | (df_OCM_freq["OCM"] > 754)]

# Build one small frame per culture and concatenate them all at once.
# Growing a dataframe with pd.concat inside the loop re-copies it on every pass,
# and starting from an empty frame relies on deprecated pandas concat behaviour.
freq_columns = ["Culture", "OCM", "Frequency", "Proportion_of_Passages"]
culture_frames = []
for key, val in culture_dict.items():
    # occurrences of each remaining OCM within this culture
    value_count = df_sub_ex.loc[df_sub_ex["Culture"] == key, "OCM"].value_counts()
    culture_frames.append(
        pd.DataFrame(
            {
                # repeat the culture name so every OCM row is labelled with it
                "Culture": [key] * len(value_count),
                "OCM": value_count.index,
                "Frequency": value_count.values,
                # val is the number of passages recorded for this culture
                "Proportion_of_Passages": value_count.values / val,
            }
        )
    )

# Overwrite df_OCM_freq with the combined table (empty, but with the right
# columns, when there are no cultures at all)
if culture_frames:
    df_OCM_freq = pd.concat(culture_frames, ignore_index=True)
else:
    df_OCM_freq = pd.DataFrame(columns=freq_columns)
# Cultures alphabetically, most frequent OCM first within each culture
df_OCM_freq = df_OCM_freq.sort_values(by=["Culture", "Frequency"], ascending=[True, False])
df_OCM_freq

Unnamed: 0,Culture,OCM,Frequency,Proportion_of_Passages
1223,Akan,159,1393,0.672622
1224,Akan,158,1312,0.633510
1225,Akan,164,246,0.118783
1226,Akan,886,206,0.099469
1227,Akan,778,114,0.055046
...,...,...,...,...
4739,Yanoama,266,1,0.013333
4740,Yanoama,609,1,0.013333
4741,Yanoama,160,1,0.013333
4742,Yanoama,162,1,0.013333


In [1095]:
# Sum of the Frequency column divided by the number of distinct cultures in the table
total_ocm_count = sum(df_OCM_freq["Frequency"])
n_cultures = len(set(df_OCM_freq["Culture"]))
print(f'OCMs per culture: {total_ocm_count / n_cultures}')

OCMs per culture: 626.1


In [1059]:
# Save the per-culture OCM frequency table as Culture_Frequency.csv
# (relative path, so it lands in the current working directory); the
# dataframe index is not written.
df_OCM_freq.to_csv("Culture_Frequency.csv", index=False)

## Association Rules for OCMs
Using machine learning, we will attempt to determine the co-occurrence of OCMs. For example, if the OCM code 262 is present, what is the likelihood that both 751 and 752 would be present?

In [1060]:
# Load resources
from mlxtend.preprocessing import TransactionEncoder

# We will use the apriori module to generate a dataframe that
# we can use for association rule finding
from mlxtend.frequent_patterns import apriori

# We will use the association_rules module to generate
# our association rules from the apriori output data frame
from mlxtend.frequent_patterns import association_rules





In [1061]:
# Keep only the columns needed for the association analysis and display them
important_columns = ['Culture', 'OCM', 'Passage']
df_smaller = df_OCM[important_columns]
df_smaller

Unnamed: 0,Culture,OCM,Passage
0,Azande,428,"The property of commoners, their wives, and an..."
1,Azande,754,"The property of commoners, their wives, and an..."
2,Azande,626,Day-to-day behavior is largely governed by the...
3,Azande,681,Day-to-day behavior is largely governed by the...
4,Azande,684,Day-to-day behavior is largely governed by the...
...,...,...,...
52779,Ona,756,"The shamans, called xo'on, had great prestige ..."
52780,Ona,787,"The shamans, called xo'on, had great prestige ..."
52781,Ona,682,"It should be noted, of course, that in this cu..."
52782,Ona,754,"It should be noted, of course, that in this cu..."


In [1062]:
# Group the exploded rows by Culture and Passage, so that all OCM rows
# belonging to the same passage of the same culture fall into one group.
# Displaying df_group only shows the GroupBy object itself, not its contents.
df_group = df_smaller.groupby(by = ['Culture', 'Passage'])
df_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc7523225f0>

In [1063]:
def make_OCM_list(x):
    '''
    Collapse the values of one grouping down to its distinct entries.

    Meant to be handed to the agg method of a grouped dataframe: x is the
    set of values belonging to one group, and whatever x.unique() yields
    (the distinct values, in order of first appearance) is returned as is.
    '''
    distinct_values = x.unique()
    return distinct_values

In [1064]:
# Aggregate every (Culture, Passage) group with make_OCM_list, which leaves
# the unique OCM codes of each group.
# Note: depending on the filtering, duplicate passages with different OCMs may
# still exist. They land in the same group here, so their codes are combined
# and de-duplicated, which means this may not be a problem.
df_unique = df_group.agg(make_OCM_list)

In [1065]:
# One "transaction" per (Culture, Passage) group: the collection of its unique OCM codes
list_trans = df_unique['OCM'].tolist()
len(list_trans)

14642

In [1066]:
# One-hot encode the transactions: one row per transaction, one boolean column per OCM code
te = TransactionEncoder()
te.fit(list_trans)
encoded_itemset = te.transform(list_trans)
print(encoded_itemset.shape) # number of transactions and number of distinct items

# Wrap the encoded array in a dataframe whose columns are labelled with the item names
df_encoded = pd.DataFrame(encoded_itemset, columns=te.columns_)
df_encoded.head()

(14642, 555)


Unnamed: 0,0,101,102,103,104,105,112,113,114,115,...,885,886,887,888,890,900,901,902,903,984
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [1067]:
# A small amount of cleanup before mining: drop every column (item) whose name has
# fewer than 1 character, since that is just blank space rather than an OCM code.
# More data cleaning may be required later if errors become evident in the scraped dataset.
OCM_items = [item for item in te.columns_ if len(item) < 1]
print("removed: ",  OCM_items)
df_encoded = df_encoded.drop(columns=OCM_items) # blank names do not seem to be real items
print('How many unique items are left?', len(df_encoded.columns))

removed:  []
How many unique items are left? 555


In [1068]:
# Run apriori to get a dataframe with a support column and an itemsets column.
# If there are many items compared to the sample (few rows but many columns), a higher
# min_support is recommended, because many more combinations may show spuriously high
# support. Selecting too many itemsets can also crash the program.
df_support = apriori(df_encoded, min_support=0.01, use_colnames=True)
# most frequent itemsets first
df_support = df_support.sort_values(by='support', ascending=False)
df_support

Unnamed: 0,support,itemsets
28,0.588239,(754)
27,0.247371,(753)
40,0.131335,(776)
29,0.111050,(755)
3,0.107567,(159)
...,...,...
185,0.010313,"(753, 782, 776)"
108,0.010176,"(753, 825)"
54,0.010176,(827)
8,0.010176,(182)


## Use association_rules to find the rules

Using the dataframe generated by `apriori`, find the association rules with the greatest lift.  See the [association_rules documentation](https://rasbt.github.io/mlxtend/api_modules/mlxtend.frequent_patterns/association_rules/) for how to do this.

Sort the resulting DataFrame by lift in descending order.  A lift > 1 indicates that the items are often purchased together and that buying X will increase the purchase of Y (here: the OCM codes X and Y appear together in a passage more often than chance).  A lift of < 1 indicates the items are often substituted.  That is, X is substituted for Y, so X and Y don't appear together often.

Examine the resulting DataFrame.  For the association rule X -> Y, X is the column `antecedents` and Y is the column `consequents`.  If sorted you can see the metrics for each rule based upon the lift.

In [1069]:
# Derive the association rules from the frequent itemsets found by apriori,
# keeping only the rules whose lift is at least 1.0
rules = association_rules(df_support, metric = 'lift', min_threshold=1.0)
# How to read lift for a rule X -> Y:
# lift > 1: seeing X makes Y more likely than chance
# lift = 1: X and Y appear together as often as chance
# lift < 1 (substitution): seeing X makes Y less likely than chance


In [1070]:
# Order the discovered rules from highest to lowest lift and display them
rules = rules.sort_values(by='lift', ascending=False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5420,"(793, 755, 787, 775)","(539, 782, 776, 752)",0.012362,0.013181,0.010518,0.850829,64.548364,0.010355,6.615340
5443,"(539, 782, 752)","(755, 776, 787, 775, 793)",0.013181,0.012362,0.010518,0.797927,64.548364,0.010355,4.887543
4145,"(793, 755, 787, 775)","(539, 782, 752)",0.012362,0.013181,0.010518,0.850829,64.548364,0.010355,6.615340
5358,"(755, 776, 787, 775, 793)","(539, 782, 752)",0.012362,0.013181,0.010518,0.850829,64.548364,0.010355,6.615340
5381,"(539, 782, 776, 752)","(793, 755, 787, 775)",0.013181,0.012362,0.010518,0.797927,64.548364,0.010355,4.887543
...,...,...,...,...,...,...,...,...,...
3699,(754),(828),0.588239,0.019738,0.011952,0.020318,1.029405,0.000341,1.000592
25,(754),(778),0.588239,0.046783,0.027933,0.047486,1.015030,0.000414,1.000738
24,(778),(754),0.046783,0.588239,0.027933,0.597080,1.015030,0.000414,1.021942
6206,(826),(754),0.017416,0.588239,0.010313,0.592157,1.006660,0.000068,1.009606


In [1071]:
# Show the rules whose antecedents contain at least one of the listed OCM codes
lst = frozenset(["793","226"])
msk = rules['antecedents'].apply(lambda antecedent: not lst.isdisjoint(antecedent))
out = rules[msk]
out

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5420,"(793, 755, 787, 775)","(539, 782, 776, 752)",0.012362,0.013181,0.010518,0.850829,64.548364,0.010355,6.615340
4145,"(793, 755, 787, 775)","(539, 782, 752)",0.012362,0.013181,0.010518,0.850829,64.548364,0.010355,6.615340
5358,"(755, 776, 787, 775, 793)","(539, 782, 752)",0.012362,0.013181,0.010518,0.850829,64.548364,0.010355,6.615340
5354,"(782, 755, 787, 775, 793)","(539, 776, 752)",0.012362,0.013250,0.010518,0.850829,64.215640,0.010354,6.614883
3831,"(793, 755, 787, 775)","(539, 776, 752)",0.012362,0.013250,0.010518,0.850829,64.215640,0.010354,6.614883
...,...,...,...,...,...,...,...,...,...
228,(793),(752),0.033260,0.077995,0.017689,0.531828,6.818755,0.015095,1.969371
66,"(755, 793)",(776),0.023426,0.131335,0.020830,0.889213,6.770595,0.017754,7.840849
839,"(793, 787)",(776),0.016869,0.131335,0.014752,0.874494,6.658523,0.012537,6.921302
41,(793),(755),0.033260,0.111050,0.023426,0.704312,6.342274,0.019732,3.006378
