In [4]:
import os
import pandas as pd

# --- Configuration -----------------------------------------------------------
# `version` selects the graph output folder read later in the notebook;
# `main_publisher` selects the data sub-folder the CSV files are read from.
version = "v01"
main_publisher = 'Merged'

# NOTE: '__file__' is a string literal here, so realpath() resolves it against
# the current working directory and dirname() then yields that directory.
script_dir = os.path.dirname(os.path.realpath('__file__'))
path = os.path.join(script_dir, f'../Data/{main_publisher}/')

# --- Load the pipe-separated tables -----------------------------------------
# Read in the order series -> episodes -> chapters.
df_series, df_episodes, df_chapters = (
    pd.read_csv(f'{path}{table_name}.csv', sep='|')
    for table_name in ('series', 'episodes', 'chapters')
)

# Empty placeholders; both names are reassigned by later cells.
df_concepts = pd.DataFrame()
df_precedence = pd.DataFrame()

# Preview the chapters table.
df_chapters.head()

Unnamed: 0,Cid,Eid,Sid,Text,Title,BeginTimestamp,EndTimestamp,Corpus
0,0,0,0,Professor Jonathan Holloway : \n “Fellow citi...,"Chapter 1. Frederick Douglass’ Speech, Deliver...",00:00:00,00:04:44,yaleocw
1,1,0,0,This class is about the post-emancipation Afri...,Chapter 2. What does it mean to be American?,00:04:44,00:07:56,yaleocw
2,2,0,0,"Near the Minuteman Park, there’s also a cemete...",Chapter 3. The Story of John Jack,00:07:56,00:15:43,yaleocw
3,3,0,0,"Two hundred years later, after John Jack’s att...",Chapter 4. The Linkage between Freedom and Cit...,00:15:43,00:19:25,yaleocw
4,4,0,0,We’re going to look for stories like this in a...,Chapter 5. The History of the Post-Emancipatio...,00:19:25,00:24:26,yaleocw


In [5]:
from rdflib import Graph

# Build the chapter -> concept table by querying one Turtle graph per chapter.
#
# For every chapter id in `df_chapters['Cid']` the file
# `../Output/Graphs/{version}/{Cid}.ttl` is parsed and every
# (concept, pageRank) pair found is stored as one row with the columns
# OER (chapter id), Concept (concept URI as string) and PR (PageRank as float).
# Chapters whose graph file cannot be opened are reported and collected in
# `missing`. The result is written to `concepts.csv` next to the input tables.

# The query is loop-invariant, so it is defined once.
# NOTE(review): the `dct:` and `ns1:` prefixes are not declared in the query;
# presumably they are bound by the prefixes of each parsed .ttl file - confirm.
concepts_query = """
            SELECT ?concept ?pr WHERE
            {
                ?ER dct:subject ?concept.
                ?concept ns1:pageRank ?pr.
            }
    """

chapters = df_chapters['Cid'].values
missing = []          # chapter ids whose graph file could not be read
concept_records = []  # one dict per (chapter, concept) pair, across all chapters

for c in chapters:
    path_graph = f'../Output/Graphs/{version}/{c}.ttl'
    g = Graph()
    try:
        g.parse(path_graph, format='turtle')
    except (FileNotFoundError, IOError):
        print(f'Chapter {c} file not found !')
        missing.append(c)
        continue

    for row in g.query(concepts_query):
        concept_records.append({
            'OER': c,
            'Concept': str(row.concept),
            'PR': float(str(row.pr)),
        })

# Build the frame once: calling pd.concat inside the loop re-copies every
# accumulated row on each iteration (quadratic in the number of chapters).
# Explicit columns keep the schema stable even when no concept was found.
df_concepts = pd.DataFrame(concept_records, columns=['OER', 'Concept', 'PR'])
print(df_concepts.shape)
df_concepts.to_csv(path + 'concepts.csv', sep = '|', index = False)

Chapter 79 file not found !
Chapter 412 file not found !
Chapter 452 file not found !
Chapter 672 file not found !
Chapter 1022 file not found !
Chapter 1031 file not found !
Chapter 1040 file not found !
Chapter 1046 file not found !
Chapter 1054 file not found !
Chapter 1066 file not found !
Chapter 1075 file not found !
Chapter 1125 file not found !
Chapter 1220 file not found !
Chapter 1227 file not found !
Chapter 2176 file not found !
Chapter 2183 file not found !
Chapter 2198 file not found !
Chapter 2263 file not found !
Chapter 2269 file not found !
Chapter 2271 file not found !
Chapter 2272 file not found !
Chapter 2281 file not found !
Chapter 2289 file not found !
Chapter 2293 file not found !
Chapter 2297 file not found !
Chapter 2306 file not found !
Chapter 2307 file not found !
Chapter 2312 file not found !
Chapter 2322 file not found !
Chapter 2324 file not found !
Chapter 2326 file not found !
Chapter 2332 file not found !
Chapter 2336 file not found !
Chapter 2550 fi

In [7]:
# Keep only the concepts whose PageRank is strictly above `pr_filter`, print
# summary statistics about the filtered table, and save it as
# `concepts_bis.csv`.
pr_filter = 0.005
df_concepts_bis = df_concepts[df_concepts['PR'] > pr_filter]

n_total = len(df_concepts)
n_kept = len(df_concepts_bis)
print(n_total, '\tNUM of concepts')
print(n_kept, '\tNUM of concepts w filter on PR = ',pr_filter)

# Number of concepts kept per chapter (index: chapter id, value: count).
value_counts = df_concepts_bis['OER'].value_counts()

if n_kept == 0:
    # Nothing passed the filter: the ratio would divide by zero and
    # int(mean/median) would fail on NaN, so report zeros instead.
    print('No concept passes the PR filter; per-chapter statistics set to 0')
    rec_value_mean = rec_value_median = rec_value_max = rec_value_min = 0
else:
    # total / kept is a compression *ratio* (e.g. 50.78x), not a percentage.
    print(round(n_total / n_kept, 2), "\tx Compression ratio after filtering")
    rec_value_mean = int(value_counts.mean())      # truncated, not rounded
    rec_value_median = int(value_counts.median())  # truncated, not rounded
    rec_value_max = value_counts.max()
    rec_value_min = value_counts.min()

print(rec_value_mean, '\tAVG concepts per chapter')
print(rec_value_median, '\tMEDIAN concepts per chapter')
print(rec_value_max, '\tMAX concepts per chapter')
print(rec_value_min, '\tMIN concepts per chapter')

df_concepts_bis.to_csv(path + 'concepts_bis.csv', sep = '|', index = False)

1236413 	NUM of concepts
24347 	NUM of concepts w filter on PR =  0.005
50.78 	% Compression after filtering
6 	AVG concepts per chapter
5 	MEDIAN concepts per chapter
30 	MAX concepts per chapter
1 	MIN concepts per chapter


In [None]:
# Build the "direct successor" precedence table: one row (Before, After, Sid)
# for every chapter id `c` whose successor id `c + 1` has the same `Sid`.
# The table is saved as `precedences.csv`.

# Resolve Cid -> Sid once instead of scanning df_chapters with a boolean mask
# several times per iteration. drop_duplicates keeps the first row per Cid,
# which matches what `.values[0]` selected before.
sid_by_cid = df_chapters.drop_duplicates('Cid').set_index('Cid')['Sid'].to_dict()

precedences = []
for c in chapters[:-1]:
    c_next = c + 1
    # A gap in the Cid numbering previously raised IndexError; skip it instead.
    if c_next not in sid_by_cid:
        continue
    if sid_by_cid[c] == sid_by_cid[c_next]:
        precedences.append({
            'Before' : c,
            'After' : c_next,
            'Sid' : sid_by_cid[c]
        })

# Explicit columns keep the CSV header stable even when no pair was found.
df_precedence = pd.DataFrame(precedences, columns=['Before', 'After', 'Sid'])
df_precedence.to_csv(path + 'precedences.csv', sep='|', index = False)
df_precedence.head()

In [11]:
# Build the "all successors" precedence table: for every chapter id `c`, one
# row (Before, After, Sid) for each consecutive following id c+1, c+2, ...
# as long as that chapter has the same `Sid` as `c`.
# The table is saved as `precedences_all.csv`.

# Resolve Cid -> Sid once; the previous per-iteration boolean-mask scans of
# df_chapters made the nested loop needlessly slow. drop_duplicates keeps the
# first row per Cid, which matches what `.values[0]` selected before.
sid_by_cid = df_chapters.drop_duplicates('Cid').set_index('Cid')['Sid'].to_dict()

precedences = []
for c in chapters[:-1]:
    sid_c = sid_by_cid[c]
    c_next = c + 1
    # Stop at the first id that does not exist or has a different Sid.
    # Testing membership (rather than `c_next != len(chapters)`) stays correct
    # when the Cids are not exactly 0..n-1.
    while c_next in sid_by_cid and sid_by_cid[c_next] == sid_c:
        precedences.append({
            'Before' : c,
            'After' : c_next,
            'Sid' : sid_c
        })
        c_next += 1

# Explicit columns keep the CSV header stable even when no pair was found.
df_precedence_all = pd.DataFrame(precedences, columns=['Before', 'After', 'Sid'])
df_precedence_all.to_csv(path + 'precedences_all.csv', sep='|', index = False)
df_precedence_all.head()

Unnamed: 0,Before,After,Sid
0,0,1,0
1,0,2,0
2,0,3,0
3,0,4,0
4,0,5,0
