In [2]:
import os
import pandas as pd

# Run configuration: graph version tag and the publisher whose corpus is processed.
version = "v01"
main_publisher = 'OYC'

# `__file__` is not defined inside a notebook kernel, so the *quoted* literal is
# resolved against the current working directory; taking its dirname yields
# that directory. All data paths below are therefore relative to the cwd.
script_dir = os.path.dirname(os.path.realpath('__file__'))
data_path = os.path.join(script_dir, f'../Data/{main_publisher}/data/')

# chapters.csv is pipe-delimited; preview the first rows as the cell output.
df_chapters = pd.read_csv(os.path.join(data_path, 'chapters.csv'), sep='|')
df_chapters.head()

Unnamed: 0,Cid,Eid,Sid,PartN,Text,Title,BeginTimestamp,EndTimestamp,Corpus
0,0,0,0,0,Professor Jonathan Holloway : \n “Fellow citi...,"Chapter 1. Frederick Douglass’ Speech, Deliver...",00:00:00,00:04:44,yaleocw
1,1,0,0,1,This class is about the post-emancipation Afri...,Chapter 2. What does it mean to be American?,00:04:44,00:07:56,yaleocw
2,2,0,0,2,"Near the Minuteman Park, there’s also a cemete...",Chapter 3. The Story of John Jack,00:07:56,00:15:43,yaleocw
3,3,0,0,3,"Two hundred years later, after John Jack’s att...",Chapter 4. The Linkage between Freedom and Cit...,00:15:43,00:19:25,yaleocw
4,4,0,0,4,We’re going to look for stories like this in a...,Chapter 5. The History of the Post-Emancipatio...,00:19:25,00:24:26,yaleocw


In [3]:
# Number of chapter rows loaded from chapters.csv.
df_chapters.shape[0]

2550

In [4]:
# Print the column dtypes and non-null counts of the chapters frame, which
# shows which columns contain missing values.
df_chapters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2550 entries, 0 to 2549
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Cid             2550 non-null   int64 
 1   Eid             2550 non-null   int64 
 2   Sid             2550 non-null   int64 
 3   PartN           2550 non-null   int64 
 4   Text            2550 non-null   object
 5   Title           2531 non-null   object
 6   BeginTimestamp  2550 non-null   object
 7   EndTimestamp    2550 non-null   object
 8   Corpus          2550 non-null   object
dtypes: int64(4), object(5)
memory usage: 179.4+ KB


In [9]:
from concepts import text2RDF

# Generate an RDF graph for a small sample of chapters (positional rows 4 and 5).
# `dropna()` discards any row with a missing value in *any* column, so a chapter
# without e.g. a Title is skipped here even though only Text and Cid are used.
# The second argument is the output path without extension, keyed by chapter id.
graph_dir = "../Output/Graphs/" + version + "/" + main_publisher + "/"
df_chapters[4:6].dropna().apply(
    lambda row: text2RDF(row['Text'], graph_dir + str(row['Cid'])),
    axis=1,
)

Wikification for : 4
Graph generation for : 4
Saved in  ../Output/Graphs/v01/OYC/4.ttl
Wikification for : 5
Graph generation for : 5
Saved in  ../Output/Graphs/v01/OYC/5.ttl


4    ((http://example.org/Chapter, http://purl.org/...
5    ((http://example.org/Chapter, http://purl.org/...
dtype: object

In [5]:
from rdflib import Graph

# Build one long table of (chapter, concept, PageRank) rows by querying the
# Turtle graph previously generated for each chapter.
chapters = df_chapters['Cid'].values
missing = []          # chapter ids whose graph file could not be read
concept_records = []  # one dict per (chapter, concept) pair, across all chapters

# The query is the same for every chapter, so it is defined once outside the loop.
# NOTE(review): no PREFIX is declared here; `dct:` and `ns1:` presumably resolve
# through the prefixes bound on the graph when the Turtle file is parsed.
# `ns1` looks auto-generated, so confirm it is stable across generated files.
concepts_query = """
            SELECT ?concept ?pr WHERE
            {
                ?ER dct:subject ?concept.
                ?concept ns1:pageRank ?pr.
            }
    """

for c in chapters:
    path_graph = f'../Output/Graphs/{version}/{main_publisher}/{c}.ttl'
    g = Graph()
    try:
        g.parse(path_graph, format='turtle')
    except (FileNotFoundError, IOError):
        # Best effort: remember the chapter and carry on with the others.
        print(f'Chapter {c} file not found !')
        missing.append(c)
        continue
    for row in g.query(concepts_query):
        concept_records.append({
            'OER': c,
            'Concept': str(row.concept),
            'PR': float(str(row.pr)),
        })

# Build the frame once instead of concatenating inside the loop (which copies
# the accumulated frame on every iteration). Explicit columns keep the schema
# intact even when no concept was found at all.
df_concepts = pd.DataFrame(concept_records, columns=['OER', 'Concept', 'PR'])
print(df_concepts.shape)

# The original wrote to `path`, a name never defined in this notebook
# (NameError on a fresh kernel); `data_path` from the configuration cell is
# used instead. NOTE(review): confirm this is the intended output directory.
df_concepts.to_csv(data_path + 'concepts.csv', sep='|', index=False)

(350080, 3)


In [6]:
# Keep only concepts whose PageRank exceeds the threshold, then report how much
# the table shrank and how many concepts remain per chapter.
pr_filter = 0.005
df_concepts_bis = df_concepts[df_concepts['PR'] > pr_filter]

print(len(df_concepts), '\tNUM of concepts')
print(len(df_concepts_bis), '\tNUM of concepts w filter on PR = ',pr_filter)
# NOTE(review): this value is the size ratio original/filtered (e.g. 21.7 means
# ~21.7x fewer rows), not a percentage, despite the "%" in the label. The label
# is left unchanged so it still matches previously recorded output.
print(round(len(df_concepts)/len(df_concepts_bis), 2), "\t% Compression after filtering")

# Number of retained concepts per chapter ('OER' holds the chapter id).
value_counts = df_concepts_bis['OER'].value_counts()
rec_value_mean = int(value_counts.mean())
print(rec_value_mean, '\tAVG concepts per chapter')

rec_value_median = int(value_counts.median())
print(rec_value_median, '\tMEDIAN concepts per chapter')

rec_value_max = value_counts.max()
print(rec_value_max, '\tMAX concepts per chapter')

rec_value_min = value_counts.min()
print(rec_value_min, '\tMIN concepts per chapter')

# The original wrote to `path`, a name never defined in this notebook
# (NameError on a fresh kernel); `data_path` from the configuration cell is
# used instead. NOTE(review): confirm this is the intended output directory.
df_concepts_bis.to_csv(data_path + 'concepts_bis.csv', sep = '|', index = False)

350080 	NUM of concepts
16133 	NUM of concepts w filter on PR =  0.005
21.7 	% Compression after filtering
15 	AVG concepts per chapter
15 	MEDIAN concepts per chapter
52 	MAX concepts per chapter
1 	MIN concepts per chapter
