In [1]:
import os
import pandas as pd

version = "v01"
main_publisher = 'Stanford'
script_dir = os.path.dirname(os.path.realpath('__file__'))
path = os.path.join(script_dir, '../Data/' + main_publisher + '/')
df_chapters = pd.read_csv(path + 'chapters.csv', sep = '|')
df_chapters.head()

Unnamed: 0,Cid,Eid,Sid,Title,Text,BeginTimestamp,EndTimestamp,Corpus
0,0,0,0,Statistical Learning: 1.1 Opening Remarks,hi i'm trevor hasting and i'm rob tiptoroni hi...,0:00:00,0:18:15,Stanford Online
1,1,1,0,Statistical Learning: 8 Years Later (Second Ed...,well here we are again eight years later i'm t...,0:00:00,0:02:16,Stanford Online
2,2,2,0,Statistical Learning I Introducing Jonathan - ...,I like your new haircut Rob makes you look yea...,0:00:00,0:01:46,Stanford Online
3,3,3,0,Statistical Learning: 1.2 Examples and Framework,okay now we're going to talk about the supervi...,0:00:00,0:12:09,Stanford Online
4,4,4,0,Statistical Learning: 2.1 Introduction to Regr...,okay we're going to talk about statistical lea...,0:00:00,0:11:39,Stanford Online


In [2]:
len(df_chapters)

1010

In [3]:
df_chapters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Cid             1010 non-null   int64 
 1   Eid             1010 non-null   int64 
 2   Sid             1010 non-null   int64 
 3   Title           1010 non-null   object
 4   Text            1010 non-null   object
 5   BeginTimestamp  1010 non-null   object
 6   EndTimestamp    1010 non-null   object
 7   Corpus          1010 non-null   object
dtypes: int64(3), object(5)
memory usage: 63.2+ KB


In [4]:
from concepts import text2RDF

df_chapters.dropna().apply(lambda x : text2RDF(x['Text'], "../Output/Graphs/"+ version + "/"+ main_publisher + "/" + str(x['Cid'])), axis = 1)

Wikification for : 0
Graph generation for : 0
Saved in  ../Output/Graphs/v01/Stanford/0.ttl
Wikification for : 1
Graph generation for : 1
Saved in  ../Output/Graphs/v01/Stanford/1.ttl
Wikification for : 2
Graph generation for : 2
Saved in  ../Output/Graphs/v01/Stanford/2.ttl
Wikification for : 3
Graph generation for : 3
Saved in  ../Output/Graphs/v01/Stanford/3.ttl
Wikification for : 4
Graph generation for : 4
Saved in  ../Output/Graphs/v01/Stanford/4.ttl
Wikification for : 5
Graph generation for : 5
Saved in  ../Output/Graphs/v01/Stanford/5.ttl
Wikification for : 6
Graph generation for : 6
Saved in  ../Output/Graphs/v01/Stanford/6.ttl
Wikification for : 7
Graph generation for : 7
Saved in  ../Output/Graphs/v01/Stanford/7.ttl
Wikification for : 8
Graph generation for : 8
Saved in  ../Output/Graphs/v01/Stanford/8.ttl
Wikification for : 9
Graph generation for : 9
Saved in  ../Output/Graphs/v01/Stanford/9.ttl
Wikification for : 10
Graph generation for : 10
Saved in  ../Output/Graphs/v01/S

0       ((http://example.org/Chapter, http://purl.org/...
1       ((http://en.wikipedia.org/wiki/Baronet, https:...
2       ((http://en.wikipedia.org/wiki/Thorn_(letter),...
3       ((http://en.wikipedia.org/wiki/Public_service,...
4       ((http://en.wikipedia.org/wiki/Mean_squared_pr...
                              ...                        
1005    ((http://en.wikipedia.org/wiki/Quantity, https...
1006    ((http://example.org/Chapter, http://purl.org/...
1007    ((http://en.wikipedia.org/wiki/Multiplayer_vid...
1008    ((http://en.wikipedia.org/wiki/Multiplayer_vid...
1009    ((http://en.wikipedia.org/wiki/Multiplayer_vid...
Length: 1010, dtype: object

In [5]:
from rdflib import Graph

chapters = df_chapters['Cid'].values
df_concepts = pd.DataFrame()
missing = []
for c in chapters:
    concepts = []
    path_graph = f'../Output/Graphs/{version}/{main_publisher}/{c}.ttl'
    g = Graph()
    try :
        g.parse(path_graph, format='turtle')
    except (FileNotFoundError, IOError):
        print(f'Chapter {c} file not found !')
        missing.append(c)
        continue
    concepts_query = """
            SELECT ?concept ?pr WHERE
            {
                ?ER dct:subject ?concept.
                ?concept ns1:pageRank ?pr.
            }
    """
    result = g.query(concepts_query)
    for row in result:
        concept_dict = {
            'OER' : c,
            'Concept' : str(row.concept),
            'PR' : float(str(row.pr))
        }
        concepts.append(concept_dict)
    df_concepts = pd.concat([df_concepts, pd.DataFrame(concepts)], ignore_index = True)
print(df_concepts.shape)
df_concepts.to_csv(path + 'concepts.csv', sep = '|', index = False)

(350080, 3)


In [6]:
pr_filter = 0.005
df_concepts_bis = df_concepts[df_concepts['PR'] > pr_filter]

print(len(df_concepts), '\tNUM of concepts')
print(len(df_concepts_bis), '\tNUM of concepts w filter on PR = ',pr_filter)
print(round(len(df_concepts)/len(df_concepts_bis), 2), "\t% Compression after filtering")

value_counts = df_concepts_bis['OER'].value_counts()
rec_value_mean = int(value_counts.mean())
print(rec_value_mean, '\tAVG concepts per chapter')

rec_value_median = int(value_counts.median())
print(rec_value_median, '\tMEDIAN concepts per chapter')

rec_value_max = value_counts.max()
print(rec_value_max, '\tMAX concepts per chapter')

rec_value_min = value_counts.min()
print(rec_value_min, '\tMIN concepts per chapter')

df_concepts_bis.to_csv(path + 'concepts_bis.csv', sep = '|', index = False)

350080 	NUM of concepts
16133 	NUM of concepts w filter on PR =  0.005
21.7 	% Compression after filtering
15 	AVG concepts per chapter
15 	MEDIAN concepts per chapter
52 	MAX concepts per chapter
1 	MIN concepts per chapter
