In [17]:
import os
import pandas as pd

version = "v01"
main_publisher = 'MITOCW'
script_dir = os.path.dirname(os.path.realpath('__file__'))
path = os.path.join(script_dir, '../Data/' + main_publisher + '/')

df_series = pd.read_csv(path + 'series.csv', delimiter = ',')
df_episodes = pd.read_csv(path + 'episodes.csv', delimiter = ',')
df_chapters = pd.read_csv(path + 'chapters.csv', delimiter = ',')
chapters = df_chapters['Cid'].values
df_concepts = pd.read_csv(path + 'concepts.csv', delimiter = '|')
df_precedence = pd.DataFrame()
df_concepts.shape

(1032574, 3)

In [7]:
from rdflib import Graph

df_concepts = pd.DataFrame()
missing = []
for c in chapters:
    concepts = []
    path_graph = f'../Output/Graphs/{version}/{main_publisher}/{c}.ttl'
    g = Graph()
    try :
        g.parse(path_graph, format='turtle')
    except (FileNotFoundError, IOError):
        print(f'Chapter {c} file not found !')
        missing.append(c)
        continue
    concepts_query = """
            SELECT ?concept ?pr WHERE
            {
                ?ER dct:subject ?concept.
                ?concept ns1:pageRank ?pr.
            }
    """
    result = g.query(concepts_query)
    for row in result:
        concept_dict = {
            'OER' : c,
            'Concept' : str(row.concept),
            'PR' : float(str(row.pr))
        }
        concepts.append(concept_dict)
    df_concepts = pd.concat([df_concepts, pd.DataFrame(concepts)], ignore_index = True)
print(df_concepts.shape)
df_concepts.to_csv(path + 'concepts.csv', sep = '|', index = False)

Chapter 79 file not found !
Chapter 412 file not found !
Chapter 452 file not found !
Chapter 672 file not found !
Chapter 1022 file not found !
Chapter 1031 file not found !
Chapter 1040 file not found !
Chapter 1046 file not found !
Chapter 1054 file not found !
Chapter 1066 file not found !
Chapter 1075 file not found !
Chapter 1125 file not found !
Chapter 1220 file not found !
Chapter 1227 file not found !
Chapter 2176 file not found !
Chapter 2183 file not found !
Chapter 2198 file not found !
Chapter 2263 file not found !
Chapter 2269 file not found !
Chapter 2271 file not found !
Chapter 2272 file not found !
Chapter 2281 file not found !
Chapter 2289 file not found !
Chapter 2293 file not found !
Chapter 2297 file not found !
Chapter 2306 file not found !
Chapter 2307 file not found !
Chapter 2312 file not found !
Chapter 2322 file not found !
Chapter 2324 file not found !
Chapter 2326 file not found !
Chapter 2332 file not found !
Chapter 2336 file not found !
Chapter 4357 fi

In [16]:
pr_filter = 0.005
df_concepts_bis = df_concepts[df_concepts['PR'] > pr_filter]

print(len(df_concepts), '\tNUM of concepts')
print(len(df_concepts_bis), '\tNUM of concepts w filter on PR = ',pr_filter)
print(round(len(df_concepts)/len(df_concepts_bis), 2), "\t% Compression after filtering")

value_counts = df_concepts_bis['OER'].value_counts()
rec_value_mean = int(value_counts.mean())
print(rec_value_mean, '\tAVG concepts per chapter')

rec_value_median = int(value_counts.median())
print(rec_value_median, '\tMEDIAN concepts per chapter')

rec_value_max = value_counts.max()
print(rec_value_max, '\tMAX concepts per chapter')

rec_value_min = value_counts.min()
print(rec_value_min, '\tMIN concepts per chapter')

df_concepts_bis.to_csv(path + 'concepts_bis.csv', sep = '|', index = False)

1032574 	NUM of concepts
31656 	NUM of concepts w filter on PR =  0.005
32.62 	% Compression after filtering
17 	AVG concepts per chapter
14 	MEDIAN concepts per chapter
54 	MAX concepts per chapter
2 	MIN concepts per chapter


In [18]:
precedences = []
for c in chapters[:-1]:
    if df_chapters[df_chapters['Cid'] == c]['Sid'].values[0] == df_chapters[df_chapters['Cid'] == c + 1]['Sid'].values[0] :
        precedence_dict = {
            'Before' : c,
            'After' : c + 1,
            'Sid' : df_chapters[df_chapters['Cid'] == c]['Sid'].values[0]
        }
        precedences.append(precedence_dict)

df_precedence = pd.DataFrame(precedences)
df_precedence.to_csv(path + 'precedences.csv', sep='|', index = False)
df_precedence.head()

Unnamed: 0,Before,After,Sid
0,0,1,0
1,1,2,0
2,2,3,0
3,3,4,0
4,4,5,0


In [14]:
precedences = []
for c in chapters[:-1]:
    c_next = c + 1
    while c_next != len(chapters) and df_chapters[df_chapters['Cid'] == c]['Sid'].values[0] == df_chapters[df_chapters['Cid'] == c_next]['Sid'].values[0]:
        precedence_dict = {
            'Before' : c,
            'After' : c_next,
            'Sid' : df_chapters[df_chapters['Cid'] == c]['Sid'].values[0]
        }
        precedences.append(precedence_dict)
        c_next += 1
df_precedence_all = pd.DataFrame(precedences)
df_precedence_all.to_csv(path + 'precedences_all.csv', sep='|', index = False)
df_precedence_all.head()

Unnamed: 0,Before,After,Sid
0,0,1,0
1,0,2,0
2,0,3,0
3,0,4,0
4,0,5,0


In [19]:
precedences = []
for c in chapters[:-1]:
    c_next = c + 1
    c_eid = df_chapters[df_chapters['Cid'] == c]['Eid'].values[0]
    c_next_eid = df_chapters[df_chapters['Cid'] == c_next]['Eid'].values[0]
    if c_eid == c_next_eid:
        precedence_dict = {
            'Before' : c,
            'After' : c_next,
            'Eid' : df_chapters[df_chapters['Cid'] == c]['Eid'].values[0]
        }
        precedences.append(precedence_dict)
        c_next += 1
        if(c_next < len(chapters)):
            print(c_next)
            c_next_eid = df_chapters[df_chapters['Cid'] == c_next]['Eid'].values[0]
df_precedence_episodes = pd.DataFrame(precedences)
df_precedence_episodes.to_csv(path + 'precedences_episodes.csv', sep='|', index = False)
df_precedence_episodes.head()

3
4
6
7
9
11
12
15
16
18
19
21
22
24
25
27
28
30
31
33
34
36
37
39
40
42
43
45
46
48
49
51
52
54
55
57
58
60
62
63
65
66
68
69
71
72
74
75
78
150
151
153
154
156
158
159
161
162
164
165
167
168
170
171
173
174
176
177
179
180
182
183
185
186
188
189
191
192
194
195
197
198
200
201
203
204
206
207
209
211
212
214
215
217
219
221
222
224
225
227
228
230
231
233
234
236
237
239
240
241
243
244
246
247
249
250
252
253
255
256
258
259
261
263
266
267
270
271
272
274
275
276
277
279
280
282
283
285
286
288
289
291
292
294
295
297
298
300
301
303
304
306
307
309
310
312
313
315
316
318
319
321
322
324
325
327
328
330
331
333
334
336
337
339
340
342
343
345
346
348
349
351
352
354
355
357
358
360
361
401
403
405
406
408
409
411
412
414
415
417
418
420
421
423
424
426
427
429
430
432
433
435
436
438
439
441
442
444
445
447
448
450
452
453
455
456
457
459
460
463
464
535
536
538
539
546
547
558
559
563
564
566
567
569
570
572
573
575
576
578
579
581
583
584
586
587
589
590
593
595
596
598
600
60

Unnamed: 0,Before,After,Eid
0,1,2,1
1,2,3,1
2,4,5,2
3,5,6,2
4,7,8,3


In [22]:
df_precedence_episodes.shape

(2097, 3)

In [20]:
precedences = []
for c in chapters[:-1]:
    c_next = c + 1
    c_eid = df_chapters[df_chapters['Cid'] == c]['Eid'].values[0]
    c_next_eid = df_chapters[df_chapters['Cid'] == c_next]['Eid'].values[0]
    c_sid = df_chapters[df_chapters['Cid'] == c]['Sid'].values[0]
    c_next_sid = df_chapters[df_chapters['Cid'] == c_next]['Sid'].values[0]
    if c_eid != c_next_eid and c_sid == c_next_sid:
        precedence_dict = {
            'Before' : c,
            'After' : c_next,
            'Sid' : df_chapters[df_chapters['Cid'] == c]['Sid'].values[0]
        }
        precedences.append(precedence_dict)
df_precedence_series = pd.DataFrame(precedences)
df_precedence_series.to_csv(path + 'precedences_series.csv', sep='|', index = False)
df_precedence_series.head()

Unnamed: 0,Before,After,Sid
0,0,1,0
1,3,4,0
2,6,7,0
3,8,9,0
4,11,12,0


In [21]:
df_precedence_series.shape

(984, 3)