In [2]:
import json
import pandas as pd

In [99]:
# Process OCR results
with open("ocr_results_clean.json", "r") as f:
    ocr_results = json.load(f)

print(ocr_results)

# Make a dataframe with the results
df = pd.DataFrame(ocr_results.items(), columns=["image", "text"])

video_urls = {
    'cours 3': 'https://www.youtube.com/watch?v=WEB4N4xbAhI',
    'cours 5': 'https://www.youtube.com/watch?v=-tdC1Y4Wo2Y',
    'cours 7': 'https://www.youtube.com/watch?v=t90r_MMmHdQ',
    'cours 9': 'https://www.youtube.com/watch?v=ufLx_7YafmM',
    'cours 11': 'https://www.youtube.com/watch?v=-yqPZQYYwiI'
}

# Add a column with the video url
df['video_url'] = df['image'].apply(lambda x: video_urls[" ".join(x.split("_")[0:2])])

# Add time of the video in url (the number at the end of the url) at the end of the url (youtube url)
df['video_time'] = df['image'].apply(lambda x: int(x.split("_")[-1].split(".")[0])*30)

# Add a column with the video url with the time
df['video_url_with_time'] = df['video_url'] + "&t=" + df['video_time'].astype(str)

# Add the name of the course to the dataframe
df['Course number'] = df['image'].apply(lambda x: (x.split("_")[1])).astype(int)

# Add sort column
df = df.sort_values(by=['Course number', 'video_time'], ascending=[True, True])

# Select the columns we want
df = df[['Course number', 'text', 'video_url_with_time']]

df = df.rename(columns={"text": "Citation", "video_url_with_time": "URL"})

df

{'cours_7_screenshot_77.jpg': 'L. Marec, L. Quetel, et al. Insitu optical fibre sensors for temperature and salinity monitoring, OCS 2005', 'cours_3_screenshot_66.jpg': 'K.J. Nelson et al. et al. Journal of the electrochemical society, 165(3) 2018', 'cours_3_screenshot_72.jpg': 'G. Yan et al. Journal of the electrochemical society, 165(2) 2018', 'cours_9_screenshot_27.jpg': 'Willets et al., Annu. Rev. Phys. Chem., 2007, 58, 267-97.', 'cours_11_screenshot_98.jpg': 'L Gold et al.. J. Power Sources. 343 536-544 (2017).', 'cours_9_screenshot_35.jpg': 'J. N. Anker et al., Nature Materials, 2008. 7.', 'cours_7_screenshot_70.jpg': 'T. Guao, H.W Tam et al Optics Expres Vol 17, 2009', 'cours_7_screenshot_72.jpg': 'A. Nedialkov et al. Batteries march 2019', 'cours_7_screenshot_99.jpg': 'S Maquis, G. Laffont, P. Ferdinand, et al. Optics Express, 2008', 'cours_9_screenshot_130.jpg': 'S.A. Pradanawati et al., Jouml of power source (2016)', 'cours_3_screenshot_88.jpg': 'M. Dollé et al. Journal of th

Unnamed: 0,Course number,Citation,URL
15,3,"D. Guyomard and J.M. Tarascon, J. Electrochem....",https://www.youtube.com/watch?v=WEB4N4xbAhI&t=210
20,3,"A. Blyr, C. Sigala et al. Jr. Electrochemical ...",https://www.youtube.com/watch?v=WEB4N4xbAhI&t=360
57,3,"P. Poizot et al. Nature, 407, 496-499, 2000",https://www.youtube.com/watch?v=WEB4N4xbAhI&t=690
48,3,X. Ma... et al. Journal of the electrochemical...,https://www.youtube.com/watch?v=WEB4N4xbAhI&t=810
47,3,K.J. Nelson et al. Journal of the electrochemi...,https://www.youtube.com/watch?v=WEB4N4xbAhI&t=990
...,...,...,...
68,11,"Z. Deng, et al. Joule 4, 1-13 (2020)",https://www.youtube.com/watch?v=-yqPZQYYwiI&t=...
99,11,A.G. Hsieh et al. Energy Environ. Sci; (2015) ...,https://www.youtube.com/watch?v=-yqPZQYYwiI&t=...
73,11,"AG Hsieh et al., Energy Environ. Sci. 8, 1569-...",https://www.youtube.com/watch?v=-yqPZQYYwiI&t=...
4,11,L Gold et al.. J. Power Sources. 343 536-544 (...,https://www.youtube.com/watch?v=-yqPZQYYwiI&t=...


In [102]:
# Save the dataframe to a excel file
# %pip install openpyxl

df.to_excel("ocr_results.xlsx", index=False)
