In [2]:
# Purpose: This script recursively goes through book folders created by Kaniyam Foundation to extract text files and metadata
# Date: Sept 2020
# License: GPL-3.0 

In [None]:
!pip install -U -q PyDrive
import os

import numpy as np
import pandas as pd
import yaml
from google.colab import auth
from oauth2client.client import GoogleCredentials
# The PyDrive distribution installs the `pydrive` package; GoogleAuth lives in
# pydrive.auth, next to pydrive.drive which is imported below.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

In [None]:
# 1. Authenticate and create the PyDrive client.
# `auth` is imported from google.colab, so this cell presumably only works inside Colab.
auth.authenticate_user()
# Attach the application-default credentials to the GoogleAuth object before building the client.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the module-level client that get_metadata() and ListFolder() use for every Drive call.
drive = GoogleDrive(gauth)

In [None]:
# Scan one folder: download its text file and its YAML metadata file, return the metadata
def get_metadata(folder_id, folder_name):
  """Download the text and metadata files of one Drive folder and return its metadata.

  Every non-trashed child of `folder_id` is inspected. Files ending in `.txt`
  are downloaded into the current working directory; files ending in `.yaml`
  or `.yml` are downloaded and parsed as the folder's metadata.

  Args:
    folder_id: Google Drive ID of the folder to scan.
    folder_name: Title of that folder; stored under the 'book_id' key when a
      metadata file is found.

  Returns:
    dict: The parsed YAML mapping (empty if no usable metadata file exists),
    with 'book_id' added when a metadata file was found and 'file_name' always
    present. 'file_name' is the title of the last text file seen, or "" if the
    folder holds none.
  """
  metadata_info = {}
  file_name = ""
  file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % folder_id}).GetList()

  for f in file_list:
    title = f['title']

    # Take the text after the LAST dot: "book.v2.txt" is a txt file, while
    # "notes.txt.bak" is not. Compare case-insensitively so ".TXT" also matches.
    _, dot, suffix = title.rpartition('.')
    file_ext = suffix.lower() if dot else "n/a"

    if file_ext == 'txt':  # the book's text file
      print(title)
      downloaded = drive.CreateFile({'id': f['id']})
      downloaded.GetContentFile(title)
      file_name = title

    elif file_ext in ('yaml', 'yml'):  # the metadata file
      downloaded = drive.CreateFile({'id': f['id']})
      downloaded.GetContentFile(title)

      # Explicit encoding so non-ASCII titles parse the same on every platform.
      with open(title, encoding='utf-8') as metadata_file:
        loaded = yaml.safe_load(metadata_file)

      # safe_load returns None for an empty file and may return a list or a
      # scalar; only a mapping can carry the extra keys added below.
      metadata_info = loaded if isinstance(loaded, dict) else {}
      metadata_info['book_id'] = folder_name

  metadata_info['file_name'] = file_name
  return metadata_info


In [None]:
# Walk the Google Drive folder tree recursively and collect metadata for every sub-folder
def ListFolder(parent, metadata_list):
  """Recursively visit every sub-folder below `parent` and collect its metadata.

  For each non-trashed child of `parent` that is a Drive folder, the result of
  get_metadata() is appended to `metadata_list`, then the walk descends into
  that folder. Files that sit directly in `parent` are not examined by this
  call; only its sub-folders are passed to get_metadata().

  Args:
    parent: Google Drive ID of the folder whose sub-folders are visited.
    metadata_list: List that is extended in place with one metadata dict per
      visited folder.

  Returns:
    None. Results are delivered through `metadata_list`.
  """
  file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % parent}).GetList()
  for f in file_list:
    if f['mimeType'] != 'application/vnd.google-apps.folder':
      continue  # plain file: nothing to do at this level

    metadata_list.append(get_metadata(f['id'], f['title']))
    # Descend; the recursive call has no return value, it only appends to the list.
    ListFolder(f['id'], metadata_list)

In [None]:
# main
# Start from an empty list so that re-running this cell does not append duplicate rows.
metadata_list = []
# NOTE(review): 'folder_Id' looks like a placeholder - replace it with the ID of the root Drive folder to scan.
ListFolder('folder_Id', metadata_list)

In [None]:
# load the list as dataframe
# Each dict collected in metadata_list becomes one row; the dict keys become the columns.
df = pd.DataFrame(metadata_list) 
# Preview the first two rows as the cell output.
df.head(2)

In [None]:
# Export the metadata
# Writes every collected row, before any filtering, to df_all.csv without the index column.
df.to_csv("df_all.csv", index=False)

In [None]:
# Keep only the rows that have at least one title (Tamil or English); drop the rest
columns = ['book_title', 'book_title_in_english']
# reindex() supplies an all-NaN column for any title column that no metadata
# file provided, so a missing column counts as "no title" instead of raising
# the KeyError that dropna(subset=...) would.
has_title = df.reindex(columns=columns).notna().any(axis=1)
df1 = df[has_title]
df1.to_csv("df1.csv", index=False)

In [None]:
# Rows whose 'file_name' is the empty string had no text file; keep only the others
has_text_file = df1['file_name'].ne("")
df2 = df1[has_text_file]
df2.to_csv("df2.csv", index=False)

In [None]:
# Export the text files for download
# Bundles every .txt file directly under /content into /content/file.zip.
# NOTE(review): assumes the notebook's working directory is /content, where GetContentFile saved the files - confirm.
!zip -r /content/file.zip /content/*.txt