In [8]:
import numpy as np
import pandas as pd
import os
import json

# Scraping the transcripts

In [47]:
## Peek at the JSON file names found in each raw-data folder

# Folder holding the "good" transcripts
path_to_good_transcripts = '../data/raw/transcripts/'
transcript_files = [
    entry
    for entry in os.listdir(path_to_good_transcripts)
    if entry.endswith('.json')
]
print("Good transcripts: ", transcript_files[:5])
print(f"Total good transcripts: {len(transcript_files)}")

# Folder holding the "bad" transcripts; `transcript_files` is rebound here,
# so after this cell it refers to the bad-transcript listing.
path_to_bad_transcripts = '../data/raw/wrong_langs/'
transcript_files = [
    entry
    for entry in os.listdir(path_to_bad_transcripts)
    if entry.endswith('.json')
]
print("Bad transcripts: ", transcript_files[:5])
print(f"Total bad transcripts: {len(transcript_files)}")

Good transcripts:  ['LEL175JU154.json', 'LEL295JU035.json', 'STP560JG118.json', 'OFC105SU068.json', 'COL385MU054.json']
Total good transcripts: 152
Bad transcripts:  ['se_2.mp4.json', 'en_6.mp4.json', 'de_1.mp4.json', 'cn_1.mp4.json', 'nl_2.mp4.json']
Total bad transcripts: 25


In [41]:
# Function for extracting all transcripts from a directory
def read_jsons(data_path):
    """Load every ``.json`` transcript in a directory into a DataFrame.

    Each matching file is parsed as JSON and the text stored at
    ``data['results']['transcripts'][0]['transcript']`` is extracted.

    Parameters
    ----------
    data_path : str or os.PathLike
        Directory to scan (not recursive). A trailing path separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        One row per ``.json`` file, indexed by ``file_name``, with a single
        ``content`` column holding the transcript text. Rows follow the
        order in which ``os.listdir`` returned the files.

    Raises
    ------
    OSError
        If the directory cannot be listed or a file cannot be opened.
    json.JSONDecodeError
        If a file does not contain valid JSON.
    KeyError, IndexError
        If a file lacks the ``results -> transcripts[0] -> transcript``
        structure.
    """
    transcript_files = [name for name in os.listdir(data_path) if name.endswith('.json')]

    contents = []
    for file_name in transcript_files:
        # Join path components properly so the directory may be given with
        # or without a trailing separator (plain `+` silently built a wrong
        # path in the latter case).
        file_path = os.path.join(data_path, file_name)
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # default encoding, which differs between operating systems.
        with open(file_path, encoding='utf-8') as file_content:
            data = json.load(file_content)
        contents.append(data['results']['transcripts'][0]['transcript'])

    df = pd.DataFrame({'file_name': transcript_files, 'content': contents})
    # Return the re-indexed frame instead of mutating with inplace=True.
    return df.set_index('file_name')

In [42]:
# Read the "good" transcripts into a DataFrame and preview the first rows.
path_to_good_transcripts = '../data/raw/transcripts/'
df_good = read_jsons(data_path=path_to_good_transcripts)
df_good.head(5)

Unnamed: 0_level_0,content
file_name,Unnamed: 1_level_1
LEL175JU154.json,(xx) could i have your attention please? Dr Ha...
LEL295JU035.json,let me ask a question. no yeah the font's too ...
STP560JG118.json,computer to work. um first we'd like to start ...
OFC105SU068.json,right. here uh the clipboard. that i ga- okay ...
COL385MU054.json,"this lecture uh, we are very pleased that the ..."


In [43]:
## Read the "bad" transcripts into a DataFrame and preview the first rows
path_to_bad_transcripts = '../data/raw/wrong_langs/'
df_bad = read_jsons(data_path=path_to_bad_transcripts)
df_bad.head(5)

Unnamed: 0_level_0,content
file_name,Unnamed: 1_level_1
se_2.mp4.json,And Hey. Hey. Who did I have a name? For the O...
en_6.mp4.json,Tu English speaking Contress. Is the post you ...
de_1.mp4.json,Das is the hour. A cabo shown Al Kochen and De...
cn_1.mp4.json,Welcome to another Mandarin Corner video. For ...
nl_2.mp4.json,And elastic stylist. Lads were beginner. Lagma...


# Joining the two dataframes together

In [50]:
# Tag each source with a binary `meaningful` flag: 1 for the good
# transcripts, 0 for the bad ones. These assignments add the column to
# df_good / df_bad themselves.
df_bad['meaningful'] = 0
df_good['meaningful'] = 1

# Stack the rows (good first); each frame's file_name index is kept.
df = pd.concat(objs=[df_good, df_bad], axis=0)
df.head(5)

Unnamed: 0_level_0,content,meaningful
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LEL175JU154.json,(xx) could i have your attention please? Dr Ha...,1
LEL295JU035.json,let me ask a question. no yeah the font's too ...,1
STP560JG118.json,computer to work. um first we'd like to start ...,1
OFC105SU068.json,right. here uh the clipboard. that i ga- okay ...,1
COL385MU054.json,"this lecture uh, we are very pleased that the ...",1


In [52]:
# Persist the combined, labelled transcripts as a tab-separated file.
processed_data_path = '../data/processed/'
file_name = 'all_transcripts.csv'
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (this is a no-op when it is already there).
os.makedirs(processed_data_path, exist_ok=True)
# NOTE: the file keeps its '.csv' name but is tab-delimited; read it back
# with sep='\t'.
df.to_csv(os.path.join(processed_data_path, file_name), sep='\t')