# Convert tab files exported from YouTube Data Tools into csv
Execute the whole notebook. 

This converts every comments.tab file in the data_raw/datatool_manual directory into a csv file of the desired shape (same columns as produced by data_scraping.ipynb).

The resulting comment csv files are stored in data_raw/comments (i.e. ../comments relative to the tab files).

After conversion, the tab files are moved to data_raw/datatool_manual/processed.

In [1]:
import pandas as pd
from pathlib import Path
import os
import re

In [2]:
def _read_basic_info(info_path):
    """Return the property -> value mapping stored in a basicinfo.tab file.

    The file is read as two tab-separated columns without a header row
    (property name, value). An empty dict is returned when the file is
    missing, empty or cannot be parsed, so callers can fall back to other
    sources of information.
    """
    try:
        raw = pd.read_csv(info_path, sep='\t', names=[0, 1])
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError covers pandas'
        # EmptyDataError and ParserError as well as decoding problems.
        return {}
    return dict(zip(raw[0], raw[1]))


def _video_id_from_filename(tab_file):
    """Extract the video id from a '<prefix>_<id>_<timestamp>_comments.tab' name.

    Video ids may themselves contain underscores, so the id is taken as
    everything between the first underscore and the trailing
    'YYYY_MM_DD-hh_mm_ss_comments.tab' part. Names that do not follow this
    layout fall back to the second underscore-separated token.
    """
    match = re.match(
        r"^[^_]+_(.+)_\d{4}_\d{2}_\d{2}-\d{2}_\d{2}_\d{2}_comments\.tab$",
        tab_file,
    )
    if match:
        return match.group(1)
    return tab_file.split('_')[1]


def extract_comments_from_data_tool(tab_file):
    """
    Convert one manually exported comments.tab file into the pipeline's csv format.

    tab_file: file name (not a path) of a comments.tab file located in
        data_raw/datatool_manual

    The matching basicinfo.tab file (same name prefix) is used, when it is
    present and readable, to fill the 'video_id' and 'video_published_at'
    columns. Otherwise the video id is derived from the file name and
    'video_published_at' is left empty, so the csv always has the same columns.

    The result is written to
    data_raw/comments/<tab_file without '.tab'>_comments_<row count>.csv
    (the directory is created if necessary) and also returned as a DataFrame.
    """
    comments_suffix = "comments.tab"
    source_dir = Path("data_raw/datatool_manual")
    target_dir = Path("data_raw/comments")

    data = pd.read_csv(source_dir / tab_file, sep='\t', header=0)
    # Reply ids have the form '<threadId>.<replyId>'; top-level comments
    # carry the thread id only.
    data['threadId'] = data['id'].str.split('.').str[0]

    # The basicinfo file shares the prefix of the comments file.
    info_name = tab_file[:-len(comments_suffix)] + "basicinfo.tab"
    info = _read_basic_info(source_dir / info_name)

    video_id = info.get('id')
    if video_id is None or pd.isna(video_id):
        video_id = _video_id_from_filename(tab_file)
    data['video_id'] = video_id
    # Always create the column so every csv shares one schema.
    data['video_published_at'] = info.get('published')

    data.drop(labels=['authorChannelUrl', 'isReplyTo', 'isReplyToName'], axis=1, inplace=True)
    data.rename(
        columns={'isReply': 'is_reply', "publishedAt": "published_at", "authorName": "author_name"},
        inplace=True,
    )

    target_dir.mkdir(parents=True, exist_ok=True)
    csv_name = tab_file[:-4] + '_comments_' + str(len(data)) + '.csv'
    data.to_csv(target_dir / csv_name, index=True)
    return data

In [3]:
#tab_file = "videoinfo_wbR-5mHI6bo_2021_02_12-15_36_11_comments.tab"

In [4]:
#res = extract_comments_from_data_tool(tab_file)
#res.tail()

In [5]:
def process_folder():
    """Convert every comments.tab file in data_raw/datatool_manual and archive it.

    For each file whose name ends with 'comments.tab',
    extract_comments_from_data_tool() is called with the file name. Afterwards
    the comments file and, if present, the matching basicinfo.tab file are
    moved into the 'processed' sub-directory, which is created when it does
    not exist yet.
    """
    comments_suffix = 'comments.tab'
    source_dir = Path("data_raw/datatool_manual").absolute()
    processed_dir = source_dir / 'processed'
    # Create the archive directory up front so a missing folder cannot make
    # the move fail after the csv has already been written.
    processed_dir.mkdir(exist_ok=True)

    # Snapshot the listing first: files are moved inside the loop, and the
    # directory should not change underneath a live iterator.
    comment_files = sorted(
        entry for entry in source_dir.iterdir()
        if entry.is_file() and entry.name.endswith(comments_suffix)
    )

    for comments_path in comment_files:
        extract_comments_from_data_tool(comments_path.name)
        comments_path.rename(processed_dir / comments_path.name)

        info_name = comments_path.name[:-len(comments_suffix)] + "basicinfo.tab"
        try:
            (source_dir / info_name).rename(processed_dir / info_name)
        except FileNotFoundError:
            # The basicinfo file is optional; nothing to move.
            pass

In [6]:
# Run the conversion: process every comments.tab file in data_raw/datatool_manual.
process_folder()