In [12]:
import json
import csv
import pandas
import re
from datetime import datetime, timedelta

In [13]:
# Read in the raw JSON file. The encoding is given explicitly so that any
# non-ASCII text decodes the same way regardless of the platform's default.
json_file = open("data/Takeout/YouTube and YouTube Music/history/watch-history.json", encoding="utf-8")

In [14]:
# Parse the JSON into a list of watch-history records, then release the
# file handle whether or not parsing succeeds.
try:
    data = json.load(json_file)
finally:
    json_file.close()

In [15]:
def remove_ads(data):
    '''
    Filters ads from watched videos.

    An entry counts as an ad when the first element of its 'details' list
    has a 'name' containing "From Google Ads". Entries whose 'details' key is
    missing or empty are kept. The record count is printed before and after
    filtering.

    Args:
        data: The list of watched videos.

    Returns:
        list[dict]: A list of watched videos without ads included.
    '''

    def is_ad(vid):
        # Guard before indexing: 'details' may be absent or an empty list,
        # in which case the entry is kept instead of raising IndexError.
        details = vid.get('details')
        if not details:
            return False
        first = details[0]
        return 'name' in first and "From Google Ads" in first['name']

    print(f"before removing ads: {len(data)}")
    no_ads = [vid for vid in data if not is_ad(vid)]
    print(f"after removing ads: {len(no_ads)}")

    return no_ads


In [16]:
# Drop ad entries; remove_ads prints the record count before and after filtering
data = remove_ads(data)

before removing ads: 45000
after removing ads: 31332


In [17]:
def UTC_convert(date_time_str, start_analysis, end_analysis, start_timeframe_1, end_timeframe_1, start_timeframe_2, end_timeframe_2,
                overseas_offset_hours=8, home_offset_hours=11):
    '''
    Shifts a timestamp to local time by a fixed number of hours. If the timestamp lands inside either of the
    two specified timeframes (bounds inclusive), it is shifted by `overseas_offset_hours` (default +8, Malaysian time).
    Otherwise it is shifted by `home_offset_hours` (default +11, Melbourne time).
    Timestamps outside the analysis period (bounds inclusive) are filtered out.

    All comparisons are made on the timestamp as given, before any shift is applied.

    Args:
        date_time_str: The string representation of the timestamp to update, formatted as "%Y-%m-%dT%H:%M".
        start_analysis: The start of the entire period to analyse (datetime).
        end_analysis: The end of the entire period to analyse (datetime).
        start_timeframe_1: The start of timeframe 1 (datetime).
        end_timeframe_1: The end of timeframe 1 (datetime).
        start_timeframe_2: The start of timeframe 2 (datetime).
        end_timeframe_2: The end of timeframe 2 (datetime).
        overseas_offset_hours: Hours added to timestamps that fall inside timeframe 1 or 2.
        home_offset_hours: Hours added to every other timestamp inside the analysis period.

    Returns:
        str: The string representation of the updated timestamp, formatted as "%Y-%m-%dT%H:%M".
        or
        None: If the timestamp is not within the analysis period.

    Raises:
        ValueError: If `date_time_str` does not match "%Y-%m-%dT%H:%M".
    '''

    timestamp_format = "%Y-%m-%dT%H:%M"

    # Convert the date-time string to a datetime object
    date_time = datetime.strptime(date_time_str, timestamp_format)

    # Timestamps outside the analysis period are dropped
    if not start_analysis <= date_time <= end_analysis:
        return None

    # Pick the offset depending on whether the timestamp falls in an overseas timeframe.
    # NOTE(review): a single fixed home offset ignores daylight saving; Melbourne is UTC+10
    # outside DST -- confirm whether +11 is intended for the whole analysis period.
    is_overseas = (start_timeframe_1 <= date_time <= end_timeframe_1
                   or start_timeframe_2 <= date_time <= end_timeframe_2)
    offset_hours = overseas_offset_hours if is_overseas else home_offset_hours
    updated_date_time = date_time + timedelta(hours=offset_hours)

    # Convert the updated datetime object back to a string
    return updated_date_time.strftime(timestamp_format)

In [18]:
def preprocess_vids(data):
    '''
    Preprocesses watched videos by:
    1. Removing videos that were watched outside of the analysis period, incrementing dates/times to match the
        correct timezones depending on my travel history, and creating two separate fields for "date" and "time"
        to support downstream analysis
    2. Removing "Watched" which is sometimes prepended to video titles (commas and carriage returns are stripped too)
    3. Making a new un-nested field "channel" for the channel name, which is "unknown" if this is unknown

    The input records are left untouched; each returned record is a shallow copy carrying the updated fields.

    Args:
        data: The list of watched videos. Each needs a 'title' and a 'time' string that matches
            "%Y-%m-%dT%H:%M" once its last ':'-separated component is dropped.

    Returns:
        list[dict]: The list of preprocessed videos.
    '''

    # Define timeframes as datetime objects.
    # `start_analysis` and `end_analysis` indicate the desired period to analyse.
    start_analysis = datetime.strptime("2022-12-31T16:00", "%Y-%m-%dT%H:%M")
    end_analysis = datetime.strptime("2024-2-1T11:00", "%Y-%m-%dT%H:%M")
    # `start_timeframe`s and `end_timeframe`s indicate the timeframes when I was overseas.
    start_timeframe_1 = datetime.strptime("2023-06-24T20:30", "%Y-%m-%dT%H:%M")
    end_timeframe_1 = datetime.strptime("2023-06-30T10:50", "%Y-%m-%dT%H:%M")
    start_timeframe_2 = datetime.strptime("2022-12-31T13:00", "%Y-%m-%dT%H:%M")
    end_timeframe_2 = datetime.strptime("2023-02-24T14:20", "%Y-%m-%dT%H:%M")

    preprocessed = []
    for original in data:
        # Work on a shallow copy so the caller's records (including ones that
        # get filtered out below) are not modified.
        vid = dict(original)

        # Drop the final ':'-separated component (the seconds part) so the
        # timestamp matches the minute-resolution format UTC_convert parses.
        minute_timestamp = ':'.join(vid['time'].split(':')[:-1])
        updated_date_time = UTC_convert(minute_timestamp, start_analysis, end_analysis, start_timeframe_1, end_timeframe_1, start_timeframe_2, end_timeframe_2)
        if updated_date_time is None:
            # Outside the analysis period
            continue
        vid['date'], vid['time'] = updated_date_time.split('T')

        # Strip "Watched " only where it is prepended (anchored with ^), so a
        # title that merely contains the word keeps it. Commas and carriage
        # returns are removed wherever they occur.
        vid['title'] = re.sub(r'^Watched\s|,|\r', '', vid['title'])

        # 'subtitles' may be missing or an empty list; guard before indexing.
        subtitles = vid.get('subtitles')
        if subtitles and 'name' in subtitles[0]:
            vid['channel'] = re.sub(r',|\r', '', subtitles[0]['name'])
        else:
            # Channel name is unknown
            vid['channel'] = 'unknown'

        preprocessed.append(vid)

    return preprocessed
        
        

In [19]:
# Filter to the analysis period, localise timestamps and add the 'date' and 'channel' fields.
# NOTE: this cell is not safe to re-run on its own. After one pass each record's 'time' is
# "HH:MM", which no longer parses as a full timestamp; re-run from the load cell instead.
data = preprocess_vids(data)

In [20]:
# Check that data was preprocessed as expected by inspecting the first record:
# it should carry separate 'date' and 'time' fields plus a flat 'channel' field
data[0]

{'header': 'YouTube',
 'title': 'A 3am Classical Playlist',
 'titleUrl': 'https://www.youtube.com/watch?v=WYmwP7F8-X8',
 'subtitles': [{'name': 'Classical Radio',
   'url': 'https://www.youtube.com/channel/UC3DFPlJRkK9VWckgfBCtwJw'}],
 'time': '03:57',
 'products': ['YouTube'],
 'activityControls': ['YouTube watch history'],
 'date': '2024-01-31',
 'channel': 'Classical Radio'}

In [21]:
def json_to_csv(data, fields, path='data/watch_history.csv'):
    '''
    Saves a list of JSON objects to a CSV file, only writing the specified fields within each item.
    The first row is the header (the field names). A field missing from an item is written as an empty string.
    Any existing file at `path` is overwritten.

    Args:
        data: The list of watched videos.
        fields: The list of fields to write
        path: Where to write the CSV file. Defaults to 'data/watch_history.csv'.
    '''

    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate newlines on write end up with blank rows.
    with open(path, 'w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)

        # Write the header
        csv_writer.writerow(fields)

        # One row per JSON object, restricted to the requested fields
        csv_writer.writerows(
            [obj.get(field, '') for field in fields] for obj in data
        )

In [22]:
# Write the title, date, time and channel of every record to data/watch_history.csv
json_to_csv(data, ["title", "date", "time", "channel"])