In [1]:
import sys
import os
import json

import pandas as pd
import numpy as np

from collections import defaultdict

In [2]:
root_dir = 'C:/Users/delst/OneDrive/Desktop/Code/Workspace/JSON_to_PGSQL'
sys.path.append(root_dir)

from A_Main.Configurations.setup_env import setup_environment
config = setup_environment(root_dir)

In [3]:
raw_data_path = config.raw_data

In [4]:
sdo_parq = config.sdo_parq
sdo_feather = config.sdo_feather

---

In [5]:
with open(raw_data_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

---

# **Select Properties**

In [6]:
data = raw_data.get('data', {})
posts = data.get('children', [])
post_data = posts[2].get('data', {})
media_data = post_data.get('media', {})
reddit_video_data = media_data.get('reddit_video', {})
reddit_video_data

{'bitrate_kbps': 1200,
 'fallback_url': 'https://v.redd.it/evmp79fbr56b1/DASH_480.mp4?source=fallback',
 'has_audio': True,
 'height': 480,
 'width': 854,
 'scrubber_media_url': 'https://v.redd.it/evmp79fbr56b1/DASH_96.mp4',
 'dash_url': 'https://v.redd.it/evmp79fbr56b1/DASHPlaylist.mpd?a=1689435086%2CZGI5YzZlMTBkMWViZmZjM2E4NjZhODJlMTM3ZjVjMWNhYWI3NjM5YTcyZmE5ZjA2Mjk5OGUyZjQ5OWY3MDdiMQ%3D%3D&amp;v=1&amp;f=sd',
 'duration': 68,
 'hls_url': 'https://v.redd.it/evmp79fbr56b1/HLSPlaylist.m3u8?a=1689435086%2CMzA1YjFkMDUwYTIwZDkzZTZjMWVjMGZhYTJmNmI4N2Y0ZWJkYjQ1MjU2YzgwZDJiNmI4MmU0OWRhYTU4MGIyMQ%3D%3D&amp;v=1&amp;f=sd',
 'is_gif': False,
 'transcoding_status': 'completed'}

---

# **Post Data**

In [7]:
post_data_keys = set()

for post in posts:
    post_data = post.get('data', {})
    
    for key in post_data.keys():
        post_data_keys.add(key)
post_data_keys

{'all_awardings',
 'allow_live_comments',
 'approved_at_utc',
 'approved_by',
 'archived',
 'author',
 'author_flair_background_color',
 'author_flair_css_class',
 'author_flair_richtext',
 'author_flair_template_id',
 'author_flair_text',
 'author_flair_text_color',
 'author_flair_type',
 'author_fullname',
 'author_is_blocked',
 'author_patreon_flair',
 'author_premium',
 'awarders',
 'banned_at_utc',
 'banned_by',
 'can_gild',
 'can_mod_post',
 'category',
 'clicked',
 'content_categories',
 'contest_mode',
 'created',
 'created_utc',
 'discussion_type',
 'distinguished',
 'domain',
 'downs',
 'edited',
 'gallery_data',
 'gilded',
 'gildings',
 'hidden',
 'hide_score',
 'id',
 'is_created_from_ads_ui',
 'is_crosspostable',
 'is_gallery',
 'is_meta',
 'is_original_content',
 'is_reddit_media_domain',
 'is_robot_indexable',
 'is_self',
 'is_video',
 'likes',
 'link_flair_background_color',
 'link_flair_css_class',
 'link_flair_richtext',
 'link_flair_template_id',
 'link_flair_text',


In [8]:
post_data_store = []

for post in posts:
    post_data = post.get('data', {})
    data_list = []
    
    for key in post_data_keys:
        data_list.append(post_data.get(key, np.nan))
    
    post_data_store.append(data_list)

In [9]:
df_post_data = pd.DataFrame(post_data_store, columns=list(post_data_keys))
df_post_data.head()

Unnamed: 0,allow_live_comments,author_patreon_flair,secure_media,upvote_ratio,author_flair_css_class,link_flair_template_id,is_meta,contest_mode,banned_by,secure_media_embed,...,quarantine,thumbnail_width,gildings,send_replies,link_flair_text,can_gild,author_flair_template_id,removed_by,num_comments,approved_by
0,True,False,,0.96,,2491e734-cd17-11ed-9c62-06fbff1c98dd,False,False,,{},...,False,140.0,{},True,Clubhouse,True,,,1221,
1,False,False,,0.96,,62cc9266-f2b0-11ec-9096-baad01c86c30,False,False,,{},...,False,140.0,{},True,❔ Other,True,,,477,
2,True,False,"{'reddit_video': {'bitrate_kbps': 1200, 'fallb...",0.95,,5e058b84-4acb-11ed-ae2b-6a59e86b4614,False,False,,{},...,False,140.0,{},True,Video/Gif,True,,,1704,
3,True,False,,0.96,,,False,False,,{},...,False,140.0,{},True,,True,,,611,
4,False,False,"{'reddit_video': {'bitrate_kbps': 2400, 'fallb...",0.91,,b16ae4c8-c28e-11ed-9954-2e95b348321f,False,False,,{},...,False,140.0,{},True,Science,True,,,848,


---

# **Video Data**

In [10]:
reddit_video_keys = set()
for post in posts:
    post_data = post.get('data', {})
    media_data = post_data.get('media', {})
    
    if media_data != None:
        reddit_video = media_data.get('reddit_video', {})
        reddit_video_keys.update(reddit_video.keys())
reddit_video_keys = sorted(list(reddit_video_keys))

In [11]:
video_data_store = []

for post in posts:
    post_data = post.get('data', {})
    post_id = post_data.get('id', None)
    
    media_data = post_data.get('media', {})
    
    if media_data != None:
        reddit_video = media_data.get('reddit_video', {})
        
        # Executes if dict is not empty or is not false
        if reddit_video:
            video_data = [post_id] + [reddit_video.get(key) for key in reddit_video_keys]
            video_data_store.append(video_data)
        
        # Execute if a dict {}
        # if isinstance(reddit_video, dict):
        #     single_data = [post_id] + [reddit_video.get(key) for key in reddit_video_keys]
        #     video_data_store.append(single_data)

In [12]:
df_video_data = pd.DataFrame(video_data_store, columns=['post_id'] + reddit_video_keys)
df_video_data.head()

Unnamed: 0,post_id,bitrate_kbps,dash_url,duration,fallback_url,has_audio,height,hls_url,is_gif,scrubber_media_url,transcoding_status,width
0,149yrlt,1200,https://v.redd.it/evmp79fbr56b1/DASHPlaylist.m...,68,https://v.redd.it/evmp79fbr56b1/DASH_480.mp4?s...,True,480,https://v.redd.it/evmp79fbr56b1/HLSPlaylist.m3...,False,https://v.redd.it/evmp79fbr56b1/DASH_96.mp4,completed,854
1,149z3aw,2400,https://v.redd.it/9az2zw4ou56b1/DASHPlaylist.m...,58,https://v.redd.it/9az2zw4ou56b1/DASH_720.mp4?s...,,720,https://v.redd.it/9az2zw4ou56b1/HLSPlaylist.m3...,False,https://v.redd.it/9az2zw4ou56b1/DASH_96.mp4,completed,406
2,149wwzb,2400,https://v.redd.it/vstcrxxm756b1/DASHPlaylist.m...,178,https://v.redd.it/vstcrxxm756b1/DASH_720.mp4?s...,,720,https://v.redd.it/vstcrxxm756b1/HLSPlaylist.m3...,False,https://v.redd.it/vstcrxxm756b1/DASH_96.mp4,completed,406
3,14a36xk,2400,https://v.redd.it/regbdl8au66b1/DASHPlaylist.m...,43,https://v.redd.it/regbdl8au66b1/DASH_720.mp4?s...,False,720,https://v.redd.it/regbdl8au66b1/HLSPlaylist.m3...,True,https://v.redd.it/regbdl8au66b1/DASH_96.mp4,completed,1280
4,149zpnd,1200,https://v.redd.it/0zpvfvyk066b1/DASHPlaylist.m...,354,https://v.redd.it/0zpvfvyk066b1/DASH_480.mp4?s...,True,394,https://v.redd.it/0zpvfvyk066b1/HLSPlaylist.m3...,False,https://v.redd.it/0zpvfvyk066b1/DASH_96.mp4,completed,854


---

# **Image Data**

In [13]:
from collections import OrderedDict

In [14]:
image_data_keys = set()
image_source_keys = set()
image_resolutions_keys = set()

for post in posts:
    post_data = post.get('data', {})
    preview_data = post_data.get('preview', {})
    
    if preview_data != None:
        image_data = preview_data.get('images', [])

        for image in image_data:
            
            data_keys = image.keys()
            source_keys = image.get('source', {}).keys()
            
            image_data_keys.update(data_keys)
            image_source_keys.update(source_keys)
            
            for i in image.get('resolutions', []):
                res_keys = i.keys()
                image_resolutions_keys.update(res_keys)
    
image_data_keys = image_data_keys - {'resolutions', 'source'}

image_data_keys = sorted(list(image_data_keys))
image_source_keys = sorted(list(image_source_keys))
image_resolutions_keys = sorted(list(image_resolutions_keys))

In [16]:
image_data_store = sorted([])
image_source_store = sorted([])
image_resolution_store = sorted([])

for post in posts:
    post_data = post.get('data', {})
    post_id = post_data.get('id', None)
    
    preview_data = post_data.get('preview', {})
    
    if preview_data is not None:
        images = preview_data.get('images', [])
        
        for image in images:
            image_id = image.get('id', None)
            
            # Collect image data
            image_data = [post_id] + [image.get(key) for key in image_data_keys]
            image_data_store.append(image_data)
            
            # Collect source data
            source = image.get('source', {})
            if source:
                source_data = [post_id, image_id] + [source.get(key) for key in image_source_keys]
                image_source_store.append(source_data)
            
            # Collect resolution data
            # resolution = image.get('resolutions', [])
            # if resolution:
            #     resolution_data = [post_id, image_id] + [i.get(key) for key in image_resolutions_keys for i in resolution]
            #     image_resolution_store.append(resolution_data)
                
            # Collect resolution data for each image
            for item in image.get('resolutions', []):
                res_data = [
                    post_id,
                    image_id,
                    item.get('height'),
                    item.get('url'),
                    item.get('width'),
                ]
                image_resolution_store.append(res_data)

In [17]:
df_image_data = pd.DataFrame(image_data_store, columns=['post_id'] + image_data_keys)
df_image_source = pd.DataFrame(image_source_store, columns=['post_id','id'] + image_source_keys)
df_image_resolution = pd.DataFrame(image_resolution_store, columns=['post_id','id'] + image_resolutions_keys)

In [18]:
display(df_image_data.head())
display(df_image_source.head())
display(df_image_resolution)

Unnamed: 0,post_id,id,variants
0,14a0ayt,4VWgAl4VJ8GxVz6JYR6o2aXvzunbKRAKU0VefcwCayg,{}
1,14a1hq5,gAk-dl1pjVUL88HkOOUMiT7PqP6wZncsweXFa-i6fUI,{}
2,149yrlt,NndmejhuYmJyNTZiMb0UmIeULhm5TqEhpHt9vw3gmBGF49...,{}
3,14a064k,YQCgKRT3PwoUJd7UixJDP4_jcfVx1QEnV0W2TQH8uPo,{}
4,149z3aw,_UjbEUYm4O61zhRgxj2pPo4GfL-JF2P5N7mdZq9pNTE,{}


Unnamed: 0,post_id,id,height,url,width
0,14a0ayt,4VWgAl4VJ8GxVz6JYR6o2aXvzunbKRAKU0VefcwCayg,1628,https://preview.redd.it/urxp0hrc666b1.png?auto...,1025
1,14a1hq5,gAk-dl1pjVUL88HkOOUMiT7PqP6wZncsweXFa-i6fUI,1306,https://preview.redd.it/u9e31qnjg66b1.jpg?auto...,1046
2,149yrlt,NndmejhuYmJyNTZiMb0UmIeULhm5TqEhpHt9vw3gmBGF49...,540,https://external-preview.redd.it/NndmejhuYmJyN...,960
3,14a064k,YQCgKRT3PwoUJd7UixJDP4_jcfVx1QEnV0W2TQH8uPo,800,https://external-preview.redd.it/46hLWMpB_3Nh4...,1200
4,149z3aw,_UjbEUYm4O61zhRgxj2pPo4GfL-JF2P5N7mdZq9pNTE,1024,https://external-preview.redd.it/WcRhmEDaOB-IJ...,576


Unnamed: 0,post_id,id,height,url,width
0,14a0ayt,4VWgAl4VJ8GxVz6JYR6o2aXvzunbKRAKU0VefcwCayg,171,https://preview.redd.it/urxp0hrc666b1.png?widt...,108
1,14a0ayt,4VWgAl4VJ8GxVz6JYR6o2aXvzunbKRAKU0VefcwCayg,343,https://preview.redd.it/urxp0hrc666b1.png?widt...,216
2,14a0ayt,4VWgAl4VJ8GxVz6JYR6o2aXvzunbKRAKU0VefcwCayg,508,https://preview.redd.it/urxp0hrc666b1.png?widt...,320
3,14a0ayt,4VWgAl4VJ8GxVz6JYR6o2aXvzunbKRAKU0VefcwCayg,1016,https://preview.redd.it/urxp0hrc666b1.png?widt...,640
4,14a0ayt,4VWgAl4VJ8GxVz6JYR6o2aXvzunbKRAKU0VefcwCayg,1524,https://preview.redd.it/urxp0hrc666b1.png?widt...,960
...,...,...,...,...,...
102,149vuvp,dlXFVWswaCF6GooP3Nh-vUvJ3wacL0XCyygdkQ4l_Zo,193,https://preview.redd.it/il6cqa38w46b1.jpg?widt...,216
103,149vuvp,dlXFVWswaCF6GooP3Nh-vUvJ3wacL0XCyygdkQ4l_Zo,287,https://preview.redd.it/il6cqa38w46b1.jpg?widt...,320
104,149vuvp,dlXFVWswaCF6GooP3Nh-vUvJ3wacL0XCyygdkQ4l_Zo,574,https://preview.redd.it/il6cqa38w46b1.jpg?widt...,640
105,149vuvp,dlXFVWswaCF6GooP3Nh-vUvJ3wacL0XCyygdkQ4l_Zo,862,https://preview.redd.it/il6cqa38w46b1.jpg?widt...,960


---

In [19]:
df_dict = {
    'post_data': df_post_data,
    'video_data': df_video_data,
    'image_data': df_image_data,
    'image_source': df_image_source,
    'image_resolution': df_image_resolution
}

# save to feather
for filename, df in df_dict.items():
    df.to_feather(os.path.join(sdo_feather, filename + '.feather'))

In [20]:
df_names = ['post_data', 'video_data', 'image_data', 'image_source', 'image_resolution']
df_dict = {}

for df_name in df_names:
    df_dict[df_name] = pd.read_feather(os.path.join(sdo_feather, df_name + '.feather'))

# Now you can access your dataframes like this:
df_post_data = df_dict['post_data']
df_video_data = df_dict['video_data']
df_image_data = df_dict['image_data']
df_image_source = df_dict['image_source']
df_image_resolution = df_dict['image_resolution']

In [21]:
df_post_data

Unnamed: 0,allow_live_comments,author_patreon_flair,secure_media,upvote_ratio,author_flair_css_class,link_flair_template_id,is_meta,contest_mode,banned_by,secure_media_embed,...,quarantine,thumbnail_width,gildings,send_replies,link_flair_text,can_gild,author_flair_template_id,removed_by,num_comments,approved_by
0,True,False,,0.96,,2491e734-cd17-11ed-9c62-06fbff1c98dd,False,False,,{},...,False,140.0,{},True,Clubhouse,True,,,1221,
1,False,False,,0.96,,62cc9266-f2b0-11ec-9096-baad01c86c30,False,False,,{},...,False,140.0,{},True,❔ Other,True,,,477,
2,True,False,"{'reddit_video': {'bitrate_kbps': 1200, 'dash_...",0.95,,5e058b84-4acb-11ed-ae2b-6a59e86b4614,False,False,,{},...,False,140.0,{},True,Video/Gif,True,,,1704,
3,True,False,,0.96,,,False,False,,{},...,False,140.0,{},True,,True,,,611,
4,False,False,"{'reddit_video': {'bitrate_kbps': 2400, 'dash_...",0.91,,b16ae4c8-c28e-11ed-9954-2e95b348321f,False,False,,{},...,False,140.0,{},True,Science,True,,,848,
5,False,False,,0.95,,,False,False,,{},...,False,140.0,{},True,,True,,,184,
6,True,False,,0.92,,7d4d8376-a816-11e9-a92d-0e6b9fa95170,False,False,,{},...,False,140.0,{},True,Social Media,True,,,1631,
7,True,False,"{'reddit_video': {'bitrate_kbps': 2400, 'dash_...",0.94,,43699c52-6af0-11e9-9e2b-0ee05c7bc6f8,False,False,,{},...,False,140.0,{},False,Humor/Cringe,True,,,1737,
8,True,False,,0.88,,,False,False,,{},...,False,140.0,{},True,,True,,,213,
9,True,False,"{'reddit_video': {'bitrate_kbps': 2400, 'dash_...",0.96,,,False,False,,{},...,False,140.0,{},True,Video,True,,,206,


----