In [4]:

import json
import logging
import zipfile
from datetime import date, datetime, timezone

import pandas as pd

#from ddpinspect import instagram


In [5]:

def extractJsonContentFromZipFolder(zip_file_path, pattern):
    """Return the parsed content of the first matching JSON file inside a zip archive.

    A member matches when its name ends with '.json' and contains ``pattern``
    anywhere in the name (including the directory part of the member path).
    Members are checked in the order reported by the archive and the search
    stops at the first match.

    Args:
        zip_file_path: Path to the zip archive.
        pattern: Substring that the member name has to contain.

    Returns:
        The decoded JSON content of the first matching member, or None when the
        archive has no matching member (this includes an empty archive). A
        message is printed in the not-found case.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            if file_name.endswith('.json') and pattern in file_name:
                # json.load accepts the binary file object returned by ZipFile.open
                with zip_ref.open(file_name) as json_file:
                    return json.load(json_file)

    # Reached only when no member matched; handling it after the loop also
    # covers archives without any members.
    print(f"File {pattern}.json is not contained")
    return None



In [7]:
def import_json_toDict(jsonfile):
    """Load a JSON file from disk and return its decoded content.

    Args:
        jsonfile: Path of the JSON file to read.

    Returns:
        The decoded JSON document (a dict when the top-level JSON value is an object).
    """
    # Binary mode lets the json module detect the encoding itself (UTF-8 with or
    # without BOM, UTF-16, UTF-32) instead of relying on the platform default;
    # the with-block makes sure the file handle is closed again.
    with open(jsonfile, 'rb') as f:
        return json.load(f)

def extract_topics_df(topics_dict):
    """Extract the topic names from the content of the your_topics json file.

    Args:
        topics_dict: Parsed content of the your_topics json file. The entries are
            read from the 'topics_your_topics' key; each entry provides its name
            under ['string_map_data']['Name']['value']. May be None.

    Returns:
        A dataframe with the single column 'your_topics' (one row per topic),
        or None when ``topics_dict`` is None.
    """
    if topics_dict is None:
        return None

    topics_list = [t['string_map_data']['Name']['value'] for t in topics_dict['topics_your_topics']]
    return pd.DataFrame(topics_list, columns=['your_topics'])

#test
#topics_df = extract_topics_df(import_json_toDict('LOCAL_PATH_TO/your_topics.json'))
#topics_df

In [9]:
# TODO: decide which timezone to assume; the function below uses UTC for now.

def epoch_to_date(epoch_timestamp: str | int) -> str: #thanks ddp-inspector/ddpinspect/src/parserlib/stringparse.py
    """
    Convert epoch timestamp to an ISO 8601 string. Assumes UTC. -> UTC +1

    If timestamp cannot be converted raise CannotConvertEpochTimestamp
    """
    try:
        epoch_timestamp = int(epoch_timestamp)
        out = datetime.fromtimestamp(epoch_timestamp, tz=timezone.utc).isoformat()
    except (OverflowError, OSError, ValueError, TypeError) as e:
        logger.error("Could not convert epoch time timestamp, %s", e)
        raise CannotConvertEpochTimestamp("Cannot convert epoch timestamp") from e


    out = pd.to_datetime(out)
    return out.date()

    
# TODO: consider restricting the days to those within the study period.
def get_postViewsPerDay(posts_viewed_dict):
    """Count the number of viewed posts per day.

    Args:
        posts_viewed_dict: Parsed content of the posts_viewed json file. The
            entries are read from the 'impressions_history_posts_seen' key; each
            entry provides an epoch timestamp under
            ['string_map_data']['Time']['timestamp']. May be None.

    Returns:
        A dataframe with the columns 'date' and 'postsViewed_count' (one row per
        day that has at least one entry), or None when ``posts_viewed_dict`` is
        None, mirroring how extract_topics_df treats a missing file.
    """
    if posts_viewed_dict is None:
        return None

    # timestamps are stored in epoch format
    timestamps = [
        t['string_map_data']['Time']['timestamp']
        for t in posts_viewed_dict['impressions_history_posts_seen']
    ]
    dates = [epoch_to_date(t) for t in timestamps]  # convert epochs to dates
    postViewedDates_df = pd.DataFrame(dates, columns=['date'])
    # one row per day with the number of entries that fall on that day
    return postViewedDates_df.groupby("date").size().reset_index(name='postsViewed_count')

# TODO: consider combining the results of get_postViewsPerDay and get_videoViewsPerDay into one dataframe with the columns: date | postsViewed_count | videosViewed_count
def get_videoViewsPerDay(videos_watched_dict):
    """Count the number of watched videos per day.

    Args:
        videos_watched_dict: Parsed content of the videos_watched json file. The
            entries are read from the 'impressions_history_videos_watched' key;
            each entry provides an epoch timestamp under
            ['string_map_data']['Time']['timestamp']. May be None.

    Returns:
        A dataframe with the columns 'date' and 'videosViewed_count' (one row per
        day that has at least one entry), or None when ``videos_watched_dict``
        is None, mirroring how extract_topics_df treats a missing file.
    """
    if videos_watched_dict is None:
        return None

    # timestamps are stored in epoch format
    timestamps = [
        t['string_map_data']['Time']['timestamp']
        for t in videos_watched_dict["impressions_history_videos_watched"]
    ]
    dates = [epoch_to_date(t) for t in timestamps]  # convert epochs to dates
    videosViewedDates_df = pd.DataFrame(dates, columns=['date'])
    # one row per day with the number of entries that fall on that day
    return videosViewedDates_df.groupby("date").size().reset_index(name='videosViewed_count')

#test
#get_postViewsPerDay(import_json_toDict('LOCAL_PATH_TO/posts_viewed.json'))
#get_videoViewsPerDay(import_json_toDict('LOCAL_PATH_TO/videos_watched.json'))

In [10]:
def doSomethingWithTheFile(filename):
    """Build the three result tables from one zip folder.

    For each of the json files your_topics, posts_viewed and videos_watched the
    content is pulled out of the zip folder and handed to the matching
    processing function.

    Args:
        filename: Path of the zip folder.

    Returns:
        Tuple (yourTopics_df, postViewsperDay_df, videoViewsperDay_df) holding
        the results of extract_topics_df, get_postViewsPerDay and
        get_videoViewsPerDay, in that order.
    """
    # your topics
    topics_table = extract_topics_df(
        extractJsonContentFromZipFolder(filename, "your_topics")
    )

    # aggregated post views/day
    post_counts = get_postViewsPerDay(
        extractJsonContentFromZipFolder(filename, "posts_viewed")
    )

    # aggregated video views/day
    video_counts = get_videoViewsPerDay(
        extractJsonContentFromZipFolder(filename, "videos_watched")
    )

    return topics_table, post_counts, video_counts




In [11]:
# test
# Smoke test: runs the whole pipeline on one zip folder and shows the three resulting dataframes.
# NOTE(review): the path below is absolute and machine-specific, so this cell only runs
# where that file exists; consider reading the location from a configurable variable.
zip_file_path = '/home/maria/STUDIUM/Master/Hiwi Job/InstaDataDonationProject/ddp_insta/instagram-xxx-2023-11-14-kfw7z0xx.zip'

yourTopics_df, postViewsperDay_df, videoViewsperDay_df = doSomethingWithTheFile(zip_file_path)

# NOTE(review): `display` is not imported anywhere in the visible cells; presumably this
# relies on the built-in that IPython/Jupyter provides.
display(yourTopics_df)
display(postViewsperDay_df)
display(videoViewsperDay_df)

Unnamed: 0,your_topics
0,Soccer
1,Cars & Trucks by Make & Model
2,Sensory Systems
3,Science & Engineering Disciplines
4,Foods
...,...
88,Memes
89,Science & Tech Companies
90,Architecture
91,Desserts


Unnamed: 0,date,postsViewed_count
0,2023-11-13,1
1,2023-11-14,4


Unnamed: 0,date,videosViewed_count
0,2023-11-13,2
1,2023-11-14,1
