## Process Instagram JSON file.

Since scraping data directly from Instagram is against its **Terms and Conditions**, you will need to manually download the JSON file from your own account.

Instructions to download the json file:

**Open your Instagram account** - under Settings and Privacy, go to **Your Activity** - **Download your information**

When you are asked what to download, uncheck everything except messages. This ensures that the script works properly.

You can choose where to download the file: either to this device or to the cloud.

This notebook can parse the file either locally or from Google Drive. You can choose whichever option suits you.

**If you are parsing from Google Drive, running this notebook in Google Colab is recommended.**

In [None]:
if __name__ == "__main__":
      print("Loading modules...")
      import json 
      import os
      import yaml
      import numpy as np
      import pandas as pd
      

In [None]:
# Load the project configuration from config.yaml, which is expected one
# directory above the current working directory.
config_path = os.path.join(os.path.dirname(os.getcwd()), "config.yaml")
with open(config_path, 'r') as f:
    # yaml.safe_load returns None for an empty document; fall back to an
    # empty dict so the lookups below do not fail with AttributeError.
    full_config = yaml.safe_load(f) or {}

# A key that is present but null also yields None, hence the `or {}`.
personal_parameters = full_config.get('personal_parameters') or {}

instagram_username = personal_parameters.get('INSTAGRAM_USERNAME')
# NOTE(review): the export folder name ("your_instagram_activity 4") is
# hard-coded; adjust it to match the folder of your own download.
inbox_path = os.path.join(os.path.dirname(os.getcwd()), "parsers/instagram/your_instagram_activity 4/messages/inbox")

if not instagram_username:
    # The username is read from config.yaml, so point the user at that file.
    raise ValueError("Instagram username is not set in the config.yaml file.")

In [None]:
# Text in the Instagram export needs its encoding repaired before use
def decode_utf8(encoded_str):
    """Repair a string by re-encoding it as Latin-1 and decoding it as UTF-8.

    Args:
        encoded_str: The raw value read from the export. Anything that is
            not a ``str`` (for example ``np.nan`` used for a missing
            message body) is treated as "no text".

    Returns:
        The repaired string; the input string unchanged when it cannot be
        round-tripped (characters outside Latin-1, or bytes that are not
        valid UTF-8); or ``None`` when the input is not a string.
    """
    if not isinstance(encoded_str, str):
        return None

    try:
        return encoded_str.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The text is not in the expected mis-encoded form; keep it as-is
        # rather than aborting the whole parse on one message.
        return encoded_str

In [None]:
def extract_dialog(json_file_path, message_limit: int = None, dialogs_limit: int = None, verbose=1, checkpoints: bool = True, threshold: int = 50):
    """Extract the text messages of one conversation file.

    Consecutive messages from the same sender are merged into a single
    entry: each further message of the run is lowercased and prepended to
    the accumulated text, and a comma is appended to the accumulated text
    first when it does not already end in ".", "!" or "?". The entry keeps
    the timestamp of the first message of the run that was encountered.

    Args:
        json_file_path: Path to a conversation JSON file containing a
            top-level "messages" list.
        message_limit: Currently unused.
        dialogs_limit: Currently unused.
        verbose: Currently unused.
        checkpoints: Currently unused.
        threshold: Minimum number of raw messages a conversation must have
            to be processed; a falsy value disables the check.

    Returns:
        A list of ``[message, sender, timestamp]`` entries in the order the
        runs were encountered in the file, or ``None`` when the
        conversation has fewer than ``threshold`` messages.
    """
    # Pin the encoding so the result does not depend on the platform default.
    with open(json_file_path, 'r', encoding='utf-8', errors='replace') as file:
        dialog = json.load(file)

    messages = dialog["messages"]
    if threshold and len(messages) < threshold:
        return None

    last_message = None
    extracted_dialog = []
    for message_data in messages:
        # Entries without a "content" field carry no text; decode_utf8
        # returns None for them and they are skipped below.
        encoded_message = message_data["content"] if "content" in message_data else np.nan
        message = decode_utf8(encoded_message)

        sender = message_data["sender_name"]
        timestamp = pd.to_datetime(message_data["timestamp_ms"], unit='ms')

        if not message:
            continue

        if last_message and sender == last_message[1]:
            # Same sender as the pending entry: fold this message into it.
            # NOTE(review): prepending suggests the file lists messages
            # newest-first - confirm against a real export.
            if last_message[0][-1] not in [".", "!", "?"]:
                last_message[0] = last_message[0] + ","

            last_message[0] = " ".join([message.lower(), last_message[0]])
        else:
            if last_message:
                extracted_dialog.append(last_message)
            last_message = [message, sender, timestamp]

    # Flush the final pending entry; without this the last run of every
    # conversation would be silently dropped.
    if last_message:
        extracted_dialog.append(last_message)

    return extracted_dialog

In [None]:
# Warn (but do not abort) when the inbox folder is missing; os.walk below
# then simply yields nothing and the resulting DataFrame is empty.
if not os.path.exists(inbox_path):
    print(f"Directory '{inbox_path}' for instagram folder wasn't found.\nTry to change the path to your_instagram_activity -> messages -> inbox.")

# Collect one DataFrame per conversation and concatenate once at the end,
# instead of growing a DataFrame inside the loop.
dialog_frames = []
for root, dirs, files in os.walk(inbox_path):
    for file in files:
        # Only files named message_1.json are processed; any other file in
        # a conversation folder is ignored.
        if file == 'message_1.json':
            json_file_path = os.path.join(root, file)

            data = extract_dialog(json_file_path)
            # extract_dialog returns None for conversations below its
            # threshold; skip those and empty results.
            if data:
                dialog_frames.append(pd.DataFrame(data, columns=['Message', 'Sender', 'Date']))

if dialog_frames:
    # ignore_index gives every row a unique label instead of repeating
    # 0..n for each conversation.
    df = pd.concat(dialog_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['Message', 'Sender', 'Date'])

# Flag the rows written by the configured account. Done once, after all
# conversations are collected, so the column exists even when none were found.
df["Sent_by_me"] = df["Sender"] == str(instagram_username)
                  

In [None]:
# Bare expression: shows the assembled DataFrame as the notebook cell output.
df

In [None]:
# Write the collected messages to disk as CSV.
# NOTE(review): the destination is an absolute path on the author's machine
# and has no file extension - change it before running elsewhere.
df.to_csv(
    path_or_buf="/Users/bohdan/Documents/Programming/Projects/VSCode/AI-DataScience/PersonaGPT/Datasets/ffffff"
)