In [102]:
import pandas as pd

In [103]:
# Load the scraped Instagram dataset (a JSON file) into one DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this notebook elsewhere.
df_path = "/home/user/Desktop/hw-training/2025-08-28/apify/dataset_instagram-scraper.json"
df = pd.read_json(df_path)

# The first three rows are taken positionally, one per brand. Each of these is
# a pandas Series (a single row), despite the ``_df`` suffix.
# NOTE(review): assumes the dataset rows are ordered tesco, sainsburys,
# morrisons -- confirm against the scraper output.
tesco_df, sainsburys_df, morrisons_df = df.iloc[0], df.iloc[1], df.iloc[2]

In [104]:
# Source field -> export column header. Insertion order matters: it is also
# the order in which the source fields are selected from each posts frame.
new_field_names = dict(
    [
        ("url", "Post ID/URL"),
        ("timestamp", "Post Date/Time"),
        ("caption", "Post Text/Caption"),
        ("hashtags", "Hashtags Used"),
        ("likesCount", "Likes Engagement"),
        ("commentsCount", "Comments Engagement"),
        ("videoViewCount", "Views (Video)"),
        ("type", "Media Type"),
        # Derived column (built in convert_data); it keeps its own name.
        ("Image/Video URL", "Image/Video URL"),
    ]
)

# Source fields kept from each post, in export order (exactly the keys above).
common_cols = list(new_field_names)

def convert_data(brand_data, brand):
    """Flatten one brand's ``latestPosts`` into the export column layout.

    Parameters
    ----------
    brand_data : mapping or pandas.Series
        One scraped profile record. Only its ``"latestPosts"`` entry (a list
        of post dicts) is read.
    brand : str
        Value written to the ``Brand`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per post. The scraped fields listed in ``common_cols`` are
        renamed via ``new_field_names``; the remaining columns are constant
        placeholders. With the notebook's current ``common_cols`` /
        ``new_field_names`` the column order is: Brand, brand_type, Platform,
        Post ID/URL, Post Date/Time, Mentions Type, User Handle, User Type,
        Post Text/Caption, Post Type, Hashtags Used, Likes Engagement,
        Comments Engagement, Views (Video), Shares/Retweets, Media Type,
        Common Keywords, Region, Language, Top Theme Tags, Campaign Tag,
        Image/Video URL.

    Raises
    ------
    KeyError
        If ``brand_data`` has no ``"latestPosts"`` entry.
    """
    posts = brand_data["latestPosts"]
    posts_df = pd.json_normalize(posts)
    n_posts = len(posts_df)

    def _values(name, default):
        # Per-post values of a scraped field, or ``default`` for every post
        # when no post carries that field at all.
        if name in posts_df.columns:
            return posts_df[name].tolist()
        return [default] * n_posts

    # Video posts expose the media under "videoUrl"; everything else falls
    # back to "displayUrl" ("" when that field is absent from every post).
    posts_df["Image/Video URL"] = [
        video_url if str(media_type).lower() == "video" else display_url
        for media_type, video_url, display_url in zip(
            _values("type", None),
            _values("videoUrl", None),
            _values("displayUrl", ""),
        )
    ]

    # reindex (rather than posts_df[common_cols]) so that a field missing from
    # every post -- e.g. "videoViewCount" for a brand without videos, or any
    # field when there are no posts -- becomes an all-NaN column instead of
    # raising KeyError.
    export_df = posts_df.reindex(columns=common_cols).rename(columns=new_field_names)

    # (position, column, constant value). Applied strictly in this order; each
    # position refers to the frame as it stands after the previous inserts,
    # which is why "Mentions Type" (position 5) lands in front of "User Handle".
    placeholder_inserts = (
        (0, "Brand", brand),
        (1, "brand_type", ""),
        (2, "Platform", "Instagram"),
        (5, "User Handle", ""),
        (6, "User Type", ""),
        (8, "Post Type", ""),
        (5, "Mentions Type", ""),
        (14, "Shares/Retweets", ""),
        (16, "Common Keywords", ""),
        (17, "Region", ""),
        (18, "Language", ""),
        (19, "Top Theme Tags", ""),
        (20, "Campaign Tag", ""),
    )
    for position, column, value in placeholder_inserts:
        export_df.insert(position, column, value)

    return export_df



In [105]:
# Convert each brand's raw profile row into the export layout.
# The inputs are read from the raw ``df`` rows rather than from the variables
# being assigned, so re-running this cell does not feed an already-converted
# frame (which has no "latestPosts") back into convert_data.
tesco_df = convert_data(df.iloc[0], "tesco")
sainsburys_df = convert_data(df.iloc[1], "sainsburys")
morrisons_df = convert_data(df.iloc[2], "morrisons")

In [106]:
def _hashtags_to_text(value):
    """Render one post's hashtags cell as text; empty or missing becomes ""."""
    # Missing values would otherwise be stringified and written to the CSV as
    # the literal text "None" / "nan" / "<NA>".
    if value is None or value is pd.NA or (isinstance(value, float) and value != value):
        return ""
    text = str(value)
    # An empty hashtag list stringifies to "[]"; export it as blank.
    return "" if text.strip() == "[]" else text


# Stack the three per-brand frames into one table with a fresh 0..n-1 index.
final_df = pd.concat([tesco_df, sainsburys_df, morrisons_df], ignore_index=True)

final_df["Hashtags Used"] = final_df["Hashtags Used"].map(_hashtags_to_text)

final_df["Post Date/Time"] = pd.to_datetime(final_df["Post Date/Time"])
# Nullable Int64 keeps posts without a view count as <NA> instead of forcing
# the whole column to float.
final_df["Views (Video)"] = pd.to_numeric(final_df["Views (Video)"], errors="coerce").astype("Int64")

# Written relative to the notebook's working directory.
final_df.to_csv("instagram_data.csv", index=False, encoding="utf-8")
