In [None]:
import json

# Input and output locations, named once so they are easy to change and so
# the messages printed below always match the files actually used.
input_path = 'sports1.json'
output_path = 'filtered_output.json'


def extract_igtv_posts(profiles):
    """Collect caption/hashtag pairs from each profile's IGTV videos.

    Args:
        profiles: Iterable of profile records as decoded from the input JSON.
            Entries that are not dicts, or whose 'latestIgtvVideos' value is
            not a list, are skipped.

    Returns:
        A list of {'caption': ..., 'hashtags': ...} dicts, one for every
        video dict that carries both keys, in input order. The values are
        passed through unchanged (they may still be empty).
    """
    extracted = []
    for profile in profiles:
        # Guard against non-dict entries so a stray string/number in the
        # input does not abort the whole run.
        if not isinstance(profile, dict):
            continue
        videos = profile.get('latestIgtvVideos')
        if not isinstance(videos, list):
            continue
        for post in videos:
            if isinstance(post, dict) and 'caption' in post and 'hashtags' in post:
                extracted.append({
                    'caption': post['caption'],
                    'hashtags': post['hashtags']
                })
    return extracted


try:
    # Load the input JSON file
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Extract caption and hashtags from the IGTV videos of each user profile
    filtered_data = extract_igtv_posts(data)

    print(f"Processed {len(filtered_data)} entries")

    # Save the filtered data to a new JSON file
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(filtered_data, file, ensure_ascii=False, indent=2)

    print(f"Filtered data saved to '{output_path}'")

except FileNotFoundError as e:
    # Report the path that actually failed to open instead of a hard-coded
    # name (the previous message referred to a different file).
    print(f"Error: The file '{e.filename}' was not found. Check your file path.")
except json.JSONDecodeError:
    print("Error: The file is not valid JSON. Check your JSON syntax.")
except Exception as e:
    # Deliberate best-effort: report anything unexpected without a traceback.
    print(f"An error occurred: {str(e)}")

Processed 104 entries
Filtered data saved to 'filtered_output.json'


In [None]:
import json

# Location of the post list to filter; change input_path if the file lives
# somewhere else.
# NOTE(review): output_path is the same 'filtered_output.json' that the
# sports1.json extraction cell writes, so running this cell overwrites it.
input_path = '/content/food1.json'
output_path = 'filtered_output.json'

# Read the original JSON file
with open(input_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract only entries where both caption and hashtags are non-empty
filtered_data = []
for item in data:
    if not isinstance(item, dict):
        continue

    # `.get(key, default)` only covers a missing key; a JSON null comes back
    # as None and would crash `.strip()`, so normalise with `or`.
    caption = (item.get('caption') or '').strip()
    hashtags = item.get('hashtags') or []

    if caption and hashtags:  # Both must be non-empty
        filtered_data.append({
            'caption': caption,
            'hashtags': hashtags
        })

# Write the filtered data to a new file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=4)

print(f"Filtered JSON saved to {output_path}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/food1.json'

In [None]:
pip install langdetect




In [None]:
import json
import re
from langdetect import detect, LangDetectException

# Source of the caption/hashtag records and destination for the
# English-only subset; both paths are relative to the working directory.
input_path = 'filtered_output.json'
output_path = 'english_only_output.json'

def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    A ``LangDetectException`` raised by ``detect`` is treated as
    "not English" and yields False.
    """
    try:
        language = detect(text)
    except LangDetectException:
        return False
    return language == 'en'

# langdetect's algorithm is randomised: very short inputs (single hashtags)
# can be classified differently from one run to the next. Fixing the seed
# before the first detection makes this cell reproducible.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

with open(input_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

cleaned_data = []
for item in data:
    # `or` also covers JSON null values, which `.get(key, default)` would
    # return as None and which would crash `.split()` / iteration below.
    raw_caption = item.get('caption') or ''
    raw_hashtags = item.get('hashtags') or []

    # Remove #words from caption (split/join already trims and collapses
    # whitespace, so no extra strip is needed)
    cleaned_caption = ' '.join(word for word in raw_caption.split() if not word.startswith('#'))

    # Filter hashtags: keep only English ones; non-string entries are dropped
    cleaned_hashtags = [tag for tag in raw_hashtags if isinstance(tag, str) and is_english(tag)]

    # Keep only if both caption and hashtags are non-empty and in English
    if cleaned_caption and cleaned_hashtags and is_english(cleaned_caption):
        cleaned_data.append({
            'caption': cleaned_caption,
            'hashtags': cleaned_hashtags
        })

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(cleaned_data, f, ensure_ascii=False, indent=4)

print(f"Cleaned English-only captions saved to {output_path}")


Cleaned English-only captions saved to english_only_output.json


In [None]:
import json

# Reads the English-only posts and writes them back out with every
# whitespace-separated word that starts with '@' dropped from the caption.
input_path = '/content/english_only_output.json'
output_path = '/content/final_cleaned_output.json'

# Load data
with open(input_path, 'r', encoding='utf-8') as src:
    data = json.load(src)

# Remove @mentions from captions
cleaned_data = []
for entry in data:
    words = entry.get('caption', '').split()
    tags = entry.get('hashtags', [])

    # Keep every word that is not an @mention, then re-join with single spaces
    kept_words = [w for w in words if not w.startswith('@')]
    text = ' '.join(kept_words).strip()

    # Drop the post entirely when either field ends up empty
    if not (text and tags):
        continue

    cleaned_data.append({
        'caption': text,
        'hashtags': tags
    })

# Save cleaned data
with open(output_path, 'w', encoding='utf-8') as dst:
    json.dump(cleaned_data, dst, ensure_ascii=False, indent=4)

print(f"@mentions removed and saved to {output_path}")


@mentions removed and saved to /content/final_cleaned_output.json


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 60.1 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import json
import spacy

input_path = '/content/final_cleaned_output.json'
output_path = '/content/ner_anon_output.json'

# Load English NER model
nlp = spacy.load("en_core_web_sm")


def mask_org_entities(caption, doc, placeholder="<ORG>"):
    """Return ``caption`` with every ORG entity span replaced by ``placeholder``.

    Replacement is done by the entities' character offsets rather than with
    ``str.replace``. A text-based replace would also rewrite occurrences the
    model did not tag (including substrings inside other words) and could
    corrupt placeholders inserted for earlier entities.

    Args:
        caption: The exact text that was passed to ``nlp`` to create ``doc``.
        doc: The spaCy Doc produced from ``caption``.
        placeholder: Text substituted for each ORG span.

    Returns:
        The masked caption string; identical to ``caption`` when no ORG
        entity was found.
    """
    pieces = []
    cursor = 0
    # doc.ents is ordered by position and its spans do not overlap, so a
    # single left-to-right pass over the offsets is sufficient.
    for ent in doc.ents:
        if ent.label_ != "ORG":
            continue
        pieces.append(caption[cursor:ent.start_char])
        pieces.append(placeholder)
        cursor = ent.end_char
    pieces.append(caption[cursor:])
    return ''.join(pieces)


with open(input_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

ner_data = []

for item in data:
    caption = item['caption']
    hashtags = item['hashtags']

    doc = nlp(caption)

    # Replace ORG entities with <ORG>
    new_caption = mask_org_entities(caption, doc)

    ner_data.append({
        'caption': new_caption,
        'hashtags': hashtags
    })

# Save the updated JSON
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(ner_data, f, ensure_ascii=False, indent=4)

print(f"NER processed captions saved to {output_path}")


NER processed captions saved to /content/ner_anon_output.json


In [None]:
import json
from sentence_transformers import SentenceTransformer, util

# Load the SentenceTransformer model. The same model instance embeds both the
# candidate hashtags and each caption below, so their vectors can be compared
# with cosine similarity.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Candidate hashtags to suggest from, grouped by theme. Every entry is a
# single tag written with its leading '#'; the prefix is stripped later when
# a tag is added to a post.
hashtags = [
    # General Popular Hashtags
    "#love", "#instagood", "#photooftheday", "#fashion", "#beautiful", "#happy", "#cute",
    "#tbt", "#like4like", "#followme", "#picoftheday", "#follow", "#me", "#selfie", "#summer",
    "#art", "#instadaily", "#friends", "#repost", "#nature", "#girl", "#fun", "#style",
    "#smile", "#food", "#instalike", "#likeforlike", "#family", "#travel", "#fitness",
    "#igers", "#tagsforlikes", "#follow4follow", "#nofilter", "#life", "#beauty", "#amazing",
    "#instamood", "#instagram", "#photography", "#vscocam", "#sun", "#photo", "#music",
    "#beach", "#followforfollow", "#bestoftheday", "#sky", "#ootd", "#sunset", "#dog", "#vsco",
    "#l4l", "#makeup", "#f4f", "#foodporn", "#hair", "#pretty", "#swag", "#cat", "#model",
    "#motivation", "#baby", "#party", "#cool", "#lol", "#gym", "#design", "#instapic",
    "#funny", "#healthy", "#night", "#tflers", "#yummy", "#flowers", "#lifestyle", "#hot",
    "#instafood", "#wedding", "#fit", "#handmade", "#black", "#pink", "#blue", "#work",
    "#workout", "#blackandwhite", "#drawing", "#inspiration", "#home", "#holiday",
    "#christmas", "#nyc", "#london", "#sea", "#instacool", "#goodmorning", "#iphoneonly",
    "#contest", "#giveaway", "#competition", "#win",

    # Travel & Adventure
    "#wanderlust", "#adventure", "#explore", "#travelgram", "#naturelovers", "#hiking",
    "#roadtrip", "#vacation", "#backpacking", "#sunrise", "#sunsetlovers", "#mountains",
    "#camping", "#ocean", "#islandlife", "#paradise", "#nationalpark", "#cityscape",

    # Fitness & Health
    "#fitfam", "#gymlife", "#workoutmotivation", "#running", "#yoga", "#healthylifestyle",
    "#strength", "#muscle", "#nutrition", "#calisthenics", "#strong", "#gains", "#powerlifting",
    "#crossfit", "#vegan", "#plantbased", "#cleaneating",

    # Fashion & Style
    "#streetstyle", "#mensfashion", "#womensfashion", "#styleinspo", "#ootdfashion", "#trend",
    "#fashionblogger", "#accessories", "#luxury", "#highfashion", "#vintage", "#boho",
    "#denim", "#minimaliststyle",

    # Tech & Gaming
    "#technology", "#gadget", "#techlife", "#coding", "#developer", "#programming",
    "#AI", "#machinelearning", "#gamers", "#esports", "#gamingcommunity", "#console",
    "#gaminglife", "#nft", "#blockchain", "#crypto",

    # Mental Health & Wellness
    "#mentalhealth", "#selfcare", "#positivity", "#mindfulness", "#healing", "#breathe",
    "#depressionawareness", "#anxietyrelief", "#meditation", "#therapy", "#kindness",

    # Relationships & Family
    "#couplegoals", "#relationshipgoals", "#bestfriend", "#siblings", "#happilyeverafter",
    "#parenting", "#momlife", "#dadlife", "#brother", "#sister",

    # Animals & Pets
    "#dogsofinstagram", "#catsofinstagram", "#petstagram", "#wildlife", "#puppylove",
    "#rescuedog", "#adoptdontshop", "#exoticpets",

    # Hobbies & Creative
    "#handlettering", "#pottery", "#woodworking", "#diy", "#painting", "#poetry",
    "#calligraphy", "#crafting", "#sketching", "#artsy", "#cinematography",

    # Food & Drinks
    "#coffeelover", "#brunch", "#homemade", "#dessert", "#chocolatelover", "#baking",
    "#smoothiebowl", "#wine", "#cocktails",

    # Cars & Motorcycles
    "#carporn", "#carlifestyle", "#supercars", "#motorcycle", "#bikelife", "#offroading",
    "#classiccars", "#trucklife",

    # Events & Holidays
    "#newyears", "#valentines", "#halloween", "#thanksgiving", "#easter", "#diwali",
    "#ramadan", "#hanukkah", "#birthday", "#graduation",

    # Motivational & Business
    "#hustle", "#entrepreneur", "#successmindset", "#startup", "#billionairemindset",
    "#sidehustle", "#wealth", "#bosslife", "#productivity", "#marketing", "#freelancer",

    # Science & Space
    "#science", "#astronomy", "#astrophysics", "#spaceexploration", "#NASA", "#cosmos",
    "#futuretech", "#quantumphysics",

    # Photography & Content Creation
    "#photochallenge", "#mobilephotography", "#dronephotography", "#filmphotography",
    "#portraitmode", "#streetphotography", "#cinematic", "#dslr",

    # Music & Entertainment
    "#livemusic", "#musiclover", "#concert", "#piano", "#guitar", "#vinyl", "#dj",
    "#hiphop", "#rockmusic", "#indie", "#festival",

    # Books & Learning
    "#bookstagram", "#bibliophile", "#readinglist", "#quotesoftheday", "#philosophy",
    "#history", "#education", "#lifelonglearning", "#selfimprovement",

    # Spiritual & Astrology
    "#spiritualawakening", "#zodiacsigns", "#horoscope", "#meditationpractice",
    "#lawofattraction", "#chakras", "#manifestation",

    # Environmental & Sustainability
    "#ecofriendly", "#sustainableliving", "#climatechange", "#zerowaste", "#recycle",
    "#veganlife", "#organic", "#savetheplanet",

    # Parenting & Kids
    "#babyfashion", "#toddlermom", "#dadjokes", "#momsofinstagram", "#parenthood",
    "#raisingkids", "#babyboy", "#babygirl",

    # Local & City Hashtags
    "#paris", "#newyorkcity", "#losangeles", "#dubai", "#tokyo", "#berlin", "#sydney",
    "#mumbai", "#toronto", "#brazil", "#bali", "#europe", "#latinamerica",

    # Miscellaneous Trends
    "#trending", "#viral", "#instalove", "#instafun", "#instasuccess", "#epic",
    "#hype", "#squad", "#random", "#tropical",  # trailing comma required: without it this tag fuses with the next string literal

    # Special Days & Awareness
    "#worldenvironmentday", "#internationalwomensday", "#earthday", "#worldmentalhealthday",
    "#breastcancerawareness", "#worldaidsday", "#blackhistorymonth",
]

# Encode all hashtags once; the per-post loop below only has to embed the
# caption and compare it against this fixed set of vectors.
hashtag_embeddings = model.encode(hashtags, convert_to_tensor=True)

# Load the NER-anonymised posts. The file is written as UTF-8 with
# ensure_ascii=False, so it is read back explicitly as UTF-8 instead of with
# the platform's default encoding.
with open("/content/ner_anon_output.json", "r", encoding="utf-8") as f:
    posts = json.load(f)

# Number of new hashtags to add
TOP_N = 5

# Update each post in place with its TOP_N most similar candidate hashtags
for post in posts:
    caption = post.get("caption", "")
    existing_hashtags = post.get("hashtags", [])

    # Encode the caption
    caption_embedding = model.encode(caption, convert_to_tensor=True)

    # Compute similarity scores (row 0: this caption against every hashtag)
    scores = util.pytorch_cos_sim(caption_embedding, hashtag_embeddings)[0]

    # Get indices of top N similar hashtags as plain Python ints
    top_indices = scores.argsort(descending=True)[:TOP_N].tolist()

    # Extract top hashtags (without # prefix for consistency with existing)
    new_hashtags = [hashtags[i][1:] for i in top_indices]

    # Combine and remove duplicates while preserving order
    updated_hashtags = list(dict.fromkeys(existing_hashtags + new_hashtags))
    post["hashtags"] = updated_hashtags

# Save the enriched posts to travel2.json, keeping non-ASCII characters
# readable like the other JSON files written by this notebook.
with open("/content/travel2.json", "w", encoding="utf-8") as f:
    json.dump(posts, f, ensure_ascii=False, indent=2)
