# Instagram Profile Data Processing

This notebook focuses on preprocessing Instagram profile data, specifically cleaning and preparing the text data (biographies and captions). The extracted features will be handled separately in `models.ipynb`.

## Objectives
1. Load and clean profile data.
2. Prepare data for feature extraction and model training in `models.ipynb`.

# 1. Preparing the Cleaned `text` Variable

In this step:
- Data is loaded from a `.gz` file.
- Selected fields (`username`, `full_name`, `biography`, `category_name`, `captions`) are concatenated.
- The concatenated text is:
  1. Tokenized using `split`.
  2. Stripped of stop words (matched against the lowercased token, before lemmatization).
  3. Filtered to remove non-alphabetic tokens.
  4. Lemmatized using `trnlp` and lowercased.
- The cleaned data is written to a JSON file: `processed_profiles.json`.


In [3]:
import gzip
import json
from trnlp import TrnlpWord

# Turkish stop words that are dropped before lemmatization. Built once at
# import time instead of being rebuilt on every clean_text() call.
_STOPWORDS = frozenset({
    "a", "acaba", "altı", "altmış", "ama", "ancak", "arada", "artık", "asla", "aslında",
    "ayrıca", "az", "bana", "bazen", "bazı", "bazıları", "belki", "ben", "benden", "beni",
    "benim", "beri", "beş", "bile", "bilhassa", "bin", "bir", "biraz", "birçoğu", "birçok",
    "biri", "birisi", "birkaç", "birşey", "biz", "bizden", "bize", "bizi", "bizim", "böyle",
    "böylece", "bu", "buna", "bunda", "bundan", "bunlar", "bunları", "bunların", "bunu",
    "bunun", "burada", "bütün", "çoğu", "çoğunu", "çok", "çünkü", "da", "daha", "dahi",
    "dan", "de", "defa", "değil", "diğer", "diğeri", "diğerleri", "diye", "doksan", "dokuz",
    "dolayı", "dolayısıyla", "dört", "e", "edecek", "eden", "ederek", "edilecek", "ediliyor",
    "edilmesi", "ediyor", "eğer", "elbette", "elli", "en", "etmesi", "etti", "ettiği",
    "ettiğini", "fakat", "falan", "filan", "gene", "gereği", "gerek", "gibi", "göre", "hala",
    "halde", "halen", "hangi", "hangisi", "hani", "hatta", "hem", "henüz", "hep", "hepsi",
    "her", "herhangi", "herkes", "herkese", "herkesi", "herkesin", "hiç", "hiçbir", "hiçbiri",
    "i", "ı", "için", "içinde", "iki", "ile", "ilgili", "ise", "işte", "itibaren", "itibariyle",
    "kaç", "kadar", "karşın", "kendi", "kendilerine", "kendine", "kendini", "kendisi",
    "kendisine", "kendisini", "kez", "ki", "kim", "kime", "kimi", "kimin", "kimisi", "kimse",
    "kırk", "madem", "mi", "mı", "milyar", "milyon", "mu", "mü", "nasıl", "ne", "neden",
    "nedenle", "nerde", "nerede", "nereye", "neyse", "niçin", "nin", "nın", "niye", "nun",
    "nün", "o", "öbür", "olan", "olarak", "oldu", "olduğu", "olduğunu", "olduklarını",
    "olmadı", "olmadığı", "olmak", "olması", "olmayan", "olmaz", "olsa", "olsun", "olup",
    "olur", "olursa", "oluyor", "on", "ön", "ona", "önce", "ondan", "onlar", "onlara",
    "onlardan", "onları", "onların", "onu", "onun", "orada", "öte", "ötürü", "otuz", "öyle",
    "oysa", "pek", "rağmen", "sana", "sanki", "şayet", "şekilde", "sekiz", "seksen", "sen",
    "senden", "seni", "senin", "şey", "şeyden", "şeye", "şeyi", "şeyler", "şimdi", "siz",
    "sizden", "size", "sizi", "sizin", "sonra", "şöyle", "şu", "şuna", "şunları", "şunu", "ta",
    "tabii", "tam", "tamam", "tamamen", "tarafından", "trilyon", "tüm", "tümü", "u", "ü", "üç",
    "un", "ün", "üzere", "var", "vardı", "ve", "veya", "ya", "yani", "yapacak", "yapılan",
    "yapılması", "yapıyor", "yapmak", "yaptı", "yaptığı", "yaptığını", "yaptıkları", "ye", "yedi",
    "yerine", "yetmiş", "yi", "yı", "yine", "yirmi", "yoksa", "yu", "yüz", "zaten", "zira",
})


def _lower(text):
    """Lowercase *text* without leaving a combining dot behind for "İ"."""
    # str.lower() maps U+0130 ("İ") to "i" + U+0307 (combining dot above), so
    # e.g. "İçin" would never match the stop word "için". Map it to a plain "i"
    # first. NOTE(review): ASCII "I" still lowers to "i" (not Turkish "ı"); this
    # is left unchanged because the input may contain non-Turkish words.
    return text.replace("\u0130", "i").lower()


def clean_text(text):
    """Return *text* reduced to lowercased lemmas separated by single spaces.

    The text is split on whitespace. A token is kept only if it is purely
    alphabetic (``str.isalpha``) and its lowercased form is not in
    ``_STOPWORDS``. Each kept token is passed to ``TrnlpWord`` and replaced by
    its ``get_base`` value, lowercased.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned tokens joined with single spaces; an empty string when no
        token survives the filters.
    """
    lemmatizer = TrnlpWord()

    cleaned_tokens = []
    # str.split() with no argument already collapses runs of whitespace, so no
    # separate normalisation pass is needed beforehand.
    for token in text.split():
        if not token.isalpha() or _lower(token) in _STOPWORDS:
            continue
        lemmatizer.setword(token)
        lemma = lemmatizer.get_base
        # Presumably get_base can be empty when trnlp cannot analyse the token
        # (TODO confirm); skip it so the result has no doubled spaces.
        if lemma:
            cleaned_tokens.append(_lower(lemma))

    return " ".join(cleaned_tokens)

def remove_extra_spaces(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as well, because splitting
    without a separator discards empty fields at both ends.
    """
    words = text.split()
    single_spaced = " ".join(words)
    return single_spaced


# File paths
gz_file_path = "training-dataset.jsonl.gz"
output_json_file_path = "processed_profiles.json"  # Desired output JSON file
printone = True  # Echo only the first parsed record as a quick sanity check

# Read the gzipped file line by line (one JSON document per line) and build
# one {"username", "text"} record per profile.
transformed_data = []
with gzip.open(gz_file_path, "rt", encoding="utf-8") as gz_file:
    for line in gz_file:
        # Keep the try body minimal: only json.loads can raise JSONDecodeError.
        try:
            data = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Skipping line due to JSONDecodeError: {e}")
            continue

        if printone:
            print(data)
            printone = False

        # dict.get(key, default) only applies the default when the key is
        # absent; a JSON null comes back as None. The `or` fallbacks keep a
        # null "profile"/"posts" from crashing and keep null fields from being
        # formatted into the text as the literal word "None".
        profile = data.get("profile") or {}
        posts = data.get("posts") or []

        username = profile.get("username") or ""
        full_name = profile.get("full_name") or ""
        biography = profile.get("biography") or ""
        category_name = profile.get("category_name") or ""

        # Concatenate all captions, skipping posts whose caption is missing or null
        captions = " ".join(
            str(post["caption"]) for post in posts if post.get("caption") is not None
        )

        # Build the single text field, then clean and normalise it
        concatenated_text = f"{username} {full_name} {biography} {category_name} {captions}".strip()
        concatenated_text = clean_text(concatenated_text)
        concatenated_text = remove_extra_spaces(concatenated_text)

        transformed_data.append({
            "username": username,
            "text": concatenated_text,
        })

# Write the transformed data to the output JSON file
with open(output_json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(transformed_data, json_file, ensure_ascii=False, indent=4)

print(f"All transformed data saved to {output_json_file_path}")


All transformed data saved to processed_profiles.json


# 2. Merging with Annotated Accounts

In this step:
- The cleaned data is merged with the provided annotations file.
- Selected fields (`username`, `text`, and `category`) are retained.
- The merged data is saved as `merged_profiles.json`.

Continue in `models.ipynb` for feature extraction and model training.

In [4]:
import json
import csv

# File paths
profiles_json_path = "processed_profiles.json"
classification_csv_path = "train-classification.csv"
output_json_path = "merged_profiles.json"

# Load the JSON file containing the cleaned profiles
with open(profiles_json_path, "r", encoding="utf-8") as json_file:
    profiles_data = json.load(json_file)

# Index profiles by username for constant-time lookups while scanning the CSV
profiles_dict = {profile["username"]: profile for profile in profiles_data}

# Debugging: Print a sample of the profiles_dict keys
print(f"Sample of loaded usernames from JSON: {list(profiles_dict)[:10]}")

# Merged records: one per CSV row whose username has a cleaned profile
output_data = []

# newline="" lets the csv module handle line endings itself, as its
# documentation requires for file objects.
with open(classification_csv_path, "r", encoding="utf-8", newline="") as csv_file:
    reader = csv.DictReader(csv_file, fieldnames=["username", "label"])  # Specify headers if missing

    # Skip the header row if it's included in the file
    next(reader, None)  # Comment this line if there is no header in the CSV

    for row in reader:
        # DictReader fills fields missing from a short row with None, so
        # fall back to "" before calling .strip().
        username = (row.get("username") or "").strip()
        category = (row.get("label") or "").strip()

        profile = profiles_dict.get(username)
        if profile is None:
            print(f"No match found for username: {username}")
            continue

        print(f"Match found for username: {username}")
        output_data.append({
            "username": username,
            "text": profile["text"],
            "category": category.lower(),
        })

# Debugging: Print the number of matches found
print(f"Total matches found: {len(output_data)}")

# Save the merged data to the output JSON file
with open(output_json_path, "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, ensure_ascii=False, indent=4)

print(f"Merged data saved to {output_json_path}")


Sample of loaded usernames from JSON: ['deparmedya', 'beyazyakaliyiz', 'kafesfirin', 'vimerang', 'totalenergies_istasyonlari', 'konforyatak', 'ht_kulup', 'ajansspor', 'yusufelibelediyesi08', '4bros.tr']
Match found for username: taskirancemal
Match found for username: tam_kararinda
Match found for username: spart4nn
Match found for username: sosyalyiyiciler
Match found for username: sonaydizdarahad
Match found for username: somersivrioglu
Match found for username: sinankoc
Match found for username: simulasyonturk
Match found for username: savas_karakas_sudaki_izler
Match found for username: sakinenurunannesi
Match found for username: ruyabuyuktetik
Match found for username: raykakumru
Match found for username: pintipanda
Match found for username: pinarindepresyonu
Match found for username: pinarhotic
Match found for username: pinardonmez_
Match found for username: ozgeninoltasi
Match found for username: nayaozgun
Match found for username: nataliyarcan
Match found for username: muthisps