In [None]:
# randomized search
import pandas as pd
import gzip
import json
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.preprocessing import StandardScaler
import string
import unicodedata
import nltk
from nltk.corpus import stopwords
!pip install stanza
import stanza

# For RandomizedSearchCV parameter sampling
from scipy.stats import randint

# Fetch the NLTK resources used by this script.
nltk.download('punkt')
nltk.download('stopwords')

# Download the Turkish Stanza models and build a tokenizer-only pipeline.
stanza.download('tr')
nlp = stanza.Pipeline('tr', processors='tokenize', tokenize_no_ssplit=True)

# Turkish stopwords, held in a set.
turkish_stopwords = {*stopwords.words('turkish')}

# Input file locations
labels_path = "/content/drive/My Drive/Colab Notebooks/train-classification.csv"
features_path = "/content/drive/My Drive/Colab Notebooks/training-dataset.jsonl"
test_path = "/content/drive/My Drive/Colab Notebooks/test-classification-round3.dat"

# 1. Load Features: every line of the file is parsed as one JSON document.
with open(features_path, 'rt', encoding='utf-8') as file:
    features = [json.loads(line) for line in file]
features_df = pd.DataFrame(features)
print("Features loaded.")

# 2. Load Labels
labels_df = pd.read_csv(labels_path)

# Category name -> integer code, numbered in the order listed below.
category_mapping = {
    name: code
    for code, name in enumerate([
        'Mom and Children',
        'Food',
        'Travel',
        'Gaming',
        'Tech',
        'Health and Lifestyle',
        'Fashion',
        'Sports',
        'Entertainment',
        'Art',
    ])
}

# Replace category names with their codes; names absent from the mapping
# become NaN and those rows are dropped.
labels_df = labels_df.assign(
    label=labels_df['label'].map(category_mapping)
).dropna(subset=['label'])
print("Labels loaded and mapped.")

def preprocess_caption(caption):
    """Return a normalized, lowercase, whitespace-collapsed copy of *caption*.

    The steps run in this order:

    1. NFKC Unicode normalization followed by case folding.
    2. Removal of URL-like tokens (anything starting with ``http`` or ``www``).
    3. Removal of every character that is not a lowercase ASCII letter, one of
       ``çğıöşü``, a digit, whitespace, ``#`` or ``@``.
    4. Removal of digit runs.
    5. Collapsing whitespace runs to a single space and trimming the ends.
    """
    text = unicodedata.normalize('NFKC', caption).casefold()

    # (pattern, replacement, flags) applied strictly in sequence.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),
        (r'[^a-zçğıöşü0-9\s#@]', '', 0),
        (r'\d+', '', 0),
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip()

def count_food_words_exact(posts, food_words):
    """Count caption tokens across *posts* that exactly match a food word.

    Each caption is cleaned with ``preprocess_caption``, tokenized with the
    module-level Stanza pipeline ``nlp``, and every token that is not in
    ``turkish_stopwords`` is compared against *food_words*.

    Args:
        posts: List of post dicts; each may carry a ``'caption'`` string.
            Anything that is not a list yields 0.
        food_words: Iterable of words to match against individual tokens.

    Returns:
        Total number of matching tokens over all captions (an ``int``).
    """
    if not isinstance(posts, list):
        return 0  # Return 0 if posts is not a list

    # Build the lookup once: set membership is O(1), whereas testing against
    # the original list is a linear scan for every token.
    food_lookup = set(food_words)
    if not food_lookup:
        return 0  # nothing can match, so skip tokenization entirely

    count = 0
    for post in posts:
        # Malformed entries (None, strings, ...) are skipped rather than
        # failing on `.get`, in line with the other guards in this function.
        if not isinstance(post, dict):
            continue

        caption = post.get('caption', '')
        if not isinstance(caption, str):
            continue  # Skip if caption is not a string

        caption = preprocess_caption(caption)
        if not caption:
            continue  # nothing left to tokenize after cleaning

        doc = nlp(caption)
        count += sum(
            1
            for sent in doc.sentences
            for word in sent.words
            if word.text not in turkish_stopwords and word.text in food_lookup
        )

    return count

# 3. Merge Features and Labels
# Lift the username out of the nested profile dict so it can serve as the join key.
features_df['username'] = features_df['profile'].apply(lambda x: x['username'])
# NOTE(review): 'Unnamed: 0' is presumably the header-less first column of the
# label CSV holding usernames - confirm against the file.
# The inner join keeps only users that have both features and a label.
merged_data = features_df.merge(labels_df, left_on='username', right_on='Unnamed: 0', how='inner')


# 4. Feature Extraction
def _profile_count(profile, field):
    """Return ``profile[field]``, or 0 when the field is missing or null."""
    value = profile.get(field, 0)
    # `.get` only covers a missing key; an explicit null would otherwise put
    # None into a numeric feature column.
    return 0 if value is None else value


merged_data['followers_count'] = merged_data['profile'].apply(lambda x: _profile_count(x, 'followers_count'))
merged_data['following_count'] = merged_data['profile'].apply(lambda x: _profile_count(x, 'following_count'))
merged_data['media_count'] = merged_data['profile'].apply(lambda x: _profile_count(x, 'media_count'))
# Non-list `posts` values count as zero posts, matching how the helper
# functions in this file treat them.
merged_data['num_posts'] = merged_data['posts'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# 🟢 **Integration Start: Additional Food Words + Extra Numeric Features**

# 4.1. Define an Expanded List of Food-Related Words (Turkish & English)
# This vocabulary is passed to count_food_words_exact, which compares each
# entry against individual caption tokens.
# NOTE(review): the multi-word entries ('meyve suyu', 'sütlü tatlı',
# 'stuffed grape leaves', 'fruit juice', 'ice cream', 'milk dessert')
# presumably never match because matching is per token - confirm against the
# tokenizer output.
# 'kebab', 'pizza', 'hamburger' and 'tiramisu' appear in both halves; the
# duplicates do not affect membership tests.
food_words = [
    # Turkish Food Words
    'elma', 'ekmek', 'et', 'balık', 'peynir', 'çorba', 'salata', 'kahve', 'çay',
    'tavuk', 'pirinç', 'makarna', 'süt', 'yumurta', 'sebze', 'meyve', 'biber',
    'zeytin', 'hamsi', 'köfte', 'kebab', 'dolma', 'lahana', 'patates', 'soğan',
    'domates', 'salça', 'biberiye', 'tarçın', 'şeker', 'tuz', 'baharat', 'meyve suyu',
    'fırın', 'börek', 'tatlı', 'dondurma', 'pizza', 'hamburger', 'sosis', 'döner',
    'sütlü tatlı', 'tiramisu', 'tarif', 'baliktarifi', 'tarifini', 'tariflerimiz', 'tarifimizle',
    'baklava', 'lokma', 'mücver', 'gözleme', 'çılbır', 'şiş', 'lahmacun', 'cacık',
    'midye', 'künefe', 'kadayıf', 'lokum', 'büryan', 'tantuni', 'menemen',

    # English Food Words
    'apple', 'bread', 'meat', 'fish', 'cheese', 'soup', 'salad', 'coffee', 'tea',
    'chicken', 'rice', 'pasta', 'milk', 'egg', 'vegetable', 'fruit', 'pepper',
    'olive', 'herring', 'kofta', 'kebab', 'stuffed grape leaves', 'cabbage', 'potato', 'onion',
    'tomato', 'paste', 'rosemary', 'cinnamon', 'sugar', 'salt', 'spice', 'fruit juice',
    'oven', 'borek', 'dessert', 'ice cream', 'pizza', 'hamburger', 'sausage', 'doner',
    'milk dessert', 'tiramisu', 'recipe', 'fishrecipe', 'recipei', 'recipes', 'recipewith',
    'cupcake', 'brownie', 'pancake', 'waffle', 'bagel', 'sushi', 'taco', 'burrito',
    'guacamole', 'shawarma', 'falafel', 'steak', 'lamb'
]

# 4.2. Additional Numeric Features: average likes & comments
def _mean_post_count(posts, edge_key):
    """Average ``post[edge_key]['count']`` over *posts*.

    A post that lacks the key, is not subscriptable, or carries a non-numeric
    count contributes 0 to the average. Returns 0 when *posts* is not a list
    or is empty.
    """
    if not isinstance(posts, list) or not posts:
        return 0

    counts = []
    for post in posts:
        try:
            value = post[edge_key]['count']
        except (KeyError, TypeError, IndexError):
            # Missing key, or post / post[edge_key] is not subscriptable
            # (for example None).
            value = 0
        # A null or non-numeric count would make np.mean raise; treat it like
        # a missing count.
        counts.append(value if isinstance(value, (int, float)) else 0)
    return np.mean(counts)


def mean_likes(posts):
    """Return the mean of ``post['edge_liked_by']['count']`` over *posts*.

    Posts without a usable count contribute 0; a non-list or empty *posts*
    yields 0.

    NOTE(review): the key name follows the original author's guess at the
    post schema - confirm against the dataset. If posts never carry this key,
    every user averages 0.
    """
    return _mean_post_count(posts, 'edge_liked_by')


def mean_comments(posts):
    """Return the mean of ``post['edge_media_to_comment']['count']`` over *posts*.

    Posts without a usable count contribute 0; a non-list or empty *posts*
    yields 0.

    NOTE(review): the key name follows the original author's guess at the
    post schema - confirm against the dataset. If posts never carry this key,
    every user averages 0.
    """
    return _mean_post_count(posts, 'edge_media_to_comment')

# Per-user engagement averages computed from each user's list of posts.
posts_column = merged_data['posts']
merged_data['like_mean'] = posts_column.apply(mean_likes)
merged_data['comment_mean'] = posts_column.apply(mean_comments)


# 4.3. Create 'food_word_count'
def _food_hits(user_posts):
    """Count food-word matches in one user's posts using the shared vocabulary."""
    return count_food_words_exact(user_posts, food_words)


merged_data['food_word_count'] = posts_column.apply(_food_hits)

# 🟢 **Integration End**

# Show the first rows of the newly added columns.
preview_columns = ['username', 'food_word_count', 'like_mean', 'comment_mean']
print("\nSample Data with 'food_word_count', 'like_mean', 'comment_mean':")
print(merged_data[preview_columns].head())

# 5. Prepare Features (X) and Target (y) for Binary Classification
# Binary target: 1 for the 'Food' category, 0 for every other category.
merged_data['binary_label'] = (merged_data['label'] == category_mapping['Food']).astype(int)


def _join_captions(posts):
    """Concatenate the string captions of *posts*, separated by single spaces."""
    if not isinstance(posts, list):
        return ''
    captions = (post.get('caption', '') for post in posts if isinstance(post, dict))
    return ' '.join(text for text in captions if isinstance(text, str))


# Combine all captions into a single cleaned text per user for TF-IDF
merged_data['all_captions'] = merged_data['posts'].apply(_join_captions).apply(preprocess_caption)

# --------------------------------------------------------------------
# 6. Train-Test Split (fixed by the usernames listed in the file at test_path)
# --------------------------------------------------------------------
# The split is decided BEFORE fitting any preprocessing so that the TF-IDF
# vocabulary/IDF weights and the scaler statistics are learned from training
# rows only (fitting them on all rows leaks test information).
# dtype=str / keep_default_na=False keep usernames such as "null" or "123"
# as the literal strings they are.
test_usernames = pd.read_csv(test_path, header=None, dtype=str, keep_default_na=False)[0].tolist()

is_test_row = merged_data['username'].isin(test_usernames)
test_data = merged_data[is_test_row].copy()
train_data = merged_data[~is_test_row].copy()

# Sort test data by the order in test_usernames. Categorical categories must
# be unique, so repeated usernames are collapsed to their first occurrence.
ordered_usernames = list(dict.fromkeys(test_usernames))
test_data['username'] = pd.Categorical(test_data['username'], categories=ordered_usernames, ordered=True)
test_data = test_data.sort_values('username')

# --------------------------------------------------------------------
# 7. TF-IDF + Feature Scaling (fit on training rows, applied to all rows)
# --------------------------------------------------------------------
tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
tfidf.fit(train_data['all_captions'])
tfidf_matrix = tfidf.transform(merged_data['all_captions'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out(), index=merged_data.index)

numeric_features = [
    'followers_count', 'following_count', 'media_count', 'num_posts',
    'food_word_count', 'like_mean', 'comment_mean'
]

# Combine numeric + TF-IDF features
X = pd.concat([merged_data[numeric_features], tfidf_df], axis=1)
y = merged_data['binary_label']

scaler = StandardScaler()
scaler.fit(X.loc[train_data.index, numeric_features])
X[numeric_features] = scaler.transform(X[numeric_features])

# Split the feature matrices; X_test/y_test follow the sorted test_data order.
X_train = X.loc[train_data.index]
y_train = y.loc[train_data.index]

X_test = X.loc[test_data.index]
y_test = y.loc[test_data.index]

print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

# --------------------------------------------------------------------
# 8. RandomizedSearchCV for Hyperparameter Tuning
# --------------------------------------------------------------------
# Search space: integer ranges are sampled through scipy's randint, the
# remaining options are picked from explicit lists.
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': randint(2, 16),
    'min_samples_leaf': randint(1, 7),
    'bootstrap': [True, False],
}

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings to sample
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train, y_train)

# 9. Use the best estimator
best_model = random_search.best_estimator_

print("Best Parameters from RandomizedSearchCV:", random_search.best_params_)
print("Best CV Score from RandomizedSearchCV:", random_search.best_score_)
print("\nModel training with best parameters (RandomizedSearchCV) completed.")

# 10. Make Predictions
# Fail early with a readable message: predicting on zero rows would raise an
# opaque ValueError from inside scikit-learn instead.
if X_test.shape[0] == 0:
    raise ValueError(
        "Test set is empty: no labelled user matches a username from test_path."
    )

y_pred_test = best_model.predict(X_test)

# 11. Evaluate the Model
print("\nTest Set Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
# `labels` is passed explicitly so the two target names always line up, even
# when only one class occurs in y_test / y_pred_test (without it the report
# raises on the size mismatch). zero_division=0 reports 0 for a class with no
# samples or no predictions instead of emitting a warning.
print(classification_report(
    y_test,
    y_pred_test,
    labels=[0, 1],
    target_names=['Not Food', 'Food'],
    zero_division=0,
))

# 12. Prepare Output (Binary Classification)
# X_test was built from test_data's index, so predictions and usernames are in
# the same row order.
usernames_test = test_data['username'].tolist()  # Preserves test file order
predicted_labels = ['Food' if label == 1 else 'Not Food' for label in y_pred_test]

output = {
    str(username): label
    for username, label in zip(usernames_test, predicted_labels)
}

# Optional: Save the output
with open('predictions.json', 'w', encoding='utf-8') as f:
    json.dump(output, f, ensure_ascii=False, indent=4)

print("\nPredictions saved to 'predictions.json'.")