In [23]:
import numpy as np
import pandas as pd
import gzip
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
import re
import nltk
from nltk.corpus import stopwords

In [7]:
# Data loading
train_classification_df = pd.read_csv(r"C:\Users\irmak\OneDrive\Masaüstü\412p\train-classification.csv")
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

# Unifying labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]


In [8]:
# uploas JSONL data 
train_data_path = r"C:\Users\irmak\OneDrive\Masaüstü\412p\training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()
username2posts_test = dict()
username2profile_test = dict()

with gzip.open(train_data_path, "rt") as fh:
    for line in fh:
        sample = json.loads(line)
        profile = sample["profile"]
        username = profile["username"]
        if username in username2_category:
            username2posts_train[username] = sample["posts"]
            username2profile_train[username] = profile
        else:
            username2posts_test[username] = sample["posts"]
            username2profile_test[username] = profile

In [9]:
def preprocess_text(text: str):
    if not isinstance(text, str):
        return ''
    
    # Preserve emojis - these might be important classification markers
    # Replace URLs with a special token
    text = re.sub(r'http\S+|www\S+|https\S+', '[URL]', text, flags=re.MULTILINE)
    
    # Preserve hashtags but remove the '#' symbol
    text = re.sub(r'#(\w+)', r'\1', text)
    
    # Replace mentions with a special token
    text = re.sub(r'@\w+', '[MENTION]', text)
    
    # Preserve numbers
    text = text.lower()
    
    return text


In [10]:
def extract_profile_features(username2profile):
    features = []
    for username, profile in username2profile.items():
        feature_dict = {
            'follower_count': profile.get('follower_count', 0),
            'following_count': profile.get('following_count', 0),
            'is_business_account': int(profile.get('is_business_account', False)),
            'is_private': int(profile.get('is_private', False)),
            'is_verified': int(profile.get('is_verified', False)),
            'follower_following_ratio': profile.get('follower_count', 0) / (profile.get('following_count', 1) + 1),
        }
        features.append(feature_dict)
    return pd.DataFrame(features)

In [11]:
# Prepare post texts
def prepare_post_texts(username2posts):
    texts = []
    for username, posts in username2posts.items():
        user_texts = []
        for post in posts:
            caption = post.get('caption', '')
            if caption:
                # Clean the caption text using the preprocessing function
                cleaned_caption = preprocess_text(caption)
                if cleaned_caption:
                    user_texts.append(cleaned_caption)
        # Combine all cleaned captions of a user into a single string
        texts.append(' '.join(user_texts))
    return texts


In [12]:
# Importing Turkish StopWords
import nltk
from nltk.corpus import stopwords

# Download the stopwords list from NLTK
nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\irmak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
post_texts_train = prepare_post_texts(username2posts_train)
vectorizer = TfidfVectorizer(
    stop_words=turkish_stopwords,
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)
x_post_train = vectorizer.fit_transform(post_texts_train)

In [14]:
# Profile features
profile_features_train = extract_profile_features(username2profile_train)
scaler = StandardScaler()
profile_features_scaled = scaler.fit_transform(profile_features_train)

# Combine features
x_train_combined = np.hstack([
    x_post_train.toarray(),
    profile_features_scaled
])


In [15]:
# Retrieve usernames for the training data
train_usernames = list(username2posts_train.keys())

# Label encoding
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform([username2_category.get(uname, "NA") for uname in train_usernames])


In [16]:
# Train-validation split
x_train, x_val, y_train_split, y_val = train_test_split(
    x_train_combined,
    y_train_encoded,
    test_size=0.2,
    stratify=y_train_encoded,
    random_state=42
)

In [17]:
# apply SMOTE
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train_split)

In [None]:
models = [
    ('lr', LogisticRegression(
        max_iter=1000,
        multi_class='multinomial',
        random_state=42
    )),
    ('svc', SVC(
        probability=True,  # for Soft voting probability=True
        kernel='linear',   
        random_state=42,
        max_iter=1000
    )),
    ('xgb', XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=7,
        objective='multi:softprob',
        random_state=42
    ))
]


In [25]:
# Voting Classifier
ensemble = VotingClassifier(
    estimators=models,
    voting='soft'  # soft voting
)

# Model training
ensemble.fit(x_train_resampled, y_train_resampled)



In [26]:
#  Evaluate Training Set Performance (only accuracy)
print("\nTraining Set Performance:")
y_train_pred = ensemble.predict(x_train)
train_acc = accuracy_score(y_train_split, y_train_pred)
print(f"Accuracy: {train_acc:.4f}")


Training Set Performance:
Accuracy: 0.9827


In [27]:
#Evaluate Validation Set Performance
print("\nValidation Set Performance:")
y_val_pred = ensemble.predict(x_val)
val_acc = accuracy_score(y_val, y_val_pred)
print(f"Accuracy: {val_acc:.4f}")

print("\nDetailed Report for Validation Set:")
print(
    classification_report(
        label_encoder.inverse_transform(y_val),
        label_encoder.inverse_transform(y_val_pred)
    )
)


Validation Set Performance:
Accuracy: 0.6521

Detailed Report for Validation Set:
                      precision    recall  f1-score   support

                 art       0.23      0.08      0.12        38
       entertainment       0.45      0.40      0.42        65
             fashion       0.53      0.72      0.61        60
                food       0.86      0.87      0.87       102
              gaming       0.00      0.00      0.00         3
health and lifestyle       0.62      0.79      0.70       100
    mom and children       0.71      0.40      0.51        30
              sports       0.80      0.70      0.74        23
                tech       0.71      0.75      0.73        69
              travel       0.67      0.64      0.66        59

            accuracy                           0.65       549
           macro avg       0.56      0.54      0.54       549
        weighted avg       0.63      0.65      0.63       549



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
# Preprocess test data to create 'test_usernames' and 'test_corpus'
test_corpus = prepare_post_texts(username2posts_test)
x_post_test = vectorizer.transform(test_corpus) 

profile_features_test_df = extract_profile_features(username2profile_test)  
profile_features_test_scaled = scaler.transform(profile_features_test_df)  

x_test_full = np.hstack([
    x_post_test.toarray(), 
    profile_features_test_scaled
])

test_usernames = list(username2posts_test.keys())


In [38]:
import json
from collections import Counter
import numpy as np

# 2a) Read the usernames from the file 'test-classification-round3.dat'
test_data_path = r"C:\Users\irmak\OneDrive\Masaüstü\412p\test-classification-round3.dat"
test_unames = []
with open(test_data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        test_unames.append(line.strip())

# 2b) Find the corresponding feature vector for each username
x_test = []
for uname in test_unames:
    try:
        # Check if the username is in the test set
        idx = test_usernames.index(uname)
        row_vector = x_test_full[idx]  # numpy array (1D)
        x_test.append(row_vector)
    except ValueError:
        # If not in the test set, check the training set
        try:
            idx = train_usernames.index(uname)
            row_vector = x_train_combined[idx]
            x_test.append(row_vector)
        except ValueError:
            # If not in either set, append a zero vector
            zero_vec = np.zeros((1, x_test_full.shape[1]))
            x_test.append(zero_vec[0])  # Add as 1D array using [0]

# 2c) Convert the list of arrays into a single 2D array
x_test = np.vstack(x_test)
print("x_test.shape:", x_test.shape)


x_test.shape: (1000, 5006)


In [39]:
# 3a) Make predictions using the model
test_pred = ensemble.predict(x_test)
test_pred_categories = label_encoder.inverse_transform(test_pred)

# 3b) Collect prediction results in a dictionary
output = {uname: category for uname, category in zip(test_unames, test_pred_categories)}

# 3c) Display results on the screen
print("\nPredictions:")
print("-" * 50)
for uname, category in output.items():
    print(f"{uname}: {category}")

# 3d) Display the distribution of predictions
prediction_counts = Counter(test_pred_categories)
print("\nPrediction Distribution:")
print("-" * 50)
for category, count in prediction_counts.most_common():
    print(f"{category}: {count}")

# 3e) Save predictions as a JSON file
output_json_path = r"C:\Users\irmak\OneDrive\Masaüstü\412p\prediction-classification-round3.json"
with open(output_json_path, "w", encoding='utf-8') as json_file:
    json.dump(output, json_file, ensure_ascii=False, indent=4)

print(f"\nPredictions have been successfully saved to '{output_json_path}'.")



Predictions:
--------------------------------------------------
livapastanesi: food
barisgross: food
tusasshop: fashion
etolyadigital: tech
tugrulonur: entertainment
tulugozlu: health and lifestyle
gokidy: mom and children
cengizgumus_official: fashion
krossbisiklet: sports
haribochamallows: food
ozatashipyard: travel
yenisafak: entertainment
iamsiddeshjadhav: art
burcinterzioglu: entertainment
steakhousegunaydin: food
baselifeclub: health and lifestyle
benismailyildirimm: health and lifestyle
imuneksfarma: health and lifestyle
dogakoyucatalca: food
sena.sener: fashion
kandilliborsarestaurant: food
selamiersoyy: art
deutz_fahr_turkey: food
cevaheer: fashion
tezatsanat: entertainment
filtresizcom: art
palomamarina_suites: travel
westchocolatemarina: food
sebnemcapa: travel
rozetsepeti: entertainment
ececesmioglu: fashion
ustapidecitr: food
gocaagonyali: health and lifestyle
maestro.sanat.kursu: art
oztayteksofficial: fashion
imtolstoyevski: entertainment
turkervip: travel
rustik.rus.re