In [2]:
import random
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
import os
from sklearn.ensemble import RandomForestRegressor
import warnings


# Download Turkish stopwords
nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

dataframe = pd.read_json('training-dataset.jsonl', lines=True)
df = pd.json_normalize(dataframe['profile'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\serha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:





df = df.drop('is_private', axis=1)
df = df.drop('profile_pic_url', axis=1)
df = df.drop('profile_picture_base64', axis=1)
df = df.drop('business_phone_number', axis=1)
df = df.drop('eimu_id', axis=1)
df = df.drop('fbid', axis=1)
df = df.drop('fb_profile_biolink', axis=1)
df = df.drop('id', axis=1)
df = df.drop('is_professional_account', axis=1)
df = df.drop('ai_agent_type', axis=1)
df = df.drop('restricted_by_viewer', axis=1)
df = df.drop('business_email', axis=1)
df = df.drop('is_regulated_c18', axis=1)
df = df.drop('entities', axis=1)
df = df.drop('overall_category_name', axis=1)


uniform_columns = []

for column in df.columns:
    if df[column].nunique() == 1:
        uniform_columns.append(column)
for column in uniform_columns:
    df = df.drop(column, axis=1)


df['external_url_bool'] = df['external_url'].apply(lambda x: x is not None)
df = df.drop('external_url', axis=1)

df['contact_call'] = df['business_contact_method'] == 'CALL'
df['contact_unknown'] = df['business_contact_method'] == 'UNKNOWN'
df['contact_text'] = df['business_contact_method'] == 'TEXT'
df = df.drop('business_contact_method', axis=1)
df = df.drop('bio_links', axis=1)
df = df.drop('business_address_json', axis=1) 
df = df.drop('category_enum', axis=1) 
columns_to_combine = ['full_name','biography', 'category_name', 'business_category_name']  
df['combined_text'] = df[columns_to_combine].fillna('').agg(' '.join, axis=1)
df = df.drop('full_name', axis=1)
df = df.drop('biography', axis=1)
df = df.drop('category_name', axis=1)
df = df.drop('business_category_name', axis=1)

In [4]:
# Burayı yapmana gerek yok
# Define your custom vocabulary
custom_vocab = ['#example', '🙂', 'özel', 'kelime']  # Replace with your custom words

# Define the preprocessing function
def preprocess_text(text: str):
    # Lowercase Turkish text using casefold
    text = text.casefold()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the 'combined_text' column
df['combined_text'] = df['combined_text'].apply(preprocess_text)

# Initialize TfidfVectorizer with Turkish stopwords and max features
vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=2000)

# Fit and transform the 'combined_text' column
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# Get feature names (the dynamically selected top words)
dynamic_features = vectorizer.get_feature_names_out()

# Combine dynamic features with custom vocabulary, ensuring no duplicates
final_vocab = list(set(dynamic_features).union(custom_vocab))

# Reinitialize TfidfVectorizer with the combined vocabulary
final_vectorizer = TfidfVectorizer(vocabulary=final_vocab)

# Transform the text again with the combined vocabulary
final_tfidf_matrix = final_vectorizer.fit_transform(df['combined_text'])

# Convert to DataFrame for readability
tfidf_df = pd.DataFrame(final_tfidf_matrix.toarray(), columns=final_vectorizer.get_feature_names_out())


df = pd.concat([df, tfidf_df], axis=1)
df = df.drop('combined_text', axis=1)


In [5]:

#burayı yapmana gerek yok
# Load and preprocess data (same as before)
labels_data = pd.read_csv('train-classification.csv', header=None, names=['username', 'Category'])
labels_data['Category'] = labels_data['Category'].str.lower()
merged_data = df.merge(labels_data, left_on='username', right_on='username')

X = merged_data.drop(columns=['username', 'Category'])
y = merged_data['Category']

# Convert non-numeric features to numeric
X = pd.get_dummies(X, drop_first=True)
X = X.fillna(0)

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Balance classes using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Scale features for better optimization
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Train SGDClassifier with class weights
clf = SGDClassifier(random_state=42, max_iter=1000, tol=1e-3, class_weight='balanced')
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Print results
print("Accuracy:", accuracy_score(y_test_decoded, y_pred_decoded))
print("Classification Report:\n", classification_report(y_test_decoded, y_pred_decoded))

# In other rounds, maybe try implementing the posts texts in the df for this and see if it improves.


Accuracy: 0.8228962818003914
Classification Report:
                       precision    recall  f1-score   support

                 art       0.86      0.94      0.90       102
       entertainment       0.64      0.75      0.69       102
             fashion       0.83      0.84      0.83       102
                food       0.74      0.72      0.73       103
              gaming       0.98      1.00      0.99       102
health and lifestyle       0.75      0.50      0.60       102
    mom and children       0.92      0.94      0.93       103
              sports       0.95      0.97      0.96       102
                tech       0.78      0.77      0.78       102
              travel       0.78      0.79      0.79       102

            accuracy                           0.82      1022
           macro avg       0.82      0.82      0.82      1022
        weighted avg       0.82      0.82      0.82      1022



In [6]:
#burdan başlıyor
df_posts = pd.json_normalize(dataframe['posts'])

def unpack_nested_column(df, column_name):
    unpacked_df = pd.json_normalize(df[column_name])
    unpacked_df.columns = [f"{column_name}_{subcol}" for subcol in unpacked_df.columns]
    return unpacked_df

unpacked_dfs = []
for col in df_posts.columns:
    if isinstance(df_posts[col].iloc[0], dict):
        unpacked_df = unpack_nested_column(df_posts, col)
        df_posts = df_posts.drop(columns=[col]).join(unpacked_df)
    else:
        unpacked_dfs.append(df_posts[[col]])

if unpacked_dfs:
    non_nested_df = pd.concat(unpacked_dfs, axis=1)
    result_df = pd.concat([non_nested_df, df_posts], axis=1)
else:
    result_df = df_posts

df_posts = result_df
df_posts = df_posts.drop(columns=[col for col in df_posts.columns if 'media_url' in col])
for i in range(35):
    timestamp_col = f'{i}_timestamp'
    if timestamp_col in df_posts.columns:
        df_posts[timestamp_col] = pd.to_datetime(df_posts[timestamp_col], format='%Y-%m-%d %H:%M:%S')
        df_posts[f'{i}_yearPost'] = df_posts[timestamp_col].dt.year
        df_posts[f'{i}_monthPost'] = df_posts[timestamp_col].dt.month
        df_posts = df_posts.drop(columns=[timestamp_col])


  df_posts[f'{i}_monthPost'] = df_posts[timestamp_col].dt.month
  df_posts[f'{i}_yearPost'] = df_posts[timestamp_col].dt.year
  df_posts[f'{i}_monthPost'] = df_posts[timestamp_col].dt.month
  df_posts[f'{i}_yearPost'] = df_posts[timestamp_col].dt.year
  df_posts[f'{i}_monthPost'] = df_posts[timestamp_col].dt.month
  df_posts[f'{i}_yearPost'] = df_posts[timestamp_col].dt.year
  df_posts[f'{i}_monthPost'] = df_posts[timestamp_col].dt.month


In [7]:
for i in range(35):
    timestamp_col = f'{i}_timestamp'
    year_col = f'{i}_yearPost'
    month_col = f'{i}_monthPost'

    if timestamp_col in df_posts.columns:
        df_posts[timestamp_col] = pd.to_datetime(df_posts[timestamp_col], format='%Y-%m-%d %H:%M:%S')
        df_posts[year_col] = df_posts[timestamp_col].dt.year
        df_posts[month_col] = df_posts[timestamp_col].dt.month
        df_posts = df_posts.drop(columns=[timestamp_col])

new_column_order = []

for i in range(35):
    caption_col = f'{i}_caption'
    id_col = f'{i}_id'
    comments_count_col = f'{i}_comments_count'
    like_count_col = f'{i}_like_count'
    media_type_col = f'{i}_media_type'
    year_col = f'{i}_yearPost'
    month_col = f'{i}_monthPost'
    
    if caption_col in df_posts.columns:
        new_column_order.append(caption_col)
    if id_col in df_posts.columns:
        new_column_order.append(id_col)
    if comments_count_col in df_posts.columns:
        new_column_order.append(comments_count_col)
    if like_count_col in df_posts.columns:
        new_column_order.append(like_count_col)
    if media_type_col in df_posts.columns:
        new_column_order.append(media_type_col)
    if year_col in df_posts.columns:
        new_column_order.append(year_col)
    if month_col in df_posts.columns:
        new_column_order.append(month_col)

remaining_columns = [col for col in df_posts.columns if col not in new_column_order]
new_column_order.extend(remaining_columns)

df_posts = df_posts[new_column_order]


In [8]:


# Preprocess text function
def preprocess_text(text: str):
    text = text.casefold()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Ensure nltk stopwords are downloaded
nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

# Directory to save TF-IDF vectorizers
tfidf_directory = 'tfidf_vectorizers'
os.makedirs(tfidf_directory, exist_ok=True)

# Function to transform media_types into separate columns and preprocess captions
def transform_and_preprocess(df, username):
    # Convert media_types to separate columns
    df['media_video'] = df['media_type'] == 'VIDEO'
    df['media_image'] = df['media_type'] == 'IMAGE'
    df['media_album'] = df['media_type'] == 'CAROUSEL_ALBUM'
    df = df.drop(columns=['media_type'])
    
    # Preprocess caption text and filter out empty captions
    df['caption'] = df['caption'].apply(preprocess_text)
    df = df[df['caption'].str.strip() != '']
    
    # Check if there are any non-empty captions left after preprocessing
    if df['caption'].empty or all(df['caption'].apply(lambda x: len(x.split())) == 0):
        return df.drop(columns=['caption'])
    
    try:
        # Apply TF-IDF to the non-empty captions
        tfidf = TfidfVectorizer(max_features=150, stop_words=turkish_stopwords)
        tfidf_matrix = tfidf.fit_transform(df['caption'])
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[word for word in tfidf.get_feature_names_out()])
        
        # Save the TF-IDF vectorizer to a file
        with open(os.path.join(tfidf_directory, f'{username}_tfidf_vectorizer.pkl'), 'wb') as file:
            pickle.dump(tfidf, file)
        
        # Combine TF-IDF features with the original DataFrame
        df = df.reset_index(drop=True)
        tfidf_df = tfidf_df.reset_index(drop=True)
        df = pd.concat([df, tfidf_df], axis=1)
        df = df.drop(columns=['caption'])
    except ValueError as e:
        print(f"TF-IDF Error: {e} - Skipping TF-IDF for this DataFrame")
        df = df.drop(columns=['caption'])
    
    return df

# Function to create user post dataframes
def create_user_post_dataframes(df, df_posts):
    user_dataframes = {}  # Dictionary to store dataframes for each username
    
    for index, row in df.iterrows():
        username = row['username']  # Get the username for this row
        
        # Extract all columns related to this user from df_posts
        user_post_data = {
            'caption': [],
            'id': [],
            'comments_count': [],
            'like_count': [],
            'media_type': [],
            'yearPost': [],  # Changed to yearPost
            'monthPost': []  # Changed to monthPost
        }
        
        for i in range(34):  # Assuming up to 34 posts
            caption_col = f'{i}_caption'
            id_col = f'{i}_id'
            comments_count_col = f'{i}_comments_count'
            like_count_col = f'{i}_like_count'
            media_type_col = f'{i}_media_type'
            yearPost_col = f'{i}_yearPost'
            monthPost_col = f'{i}_monthPost'
            
            # Ensure all required columns exist in df_posts
            if caption_col in df_posts.columns and id_col in df_posts.columns and comments_count_col in df_posts.columns and like_count_col in df_posts.columns and media_type_col in df_posts.columns and yearPost_col in df_posts.columns and monthPost_col in df_posts.columns:
                # Add the values for this post
                caption = df_posts.loc[index, caption_col]
                post_id = df_posts.loc[index, id_col]
                comments_count = df_posts.loc[index, comments_count_col]
                like_count = df_posts.loc[index, like_count_col]
                media_type = df_posts.loc[index, media_type_col]
                yearPost = df_posts.loc[index, yearPost_col]
                monthPost = df_posts.loc[index, monthPost_col]
                
                # Check if the row is not empty
                if pd.notna(caption) and pd.notna(post_id) and pd.notna(comments_count) and pd.notna(like_count) and pd.notna(media_type) and pd.notna(yearPost) and pd.notna(monthPost):
                    user_post_data['caption'].append(caption)
                    user_post_data['id'].append(post_id)
                    user_post_data['comments_count'].append(comments_count)
                    user_post_data['like_count'].append(like_count)
                    user_post_data['media_type'].append(media_type)
                    user_post_data['yearPost'].append(yearPost)  # Append to yearPost
                    user_post_data['monthPost'].append(monthPost)  # Append to monthPost

        # Create a dataframe for this user with the extracted data
        user_df = pd.DataFrame(user_post_data)
        
        # Transform media_types and preprocess captions
        if not user_df.empty:
            user_df = transform_and_preprocess(user_df, username)
        
        # Save it to the dictionary if it's not empty
        if not user_df.empty:
            user_dataframes[username] = user_df
    
    return user_dataframes

# Usage
user_dataframes = create_user_post_dataframes(df, df_posts)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\serha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TF-IDF Error: empty vocabulary; perhaps the documents only contain stop words - Skipping TF-IDF for this DataFrame
TF-IDF Error: empty vocabulary; perhaps the documents only contain stop words - Skipping TF-IDF for this DataFrame


In [9]:
input_df = pd.read_json('test-regression-round3.jsonl', lines=True)
input_df['timestamp'] = pd.to_datetime(input_df['timestamp'])
input_df['yearPost'] = input_df['timestamp'].dt.year
input_df['monthPost'] = input_df['timestamp'].dt.month
input_df = input_df.drop(columns=['timestamp'])
input_df = input_df.drop(columns=['media_url'])
input_df['media_video'] = input_df['media_type'] == 'VIDEO'
input_df['media_image'] = input_df['media_type'] == 'IMAGE'
input_df['media_album'] = input_df['media_type'] == 'CAROUSEL_ALBUM'
input_df = input_df.drop(columns=['media_type'])
input_df['like_count'] = float('nan')  
input_df = input_df[['id', 'comments_count', 'like_count', 'yearPost', 'monthPost', 'media_video', 'media_image', 'media_album', 'username', 'caption']]
input_df['caption'] = input_df.apply(lambda row: row['username'] if pd.isna(row['caption']) else row['caption'], axis=1)

In [12]:
#final hali

import pandas as pd
import numpy as np
import os
import pickle
import warnings
import json
from sklearn.ensemble import RandomForestRegressor

# Load test data with exact ID handling
test_df = pd.read_json('test-regression-round3.jsonl', lines=True, dtype={'id': str})

# Process timestamp for test data
test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
test_df['yearPost'] = test_df['timestamp'].dt.year
test_df['monthPost'] = test_df['timestamp'].dt.month
test_df = test_df.drop(columns=['timestamp'])
test_df = test_df.drop(columns=['media_url'])

# Process media types for test data
test_df['media_video'] = test_df['media_type'] == 'VIDEO'
test_df['media_image'] = test_df['media_type'] == 'IMAGE'
test_df['media_album'] = test_df['media_type'] == 'CAROUSEL_ALBUM'
test_df = test_df.drop(columns=['media_type'])

# Add NaN like_count column to test data
test_df['like_count'] = float('nan')

# Reorder columns to match training data
test_df = test_df[['id', 'comments_count', 'like_count', 'yearPost', 'monthPost', 
                   'media_video', 'media_image', 'media_album', 'username', 'caption']]

warnings.filterwarnings("ignore", category=FutureWarning)

# Create list to store predictions in order
predictions = []

# Process only test data rows
for i in range(len(test_df)):
    try:
        # Get the test row
        test_row = test_df.iloc[i]
        username = test_row['username']
        row_id = str(test_row['id'])  # Convert to string for exact matching
        caption = test_row['caption']

        tfidf_file_path = os.path.join(tfidf_directory, f'{username}_tfidf_vectorizer.pkl')

        # Skip if username is missing or TF-IDF file doesn't exist
        if not username or not os.path.exists(tfidf_file_path):
            predictions.append((row_id, 0))
            continue

        # Get only training data for this user
        training_df = user_dataframes.get(username)
        
        # Skip if no training data exists
        if training_df is None or training_df.empty:
            predictions.append((row_id, 0))
            continue

        # Skip if the training data doesn't have like_count values
        if 'like_count' not in training_df.columns or training_df['like_count'].isna().all():
            predictions.append((row_id, 0))
            continue

        # Only use rows with valid like_count for training
        training_df = training_df.dropna(subset=['like_count'])

        # Skip if insufficient training data
        if len(training_df) < 2:
            predictions.append((row_id, 0))
            continue

        # Load and apply TF-IDF transformation
        with open(tfidf_file_path, 'rb') as file:
            tfidf_vectorizer = pickle.load(file)

        # Transform test row
        if (caption):
            preprocessed_caption = preprocess_text(caption)
        
        
        tfidf_vector = tfidf_vectorizer.transform([preprocessed_caption])
        tfidf_df = pd.DataFrame(tfidf_vector.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
        
        # Prepare test features
        transformed_row = pd.concat([
            test_row.drop(labels=['caption']).to_frame().T.reset_index(drop=True),
            tfidf_df.reset_index(drop=True)
        ], axis=1)

        # Ensure column alignment with training data
        for col in training_df.columns:
            if col not in transformed_row.columns:
                transformed_row[col] = 0

        # Prepare features
        exclude_cols = {'like_count', 'id', 'username'}
        features = [col for col in training_df.columns if col not in exclude_cols]

        # Prepare training data
        train_features = training_df[features].fillna(0)
        train_target = training_df['like_count']

        # Prepare test features
        test_features = transformed_row[features].fillna(0)

        # Ensure all features are numeric
        train_features = train_features.apply(pd.to_numeric, errors='coerce').fillna(0)
        test_features = test_features.apply(pd.to_numeric, errors='coerce').fillna(0)

        # Align columns
        common_cols = list(set(train_features.columns) & set(test_features.columns))
        train_features = train_features[common_cols]
        test_features = test_features[common_cols]

        # Train and predict
        model = RandomForestRegressor(random_state=42)
        model.fit(train_features, train_target)
        predicted_like_count = int(round(max(0, model.predict(test_features)[0])))

        predictions.append((row_id, predicted_like_count))

    except Exception as e:
        predictions.append((row_id, 0))
        if "Input contains NaN" not in str(e):
            print(f"Error for row {row_id}: {str(e)}")

# Write predictions to JSONL file maintaining original order
with open('prediction-regression-round3.jsonl', 'w') as f:
    for i, (id_, like_count) in enumerate(predictions):
        line = f'    "{id_}": {like_count}'
        if i < len(predictions) - 1:
            line += ','
        f.write(line + '\n')

print("Predictions have been written to prediction-regression-round3.jsonl")

Predictions have been written to prediction-regression-round3.jsonl
