In [2]:

import numpy as np
import pandas as pd
from datetime import datetime
import re
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
import json
import gzip


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
# Load training data from files
train_classification_df = pd.read_csv("/content/train-classification.csv")
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

# Unify labels to lowercase
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

# Load training dataset
# Load training data
train_data_path = "/content/drive/MyDrive/released_dataset/training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()
username2posts_test = dict()
username2profile_test = dict()

# Continuing with data loading
with gzip.open(train_data_path, "rt") as fh:
   for line in fh:
       sample = json.loads(line)
       profile = sample["profile"]
       username = profile["username"]

       if username in username2_category:
           username2posts_train[username] = sample["posts"]
           username2profile_train[username] = profile
       else:
           username2posts_test[username] = sample["posts"]
           username2profile_test[username] = profile



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
# First, let's just try a simple prediction approach
def extract_basic_features(post, username, username2posts_train):
    # Get historical posts for this user from training data
    historical_posts = username2posts_train.get(username, [])

    # Calculate basic user statistics
    historical_likes = [p.get('like_count', 0) or 0 for p in historical_posts]

    if historical_likes:
        median_likes = np.median(historical_likes)
        mean_likes = np.mean(historical_likes)
        std_likes = np.std(historical_likes) if len(historical_likes) > 1 else 1
    else:
        # If no historical data, use global statistics
        all_likes = []
        for posts in username2posts_train.values():
            all_likes.extend([p.get('like_count', 0) or 0 for p in posts])
        median_likes = np.median(all_likes)
        mean_likes = np.mean(all_likes)
        std_likes = np.std(all_likes)

    features = {
        'user_median_likes': median_likes,
        'user_mean_likes': mean_likes,
        'user_std_likes': std_likes,
        'comment_count': post.get('comments_count', 0) or 0,
        'caption_length': len(post.get('caption', '') or ''),
        'media_type': post.get('media_type', 'unknown')
    }

    return features


In [32]:
def train_model(username2posts_train):
    # Prepare training data
    print("Preparing training data...")
    train_features = []
    train_targets = []

    for username, posts in username2posts_train.items():
        for post in posts:
            features = extract_basic_features(post, username, username2posts_train)
            train_features.append(features)
            train_targets.append(post.get('like_count', 0) or 0)

    X = pd.DataFrame(train_features)
    y = np.array(train_targets)

    # Handle categorical variables
    le = LabelEncoder()
    all_media_types = list(X['media_type'].unique())
    if 'unknown' not in all_media_types:
        all_media_types.append('unknown')
    le.fit(all_media_types)
    X['media_type'] = le.transform(X['media_type'])

    # Scale numerical features
    numerical_features = [col for col in X.columns if col != 'media_type']
    scaler = StandardScaler()
    X[numerical_features] = scaler.fit_transform(X[numerical_features])

    # Log transform target
    y_log = np.log10(y + 1)

    # Model
    model = lgb.LGBMRegressor(
        objective='regression',
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=63,
        feature_fraction=0.9,
        subsample=0.8,
        min_child_samples=20,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )

    print("Training model...")
    model.fit(X, y_log)

    return model, scaler, le, numerical_features


In [29]:
def predict_batch(model, scaler, le, numerical_features, username2posts):
    X_test, y_true = extract_basic_features(username2posts)

    # Transform features
    X_test['media_type'] = le.transform(X_test['media_type'])
    X_test[numerical_features] = scaler.transform(X_test[numerical_features])

    # Predict and ensure no negative values
    y_pred_log = model.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    # Post-process predictions
    y_pred = np.maximum(0, y_pred)  # Ensure no negative values

    # Handle very low predictions as potential zeros
    zero_threshold = 0.5
    y_pred[y_pred < zero_threshold] = 0

    return y_pred, y_true



In [30]:
def log_mse_like_counts(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    print("Min true value:", np.min(y_true))
    print("Max true value:", np.max(y_true))
    print("Min pred value:", np.min(y_pred))
    print("Max pred value:", np.max(y_pred))
    print("Number of zeros in true:", np.sum(y_true == 0))
    print("Number of zeros in pred:", np.sum(y_pred == 0))

    log_y_true = np.log10(y_true + 1)  # Add 1 to handle zeros
    log_y_pred = np.log10(y_pred + 1)  # Add 1 to handle zeros

    return np.mean((log_y_true - log_y_pred) ** 2)

In [33]:
# Load data and train model
print("Training model...")
model, scaler, le, numerical_features = train_model(username2posts_train)

Training model...
Preparing training data...
Training model...


In [34]:
# Evaluate training performance
print("\nEvaluating training performance...")
train_features = []
train_targets = []

for username, posts in username2posts_train.items():
    for post in posts:
        features = extract_basic_features(post, username, username2posts_train)
        train_features.append(features)
        train_targets.append(post.get('like_count', 0) or 0)

X_train = pd.DataFrame(train_features)
X_train['media_type'] = le.transform(X_train['media_type'])
X_train[numerical_features] = scaler.transform(X_train[numerical_features])

y_train_pred_log = model.predict(X_train)
y_train_pred = np.power(10, y_train_pred_log) - 1
y_train_pred = np.maximum(0, y_train_pred)

train_score = log_mse_like_counts(train_targets, y_train_pred)
print(f"Training Log MSE: {train_score}")



Evaluating training performance...
Min true value: 0.0
Max true value: 4246897.0
Min pred value: 0.0
Max pred value: 1327989.5824801642
Number of zeros in true: 3745
Number of zeros in pred: 534
Training Log MSE: 0.07845384875325681


In [39]:
# Read and process test data

#@title Test Dataset
path = "/content/test-regression-round3.jsonl"
output_path = "/content/prediction-regression-round.json"

test_data = []
test_features = []

print("\nProcessing test data...")
with open(path, "rt") as fh:
    for line in fh:
        post = json.loads(line)
        test_data.append(post)
        features = extract_basic_features(post, post['username'], username2posts_train)
        test_features.append(features)

print("\nFeature statistics before scaling:")
print(pd.DataFrame(test_features).describe())

# Convert to DataFrame and transform features
X_test = pd.DataFrame(test_features)
X_test['media_type'] = le.transform(X_test['media_type'])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

print("\nFeature statistics after scaling:")
print(X_test.describe())

# Make predictions
y_pred_log = model.predict(X_test)
y_pred = np.power(10, y_pred_log) - 1
y_pred = np.maximum(0, y_pred)

predictions = []
for post, pred in zip(test_data, y_pred):
    predictions.append({
        "id": post["id"],
        "username": post["username"],
        "like_count": int(pred)
    })

with open(output_path, "wt") as of:
    of.write("[\n")
    for i, pred in enumerate(predictions):
        of.write("  {\n")  
        of.write(f'    "id": "{pred["id"]}",\n')
        of.write(f'    "username": "{pred["username"]}",\n')
        of.write(f'    "like_count": {pred["like_count"]}\n')  
        of.write("  }")  

        if i < len(predictions) - 1:
            of.write(",\n")
        else:
            of.write("\n")

    of.write("]\n")  # End array

print("Done!")




Processing test data...

Feature statistics before scaling:
       user_median_likes  user_mean_likes  user_std_likes  comment_count  \
count        3000.000000      3000.000000     3000.000000    3000.000000   
mean         2623.132833      6131.153860    24306.333026      87.065667   
std         22282.271194     25307.478963    26420.180346     751.826755   
min             0.000000         0.000000        0.000000       0.000000   
25%            56.750000        77.614286       54.008862       0.000000   
50%            61.000000      5840.381602    43763.476046       1.000000   
75%            61.000000      5840.381602    43763.476046       8.000000   
max        762120.000000    882625.171429   479962.962664   26433.000000   

       caption_length  
count     3000.000000  
mean       297.234000  
std        329.229836  
min          0.000000  
25%         90.000000  
50%        203.000000  
75%        379.000000  
max       2197.000000  

Feature statistics after scaling:
   

In [38]:
# Print prediction statistics
print("\nPrediction Statistics:")
print(f"Number of predictions: {len(y_pred)}")
print(f"Mean prediction: {np.mean(y_pred):.2f}")
print(f"Median prediction: {np.median(y_pred):.2f}")
print(f"Number of zeros: {np.sum(y_pred == 0)}")
print(f"Max prediction: {np.max(y_pred):.2f}")


Prediction Statistics:
Number of predictions: 3000
Mean prediction: 2976.77
Median prediction: 36.46
Number of zeros: 7
Max prediction: 768326.54
