In [2]:
import numpy as np
import pandas as pd
import gzip
import json

from pprint import pprint

In [3]:
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
train_classification_df = pd.read_csv("/content/train-classification.csv",)
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

# Unifying labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

In [5]:
# stats about the labels
train_classification_df.groupby("category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,191
entertainment,323
fashion,299
food,511
gaming,13
health and lifestyle,503
mom and children,149
sports,113
tech,346
travel,294


In [6]:
username2_category["kod8net"]

'tech'

In [7]:
train_data_path = "/content/training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()


with gzip.open(train_data_path, "rt") as fh:
  for line in fh:
    sample = json.loads(line)

    profile = sample["profile"]
    username = profile["username"]
    if username in username2_category:
      # train data info
      username2posts_train[username] = sample["posts"]
      username2profile_train[username] = profile


    else:
      # it is test data info
      username2posts_test[username] = sample["posts"]
      username2profile_test[username] = profile


In [8]:
# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

train_profile_df.head(2)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,Local business,,1167,192,True,False,...,,,LOCAL,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,kafesfirin,266439571,KAFES FIRIN,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,Brand,,11997,17,True,False,...,,,BRAND,False,False,https://instagram.fada1-13.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [9]:
test_profile_df.head(2)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,Personal blog,,1265,665,True,False,...,,,PERSONAL_BLOG,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,totalenergies_istasyonlari,7066643793,TotalEnergies İstasyonları,TotalEnergies İstasyonları resmi Instagram hes...,Energy Company,,28025,4,True,False,...,,,ENERGY_COMPANY,False,False,https://instagram.fsaw2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def preprocess_text(text: str):
    # lower casing Turkish Text, Don't use str.lower :)
    text = text.casefold()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and punctuation
    # HERE THE EMOJIS stuff are being removed, you may want to keep them :D
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


corpus = []

# to keep the label order
train_usernames = []

for username, posts in username2posts_train.items():
  train_usernames.append(username)

  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)


  # joining the posts of each user with a \n
  user_post_captions = "\n".join(cleaned_captions)
  corpus.append(user_post_captions)


vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=5000)

# fit the vectorizer
vectorizer.fit(corpus)


# transform the data into vectors
x_post_train = vectorizer.transform(corpus)
y_train = [username2_category.get(uname, "NA") for uname in train_usernames]


test_usernames = []
test_corpus = []
for username, posts in username2posts_test.items():
  test_usernames.append(username)
  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)

  user_post_captions = "\n".join(cleaned_captions)
  test_corpus.append(user_post_captions)


# Just transforming! No Fitting!!!!!
x_post_test = vectorizer.transform(test_corpus)

In [11]:
# Making sure everything is fine
assert y_train.count("NA") == 0

In [12]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['abdullah', 'abone', 'about', ..., 'şık', 'şıklık', 'şıklığı'],
      dtype=object)

In [13]:
df_tfidf = pd.DataFrame(x_post_train.toarray(), columns=feature_names)
df_tfidf.head(2)

Unnamed: 0,abdullah,abone,about,acele,acil,activities,acı,ad,ada,adam,...,şubemiz,şubesi,şölen,şöleni,şöyle,şükranla,şükür,şık,şıklık,şıklığı
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050596,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df_tfidf.shape

(2741, 5000)

In [15]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(df_tfidf, y_train, test_size=0.2, stratify=y_train)

In [16]:
x_train.shape

(2192, 5000)

In [17]:
x_val.shape

(549, 5000)

In [18]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


model = MultinomialNB()
model.fit(x_train, y_train)

In [19]:
#@title Train Data
y_train_pred = model.predict(x_train)

print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, zero_division=0))

Accuracy: 0.6747262773722628

Classification Report:
                      precision    recall  f1-score   support

                 art       0.90      0.12      0.22       153
       entertainment       0.64      0.60      0.62       258
             fashion       0.79      0.74      0.77       239
                food       0.82      0.91      0.86       409
              gaming       0.00      0.00      0.00        10
health and lifestyle       0.52      0.87      0.65       402
    mom and children       0.89      0.07      0.12       119
              sports       1.00      0.11      0.20        90
                tech       0.80      0.84      0.82       277
              travel       0.60      0.67      0.63       235

            accuracy                           0.67      2192
           macro avg       0.69      0.49      0.49      2192
        weighted avg       0.73      0.67      0.64      2192



In [20]:
#@title Validation Data
y_val_pred = model.predict(x_val)

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))

Accuracy: 0.5683060109289617

Classification Report:
                      precision    recall  f1-score   support

                 art       0.00      0.00      0.00        38
       entertainment       0.45      0.45      0.45        65
             fashion       0.73      0.58      0.65        60
                food       0.73      0.81      0.77       102
              gaming       0.00      0.00      0.00         3
health and lifestyle       0.42      0.72      0.53       100
    mom and children       1.00      0.07      0.12        30
              sports       1.00      0.04      0.08        23
                tech       0.67      0.77      0.72        69
              travel       0.55      0.63      0.59        59

            accuracy                           0.57       549
           macro avg       0.55      0.41      0.39       549
        weighted avg       0.58      0.57      0.53       549



In [21]:
#@title Test Data


# let's take a look at the first 5 lines of the file
test_data_path = "/content/test-classification-round1.dat"
!head -n 5 "$test_data_path"

print("*****")

test_unames = []
with open(test_data_path, "rt") as fh:
  for line in fh:
    test_unames.append(line.strip())

print(test_unames[:5])

ozhotelstr
elleturkiye
sozerinsaatorhangazi
sanliurfapiazzaavym
rusanozden
*****
['ozhotelstr', 'elleturkiye', 'sozerinsaatorhangazi', 'sanliurfapiazzaavym', 'rusanozden']


In [22]:
x_test = []

for uname in test_unames:
  try:
    index = test_usernames.index(uname)
    x_test.append(x_post_test[index].toarray()[0])
  except Exception as e:
    try:
      index = train_usernames.index(uname)
      x_test.append(x_post_train[index].toarray()[0])
    except Exception as e:
      print(uname)


test_unames.remove("screenname")

screenname


In [23]:
df_test = pd.DataFrame(np.array(x_test), columns=feature_names)
df_test.head(2)

Unnamed: 0,abdullah,abone,about,acele,acil,activities,acı,ad,ada,adam,...,şubemiz,şubesi,şölen,şöleni,şöyle,şükranla,şükür,şık,şıklık,şıklığı
0,0.0,0.0,0.0,0.0,0.0,0.013628,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.025994,0.0,0.0,0.011087,0.0,0.0


In [24]:
test_pred = model.predict(df_test)

output = dict()
for index, uname in enumerate(test_unames):
  output[uname] = test_pred[index]

In [25]:
with open("output.json", "w") as of:
  json.dump(output, of, indent=4)

In [26]:
def predict_like_count(username, current_post=None):
  def get_avg_like_count(posts:list):
    total = 0.
    for post in posts:
      if current_post is not None and post["id"] == current_post["id"]:
        continue

      like_count = post.get("like_count", 0)
      if like_count is None:
        like_count = 0
      total += like_count

    if len(posts) == 0:
      return 0.

    return total / len(posts)

  if username in username2posts_train:
    return get_avg_like_count(username2posts_train[username])
  elif username in username2posts_test:
    return get_avg_like_count(username2posts_test[username])
  else:
    print(f"No data available for {username}")
    return -1

In [27]:
def log_mse_like_counts(y_true, y_pred):
  """
  Calculate the Log Mean Squared Error (Log MSE) for like counts (log(like_count + 1)).

  Parameters:
  - y_true: array-like, actual like counts
  - y_pred: array-like, predicted like counts

  Returns:
  - log_mse: float, Log Mean Squared Error
  """
  # Ensure inputs are numpy arrays
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)

  # Log transformation: log(like_count + 1)
  log_y_true = np.log1p(y_true)
  log_y_pred = np.log1p(y_pred)

  # Compute squared errors
  squared_errors = (log_y_true - log_y_pred) ** 2

  # Return the mean of squared errors
  return np.mean(squared_errors)

In [28]:
#@title Train Dataset evaluation

y_like_count_train_true = []
y_like_count_train_pred = []
for uname, posts in username2posts_train.items():
  for post in posts:
    pred_val = predict_like_count(uname, post)
    true_val = post.get("like_count", 0)
    if true_val is None:
      true_val = 0

    y_like_count_train_true.append(true_val)
    y_like_count_train_pred.append(pred_val)

print(f"Log MSE Train= {log_mse_like_counts(y_like_count_train_true, y_like_count_train_pred)}")

Log MSE Train= 1.2271047744059362


In [29]:
#@title Test Dataset

path = "/content/test-regression-round1.jsonl"
output_path = "/content/test-regression-round1output.jsonl"

to_predict_like_counts_usernames = []
output_list = []
with open(path, "rt") as fh:
  for line in fh:
    sample = json.loads(line)

    # let's predict
    pred_val = predict_like_count(sample["username"])
    sample["like_count"] = int(pred_val)
    output_list.append(sample)

with open(output_path, "wt") as of:
  json.dump(output_list, of)

In [30]:
# output_list first 3 items
pprint(output_list[:3])

[{'caption': 'KOZA 2023 2.si Damla’nın koleksiyonu, Latincede ‘Memento Mori’ '
             'olarak bilinen ‘ölümlü olduğunu hatırla’ anlamındaki ifadeden '
             'esinleniyor. Koleksiyon, hayatın ve ölümün, para, işçi, kral ve '
             'kraliçe kavramları üzerinden yaratıcı görünümlerle bir araya '
             'getirilmesini amaçlıyor. Ölüm sembollerinden esinlenen desenler '
             'kullanan Damla, “kağıt parçasından ibaret olmak” kavramını '
             'vurguluyor. Koleksiyon, yaşamın ve ölümün aynı anda ifade '
             'edilmesini hedefliyor; kırmızı ve mavi ışıklarla veya '
             'gözlüklerle görülen hologram efekti kullanılarak bu konsept '
             'sahneye taşınıyor. Kırmızı renk ölümü, mavi ise yaşamı '
             'simgeliyor. Koleksiyon, ofis giyimlerinden esinlenerek '
             'kravatlar, gömlekler ve evrak çantaları içeriyor. Klasik sivri '
             'burun çizmelerin üzerine spor ayakkabıların üst yüzeyi '
             'yerle

In [31]:
import pandas as pd

# Assuming this function is used to calculate the average like count for a user
def get_average_like_count(posts):
    # Get the like_count of each post, ensuring it defaults to 0 if like_count is missing
    like_counts = [p['like_count'] if p.get('like_count') is not None else 0 for p in posts]
    # Return the average of like counts
    return sum(like_counts) / len(like_counts) if like_counts else 0

# Initialize an empty list to store the post data
post_train = []

# Process the training dataset
for uname, posts in username2posts_train.items():

    # Calculate the average like count for the user
    average_like_count = get_average_like_count(posts)

    # Iterate through each post for the user
    for p in posts:
        post = {}
        post['post_id'] = p['id']
        post['username'] = uname
        post['comments_count'] = p['comments_count']
        post['media_type'] = p['media_type']

        # If like_count exists, use it; otherwise, default to 0
        post['like_count'] = p['like_count'] if p.get('like_count') is not None else 0

        # Add the average like count of the user
        post['average_like_count_of_user'] = average_like_count

        # If caption exists, add it; otherwise, it will be missing in the DataFrame
        if p.get('caption') is not None:
            post['caption'] = p['caption']

        # Append the post data to the list
        post_train.append(post)

# Process the test dataset
for uname, posts in username2posts_test.items():
    # Check if the username is already in the training data, avoiding duplicates
    if uname not in [p['username'] for p in post_train]:

        # Calculate the average like count for the user in the test set
        average_like_count = get_average_like_count(posts)

        for p in posts:
            post = {}
            post['post_id'] = p['id']
            post['username'] = uname
            post['comments_count'] = p['comments_count']
            post['media_type'] = p['media_type']

            # Handle like_count similarly as before
            post['like_count'] = p['like_count'] if p.get('like_count') is not None else 0
            post['average_like_count_of_user'] = average_like_count

            # Add caption if exists
            if p.get('caption') is not None:
                post['caption'] = p['caption']

            # Append the test post data
            post_train.append(post)

# Convert the list of post data into a DataFrame
post_train_df = pd.DataFrame(post_train)

# Check the first few rows of the DataFrame
print(post_train_df.head())


             post_id    username  comments_count media_type  like_count  \
0  17990918969458720  deparmedya               0      IMAGE         6.0   
1  18219250732221045  deparmedya               1      VIDEO        22.0   
2  18311380465102328  deparmedya               0      VIDEO        19.0   
3  18089518138361507  deparmedya               1      VIDEO        19.0   
4  18012743929758497  deparmedya               0      VIDEO        21.0   

   average_like_count_of_user  \
0                   11.542857   
1                   11.542857   
2                   11.542857   
3                   11.542857   
4                   11.542857   

                                             caption  
0          Cumhuriyetimizin 100.yılı kutlu olsun♾️🇹🇷  
1  Oriflame Duologi Lansmanı #isveçtengelengüzell...  
2                     #oriflameilesaçbakımdevrimi ✌️  
3  ✌️#oriflameilesaçbakımdevrimi 07Agustos’23 ori...  
4            07 Agustos’23 #oriflameturkiye #duoloji  


In [56]:
from sklearn.model_selection import train_test_split #exponential mse
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# Preprocess and encode the data
label_encoder = LabelEncoder()
post_train_df['media_type'] = label_encoder.fit_transform(post_train_df['media_type'])

# TfidfVectorizer with sparse matrix
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Reduced max_features
X_caption = vectorizer.fit_transform(post_train_df['caption'].fillna(''))

# Combine numerical and text features
X_numerical = post_train_df[['comments_count', 'like_count', 'average_like_count_of_user']].values
X_combined = np.hstack([X_numerical, X_caption.toarray()])

# Define the target variable
y = post_train_df['like_count'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train the model using Linear Regression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict and evaluate
y_pred = regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error (RMSE): {rmse}')


Mean Absolute Error (MAE): 4.5159111110304586e-11
Root Mean Squared Error (RMSE): 2.5684131169854043e-10


In [57]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load and prepare the training data
# Assuming post_train_df is already defined
label_encoder = LabelEncoder()
post_train_df['media_type'] = label_encoder.fit_transform(post_train_df['media_type'])

# TfidfVectorizer for text data
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Reduce max_features if needed
X_caption = vectorizer.fit_transform(post_train_df['caption'].fillna(''))

# Combine numerical and text features
X_numerical = post_train_df[['comments_count', 'like_count', 'average_like_count_of_user']].values
X_combined = np.hstack([X_numerical, X_caption.toarray()])

# Define the target variable
y = post_train_df['like_count'].values

# Apply log transformation to the target variable (y)
y_log = np.log1p(y)  # log1p is used to handle zero values in the target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_log, test_size=0.2, random_state=42)

# Scale the features (optional but can improve model performance)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model using Linear Regression
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred_log = regressor.predict(X_test_scaled)

# Reverse the log transformation to get the original scale predictions
y_pred = np.expm1(y_pred_log)  # Apply expm1 to reverse log1p

# Calculate evaluation metrics
mae = mean_absolute_error(np.expm1(y_test), y_pred)  # Reverse log transformation for true values as well
rmse = np.sqrt(mean_squared_error(np.expm1(y_test), y_pred))

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


Mean Absolute Error (MAE): 2.6715201980721656e+16
Root Mean Squared Error (RMSE): 2.3599139574790994e+18


In [58]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Load the test data
test_data_path = "/content/test-regression-round1.jsonl"
test_data = []

# Read the test data from JSONL file
with open(test_data_path, 'r') as f:
    for line in f:
        test_data.append(json.loads(line))

# Prepare the test dataset
test_post_data = []

for sample in test_data:
    username = sample.get('username', '')  # Safely get 'username'
    caption = sample.get('caption', '')  # Safely get 'caption'
    comments_count = sample.get('comments_count', 0)  # Safely get 'comments_count'
    media_type = sample.get('media_type', '')  # Safely get 'media_type'
    post_id = sample.get('id', '')  # Safely get 'id'

    # Prepare each post as a dictionary
    post_data = {
        'post_id': post_id,
        'username': username,
        'comments_count': comments_count,
        'media_type': media_type,
        'caption': caption  # Add the caption for text-based features
    }

    test_post_data.append(post_data)

test_post_df = pd.DataFrame(test_post_data)

# Step 1: Transform 'media_type' using OrdinalEncoder with handle_unknown='use_encoded_value'
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit on the training data 'media_type' column (this step assumes you have already fitted the encoder on the training data)
# Fit it using the training set (post_train_df['media_type']) for a proper encoding
ordinal_encoder.fit(post_train_df[['media_type']])

# Apply the transformation to the test set
test_post_df['media_type'] = ordinal_encoder.transform(test_post_df[['media_type']])

# Step 2: Vectorizing the captions using the same TfidfVectorizer used for training
X_test_caption = vectorizer.transform(test_post_df['caption'].fillna(''))

# Combine numerical features (make sure the shape matches the training data)
X_test_numerical = test_post_df[['comments_count', 'media_type']].values
X_test_combined = np.hstack([X_test_numerical, X_test_caption.toarray()])

# Step 3: Ensure the number of features in X_test_combined matches X_train
# The number of features may differ if the training and test sets contain different words.
# Make sure both training and test data have the same number of features.
if X_test_combined.shape[1] != X_train.shape[1]:
    missing_columns = X_train.shape[1] - X_test_combined.shape[1]
    if missing_columns > 0:
        # Add missing columns (set to zero) to align with training data
        X_test_combined = np.hstack([X_test_combined, np.zeros((X_test_combined.shape[0], missing_columns))])

# Step 4: Scale the features using the same scaler fitted on training data
X_test_scaled = scaler.transform(X_test_combined)

# Step 5: Make predictions using the trained model
y_pred_log_test = regressor.predict(X_test_scaled)

# Reverse the log transformation to get the original scale predictions
y_pred_test = np.expm1(y_pred_log_test)  # Apply expm1 to reverse log1p

y_pred_test_int = np.round(y_pred_test).astype(int)  # Round and convert to integer

# Step 6: Save the integer predictions to a JSON file
predictions = {}
for i, post in enumerate(test_post_data):
    predictions[post['post_id']] = int(y_pred_test_int[i])  # Convert NumPy int64 to Python int

# Save the predictions to a file
prediction_output_path = "/content/new-prediction-regression-round1-integer.json"
with open(prediction_output_path, 'w') as f:
    json.dump(predictions, f, indent=4)

print("Predictions saved to new-prediction-regression-round1-integer.json")
from sklearn.metrics import mean_squared_error, r2_score

# Step 5: Round the predictions to the nearest integer and convert to Python's int
y_pred_test_int = np.round(y_pred_test).astype(int)  # Round and convert to integer



Predictions saved to new-prediction-regression-round1-integer.json
