In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import uuid

# Seed both RNGs: the generator code in this cell draws from `random` and `np.random`.
random.seed(42)
np.random.seed(42)

# Number of synthetic users and the inclusive range of tweets generated per user.
N_USERS = 8000
MIN_TWEETS_PER_USER = 20
MAX_TWEETS_PER_USER = 150

# Vocabulary sampled by the "stress" branch of make_tweet_text.
# NOTE(review): "alone" is listed twice, so random.choice selects it twice as
# often as any other entry — confirm this weighting is intended.
stress_keywords = [
    "stress", "anxiety", "depressed", "depression", "tired", "hopeless", "alone", "worthless",
    "panic", "overwhelmed", "can't cope", "exhausted", "helpless", "sad", "numb", "empty",
    "hurt", "broken", "struggle", "alone", "suicidal thought", "give up"
]

# Filler words; make_tweet_text pairs two of them for a neutral tweet and may
# append one after a stress keyword.
neutral_words = [
    "work", "coffee", "movie", "football", "game", "music", "lecture", "project", "lunch", "travel",
    "happy", "love", "family", "friend", "weather", "tech", "code", "study", "shopping", "birthday"
]

# Whole-sentence tweet bodies used by the positive branch.
positive_phrases = [
    "Had a great day!", "Feeling good today.", "Loved the movie I watched.", "Excited for the weekend.",
    "Grateful for my friends.", "Learning new things is fun.", "Productive day at work."
]

# Whole-sentence tweet bodies used by the negative branch (and as stress add-ons).
negative_phrases = [
    "This day has been rough.", "Feeling down.", "Really tired of everything.", "Not my day.",
    "Things are hard right now.", "I can't focus.", "Feeling low and drained."
]

# Prefix/middle/suffix layouts.
# NOTE(review): nothing visible in this notebook export references `templates`;
# make_tweet_text joins its parts directly.
templates = [
    "{prefix} {middle} {suffix}",
    "{middle} {suffix}",
    "{prefix} {middle}",
    "{middle}",
    "{prefix} {suffix}",
    "{suffix}"
]

def rand_date_within(days_back=730):
    """Return a random ISO-8601 UTC timestamp string ending in "Z".

    The timestamp is the current UTC time minus a random offset of
    0..`days_back` whole days plus 0..86400 seconds (both bounds inclusive),
    so it can lie up to one day further back than `days_back` days.
    """
    # Function-scope import: the cell imports only `datetime` and `timedelta`.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware "now"
    # and drop tzinfo so isoformat() keeps the naive layout ("...T12:34:56.789")
    # that the literal "Z" suffix is appended to.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    delta = timedelta(days=random.randint(0, days_back), seconds=random.randint(0, 86400))
    return (now - delta).isoformat() + "Z"

def make_tweet_text(is_at_risk):
    """
    Construct a synthetic tweet. Users at risk have higher chance to include stress_keywords and negative phrases.

    The tweet is assembled from an optional prefix, one body chosen by a single
    random roll (stress / negative / positive / neutral), an optional
    hashtag-or-emoticon and an optional short reply phrase. Empty pieces are
    dropped before joining with spaces, and the result is capped at 280
    characters.

    Args:
        is_at_risk: truthy for users whose tweets should skew towards the
            stress and negative branches.

    Returns:
        The tweet text as a string.
    """
    stress_prob = 0.35 if is_at_risk else 0.06
    negative_prob = 0.30 if is_at_risk else 0.08
    positive_prob = 0.08 if is_at_risk else 0.25
    # The neutral branch is whatever probability mass is left over, so it needs
    # no threshold of its own.

    parts = []
    if random.random() < 0.4:
        # "" is a valid draw: the prefix is then filtered out when joining.
        parts.append(random.choice(["FYI", "Update:", "Note:", ""]))
    mid_roll = random.random()
    if mid_roll < stress_prob:
        kw = random.choice(stress_keywords)
        addon = random.choice(negative_phrases) if random.random() < 0.6 else random.choice(neutral_words)
        parts.append(f"{kw} {addon}")
    elif mid_roll < stress_prob + negative_prob:
        parts.append(random.choice(negative_phrases))
    elif mid_roll < stress_prob + negative_prob + positive_prob:
        parts.append(random.choice(positive_phrases))
    else:
        parts.append(random.choice(neutral_words) + " " + random.choice(neutral_words))
    if random.random() < 0.25:
        parts.append(random.choice(["#life", "#mood", "#work", "#tired", ":-(", ":)"]))
    if random.random() < 0.05:
        parts.append(random.choice(["Thanks!", "Totally", "Agreed", "Same here"]))
    text = " ".join([p for p in parts if p]).strip()
    if len(text) > 280:
        # Keep the total at 280 characters including the ellipsis.
        text = text[:277] + "..."
    return text

# Cell-local import: the cell's import line only brings in `datetime` and `timedelta`.
from datetime import timezone

# Take "now" once, via the non-deprecated API, so every signup date is measured
# from the same instant instead of calling datetime.utcnow() per user.
_generation_time = datetime.now(timezone.utc).replace(tzinfo=None)

# One record per synthetic user. The RNG call order inside the loop is kept
# as-is so the seeded draws are unchanged.
users = []
for uid in range(1, N_USERS+1):
    # 15% of users are flagged as at-risk.
    self_harm_flag = np.random.choice([0,1], p=[0.85, 0.15])
    signup_days_ago = random.randint(30, 3650)
    signup_date = (_generation_time - timedelta(days=signup_days_ago)).date().isoformat()
    followers = max(0, int(np.random.exponential(scale=50)))
    total_tweets = random.randint(50, 5000)
    users.append({
        "user_id": uid,
        "self_harm_flag": int(self_harm_flag),
        "signup_date": signup_date,
        "followers": followers,
        "total_tweets_estimate": total_tweets
    })

users_df = pd.DataFrame(users)

# Generate every user's tweets as a flat list of records, numbering tweets
# globally from 1.
tweets_records = []
tweet_id_counter = 1

# itertuples() yields lightweight row tuples instead of building a Series per
# row as iterrows() does.
for u in users_df.itertuples(index=False):
    uid = int(u.user_id)
    is_at_risk = bool(u.self_harm_flag)
    n_tweets = np.random.randint(MIN_TWEETS_PER_USER, MAX_TWEETS_PER_USER+1)
    for _ in range(n_tweets):
        # The order of the random draws below is kept so seeded output is unchanged.
        text = make_tweet_text(is_at_risk)
        created_at = rand_date_within(days_back=730)
        is_reply = random.random() < 0.12
        is_retweet = random.random() < 0.08
        # The original added a term that was 0 for both risk groups; engagement
        # counts are independent of the risk flag.
        like_count = int(np.random.poisson(2))
        retweet_count = int(np.random.poisson(0.5))
        tweets_records.append({
            "tweet_id": tweet_id_counter,
            "user_id": uid,
            "created_at": created_at,
            "text": text,
            "is_reply": int(is_reply),
            "is_retweet": int(is_retweet),
            "like_count": like_count,
            "retweet_count": retweet_count
        })
        tweet_id_counter += 1

tweets_df = pd.DataFrame(tweets_records)

# Persist both tables as CSV in the current working directory.
users_path = "synthetic_users_8000.csv"
tweets_path = "synthetic_tweets_8000_users.csv"
users_df.to_csv(users_path, index=False)
tweets_df.to_csv(tweets_path, index=False)

try:
    # Optional rich preview; `ace_tools` is not available in every environment.
    from ace_tools import display_dataframe_to_user
    display_dataframe_to_user("Sample users (8000)", users_df.sample(5))
    display_dataframe_to_user("Sample tweets", tweets_df.sample(8))
except Exception:
    # Best-effort fallback to plain prints. The unused side-by-side concat that
    # was bound to `display` (shadowing IPython's `display`) has been removed.
    print("Saved files:\n", users_path, "\n", tweets_path)
    print("\nUsers sample:\n", users_df.head().to_dict(orient="records"))
    print("\nTweets sample:\n", tweets_df.head().to_dict(orient="records"))

# Cell result: output paths and row counts.
users_path, tweets_path, len(users_df), len(tweets_df)



  signup_date = (datetime.utcnow() - timedelta(days=signup_days_ago)).date().isoformat()
  now = datetime.utcnow()


Saved files:
 synthetic_users_8000.csv 
 synthetic_tweets_8000_users.csv

Users sample:
 [{'user_id': 1, 'self_harm_flag': 0, 'signup_date': '2018-08-30', 'followers': 150, 'total_tweets_estimate': 962}, {'user_id': 2, 'self_harm_flag': 0, 'signup_date': '2025-07-21', 'followers': 45, 'total_tweets_estimate': 2303}, {'user_id': 3, 'self_harm_flag': 0, 'signup_date': '2023-02-01', 'followers': 8, 'total_tweets_estimate': 1878}, {'user_id': 4, 'self_harm_flag': 0, 'signup_date': '2024-04-08', 'followers': 100, 'total_tweets_estimate': 889}, {'user_id': 5, 'self_harm_flag': 0, 'signup_date': '2018-03-31', 'followers': 61, 'total_tweets_estimate': 4517}]

Tweets sample:
 [{'tweet_id': 1, 'user_id': 1, 'created_at': '2024-08-15T23:31:16.781207Z', 'text': 'Grateful for my friends.', 'is_reply': 0, 'is_retweet': 0, 'like_count': 0, 'retweet_count': 0}, {'tweet_id': 2, 'user_id': 1, 'created_at': '2024-09-01T15:24:02.781293Z', 'text': 'study work', 'is_reply': 0, 'is_retweet': 0, 'like_count':

('synthetic_users_8000.csv', 'synthetic_tweets_8000_users.csv', 8000, 678615)

In [None]:
!pip install vaderSentiment --quiet

import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

tweets = pd.read_csv("synthetic_tweets_8000_users.csv")
print("Tweets loaded:", tweets.shape)

analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Return the VADER compound score for `text`, or 0.0 if scoring fails.

    The input is passed through str() first so non-string cells (e.g. NaN read
    from the CSV) are scored as text rather than relying on the exception path.
    """
    try:
        return analyzer.polarity_scores(str(text))["compound"]
    except Exception:
        # Best-effort: one bad tweet must not abort scoring. `Exception`
        # (unlike a bare except) still lets KeyboardInterrupt/SystemExit through.
        return 0.0

# Score each tweet, then average the scores per user.
tweets["sentiment_score"] = tweets["text"].apply(get_sentiment)

per_user_mean = tweets.groupby("user_id")["sentiment_score"].mean()

# reset_index(name=...) yields the two columns ["user_id", "avg_sentiment"].
avg_sentiment = per_user_mean.reset_index(name="avg_sentiment")

avg_sentiment.head()

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hTweets loaded: (678615, 8)


Unnamed: 0,user_id,avg_sentiment
0,1,0.206052
1,2,0.111158
2,3,0.314644
3,4,0.169123
4,5,0.168094


In [None]:
import pandas as pd

tweets = pd.read_csv("synthetic_tweets_8000_users.csv")

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Score the tweets only when the loaded table does not already carry scores.
if "sentiment_score" not in tweets.columns:
    tweets["sentiment_score"] = tweets["text"].apply(
        lambda x: analyzer.polarity_scores(x)["compound"]
    )

# A tweet counts as negative when its compound score is at or below -0.05.
tweets["is_negative"] = tweets["sentiment_score"].le(-0.05)

# The mean of a boolean column is the share of True values per user.
neg_tweet_ratio = (
    tweets.groupby("user_id")["is_negative"].mean().reset_index(name="neg_tweet_ratio")
)

neg_tweet_ratio.head()


Unnamed: 0,user_id,neg_tweet_ratio
0,1,0.162791
1,2,0.16
2,3,0.012821
3,4,0.101266
4,5,0.141732


In [None]:
import pandas as pd

tweets = pd.read_csv("synthetic_tweets_8000_users.csv")

# Keywords searched for in tweet text, lower-cased for matching.
stress_keywords = [
    kw.lower()
    for kw in (
        "stress", "tired", "depressed", "anxiety", "worthless",
        "panic", "alone", "sad", "hopeless", "fail", "overwhelmed",
        "empty", "hurt", "struggle", "helpless", "broken",
    )
]

def count_stress_words(text):
    """Return how many distinct stress keywords occur as substrings of `text`.

    Matching is case-insensitive; a keyword that appears several times still
    counts once.
    """
    lowered = str(text).lower()
    hits = [kw for kw in stress_keywords if kw in lowered]
    return len(hits)

# Per-tweet keyword hits, plus a flag for tweets with at least one hit.
tweets["stress_word_count"] = tweets["text"].apply(count_stress_words)
tweets["has_stress_word"] = tweets["stress_word_count"].gt(0)

by_user = tweets.groupby("user_id")

# Total keyword hits per user.
stress_keywords_total = by_user["stress_word_count"].sum().reset_index(
    name="stress_keywords_freq_total"
)

# Number of tweets per user containing at least one keyword.
stress_keywords_tweet_count = by_user["has_stress_word"].sum().reset_index(
    name="stress_keywords_freq_tweets"
)

stress_keywords_total.head(), stress_keywords_tweet_count.head()

(   user_id  stress_keywords_freq_total
 0        1                          11
 1        2                           6
 2        3                           1
 3        4                           6
 4        5                          12,
    user_id  stress_keywords_freq_tweets
 0        1                           10
 1        2                            6
 2        3                            1
 3        4                            6
 4        5                           12)

In [None]:
import pandas as pd
from datetime import timedelta

tweets = pd.read_csv("synthetic_tweets_8000_users.csv")

# Unparseable timestamps become NaT instead of raising.
tweets["created_at"] = pd.to_datetime(tweets["created_at"], errors="coerce")

# The 30-day window is anchored at the newest tweet in the data, not at the
# wall clock.
latest_time = tweets["created_at"].max()
cutoff_date = latest_time - timedelta(days=30)

tweets["is_recent"] = tweets["created_at"].ge(cutoff_date)

# Summing the boolean flag counts each user's tweets inside the window.
past_month_activity = (
    tweets.groupby("user_id")["is_recent"].sum().reset_index(name="past_month_activity")
)

past_month_activity.head()


Unnamed: 0,user_id,past_month_activity
0,1,1
1,2,3
2,3,3
3,4,1
4,5,10


In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

users = pd.read_csv("synthetic_users_8000.csv")
tweets = pd.read_csv("synthetic_tweets_8000_users.csv")

print("Loaded shapes -> users:", users.shape, " tweets:", tweets.shape)

# Unparseable timestamps become NaT instead of raising.
tweets["created_at"] = pd.to_datetime(tweets["created_at"], errors="coerce")

analyzer = SentimentIntensityAnalyzer()

def safe_sentiment(text):
    """Return the VADER compound score of str(text); 0.0 if scoring raises."""
    try:
        scores = analyzer.polarity_scores(str(text))
        return scores["compound"]
    except Exception:
        return 0.0

# Score the tweets only when the loaded table does not already carry scores.
if "sentiment_score" not in tweets.columns:
    tweets["sentiment_score"] = tweets["text"].apply(safe_sentiment)

# Mean compound score per user.
avg_sentiment = tweets.groupby("user_id")["sentiment_score"].mean().reset_index(
    name="avg_sentiment"
)

# Share of each user's tweets with a compound score at or below -0.05.
tweets["is_negative"] = tweets["sentiment_score"].le(-0.05)
neg_tweet_ratio = tweets.groupby("user_id")["is_negative"].mean().reset_index(
    name="neg_tweet_ratio"
)

# Keywords searched for in tweet text, lower-cased for matching.
stress_keywords = [
    kw.lower()
    for kw in (
        "stress", "tired", "depressed", "depression", "anxiety", "worthless",
        "panic", "alone", "sad", "hopeless", "fail", "overwhelmed",
        "empty", "hurt", "struggle", "helpless", "broken", "give up",
    )
]

def count_stress_words(text):
    """Return the total number of keyword occurrences in `text`.

    Matching is a case-insensitive substring search; repeated occurrences of
    the same keyword are each counted.
    """
    lowered = str(text).lower()
    total = 0
    for kw in stress_keywords:
        total += lowered.count(kw)
    return total

# Per-tweet keyword occurrences and a flag for tweets with at least one.
tweets["stress_word_count"] = tweets["text"].apply(count_stress_words)
tweets["has_stress_word"] = tweets["stress_word_count"].gt(0)

# Total keyword occurrences per user.
stress_keywords_total = tweets.groupby("user_id")["stress_word_count"].sum().reset_index(
    name="stress_keywords_freq_total"
)

# Number of keyword-bearing tweets per user.
stress_keywords_tweetcount = tweets.groupby("user_id")["has_stress_word"].sum().reset_index(
    name="stress_keywords_freq_tweets"
)

# The 30-day window is anchored at the newest tweet in the data.
latest_time = tweets["created_at"].max()
cutoff_date = latest_time - timedelta(days=30)
tweets["is_recent"] = tweets["created_at"].ge(cutoff_date)

# Tweets per user inside the window.
past_month_activity = tweets.groupby("user_id")["is_recent"].sum().reset_index(
    name="past_month_activity"
)

# User table first, then every per-user feature table; all share "user_id".
frames = [
    users,
    avg_sentiment,
    neg_tweet_ratio,
    stress_keywords_total,
    stress_keywords_tweetcount,
    past_month_activity
]

from functools import reduce

def _left_join_on_user(left, right):
    """Left-join `right` onto `left` by user_id, keeping every row of `left`."""
    return left.merge(right, on="user_id", how="left")

final_df = reduce(_left_join_on_user, frames)

# Count the gaps before filling so the report reflects the raw merge.
num_missing = final_df.isna().sum().sum()

for _float_col in ("avg_sentiment", "neg_tweet_ratio"):
    final_df[_float_col] = final_df[_float_col].fillna(0.0)

for _count_col in (
    "stress_keywords_freq_total",
    "stress_keywords_freq_tweets",
    "past_month_activity",
):
    final_df[_count_col] = final_df[_count_col].fillna(0).astype(int)

print(f"Filled missing values (total missing cells before fill): {num_missing}")

# Recent tweets as a share of the lifetime estimate. A zero estimate becomes
# NaN to avoid dividing by zero, and the resulting NaN ratio is set to 0.0.
_nonzero_totals = final_df["total_tweets_estimate"].replace(0, np.nan)
final_df["tweet_activity_ratio"] = (
    final_df["past_month_activity"] / _nonzero_totals
).fillna(0.0)

out_path = "/content/final_ml_dataset_8000.csv"
final_df.to_csv(out_path, index=False)
print("Saved final ML dataset to:", out_path)

print("\nFinal dataset shape:", final_df.shape)
print("\nColumns:", final_df.columns.tolist())
print("\nTarget balance (self_harm_flag):")
print(final_df["self_harm_flag"].value_counts(dropna=False))

print("\nSample rows:")
final_df.head()


Loaded shapes -> users: (8000, 5)  tweets: (678615, 8)
Filled missing values (total missing cells before fill): 0
Saved final ML dataset to: /content/final_ml_dataset_8000.csv

Final dataset shape: (8000, 11)

Columns: ['user_id', 'self_harm_flag', 'signup_date', 'followers', 'total_tweets_estimate', 'avg_sentiment', 'neg_tweet_ratio', 'stress_keywords_freq_total', 'stress_keywords_freq_tweets', 'past_month_activity', 'tweet_activity_ratio']

Target balance (self_harm_flag):
self_harm_flag
0    6797
1    1203
Name: count, dtype: int64

Sample rows:


Unnamed: 0,user_id,self_harm_flag,signup_date,followers,total_tweets_estimate,avg_sentiment,neg_tweet_ratio,stress_keywords_freq_total,stress_keywords_freq_tweets,past_month_activity,tweet_activity_ratio
0,1,0,2018-08-30,150,962,0.206052,0.162791,13,11,1,0.00104
1,2,0,2025-07-21,45,2303,0.111158,0.16,6,6,3,0.001303
2,3,0,2023-02-01,8,1878,0.314644,0.012821,2,2,3,0.001597
3,4,0,2024-04-08,100,889,0.169123,0.101266,6,6,1,0.001125
4,5,0,2018-03-31,61,4517,0.168094,0.141732,12,12,10,0.002214


In [None]:
!pip install xgboost --quiet

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Read the feature table from the same absolute path it is written to
# ("/content/final_ml_dataset_8000.csv") so this cell does not depend on the
# current working directory.
df = pd.read_csv("/content/final_ml_dataset_8000.csv")

print("Dataset shape:", df.shape)

# Model inputs: the engineered per-user features plus the follower count.
feature_cols = [
    "avg_sentiment",
    "neg_tweet_ratio",
    "stress_keywords_freq_total",
    "stress_keywords_freq_tweets",
    "past_month_activity",
    "followers",
    "tweet_activity_ratio"
]

X = df[feature_cols]
y = df["self_harm_flag"]

# Stratified 75/25 split keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def _fit_and_report(banner, model, train_features, test_features):
    """Fit `model`, predict on the test features and print the standard report.

    Uses the module-level `y_train` / `y_test` labels.

    Args:
        banner: header line printed before the metrics.
        model: estimator exposing fit() and predict().
        train_features: feature matrix used for fitting.
        test_features: feature matrix used for prediction.

    Returns:
        (predictions, accuracy) for the test set.
    """
    model.fit(train_features, y_train)
    pred = model.predict(test_features)
    acc = accuracy_score(y_test, pred)

    print(banner)
    print("Accuracy:", acc)
    print(classification_report(y_test, pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
    return pred, acc


# Logistic regression and the SVM are trained on the standardized features.
lr = LogisticRegression(max_iter=3000)
lr_pred, lr_acc = _fit_and_report(
    "\n================ LOGISTIC REGRESSION ================",
    lr, X_train_scaled, X_test_scaled,
)

svm = SVC(kernel="rbf", probability=True)
svm_pred, svm_acc = _fit_and_report(
    "\n====================== SVM (RBF) =====================",
    svm, X_train_scaled, X_test_scaled,
)

# The tree ensembles are trained on the unscaled features.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    class_weight="balanced"
)
rf_pred, rf_acc = _fit_and_report(
    "\n==================== RANDOM FOREST ===================",
    rf, X_train, X_test,
)

xgb_model = xgb.XGBClassifier(
    n_estimators=250,
    max_depth=6,
    learning_rate=0.07,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=42
)
xgb_pred, xgb_acc = _fit_and_report(
    "\n======================== XGBOOST =====================",
    xgb_model, X_train, X_test,
)


print("\n\n================== ACCURACY COMPARISON ==================")
print(f"Logistic Regression: {lr_acc:.4f}")
print(f"SVM (RBF):           {svm_acc:.4f}")
print(f"Random Forest:       {rf_acc:.4f}")
print(f"XGBoost:             {xgb_acc:.4f}")

Dataset shape: (8000, 11)

Accuracy: 0.9965
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1699
           1       0.99      0.99      0.99       301

    accuracy                           1.00      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000

Confusion Matrix:
 [[1696    3]
 [   4  297]]

Accuracy: 0.9965
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1699
           1       0.99      0.99      0.99       301

    accuracy                           1.00      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000

Confusion Matrix:
 [[1696    3]
 [   4  297]]

Accuracy: 0.9955
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1699
           1       0.99      0.98      0.99       301

    accura

In [None]:
!pip install -q xgboost

import os, re, json, math, itertools, random, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# ---- Data and split settings ----
CSV_PATH = "/content/final_ml_dataset_8000.csv"
TARGET = "self_harm_flag"
# Used as test_size for both train_test_split calls later in this cell.
VAL_SIZE = 0.20
SEED = 42

# ---- Model settings (read by build_widedeep and the compile step) ----
HIDDEN_SIZES = [512, 256, 128]
DROPOUT_RATE = 0.10
LABEL_SMOOTH = 0.02
# Upper bound on the number of pairwise product features added.
MAX_CROSS_PAIRS = 20

# ---- Training settings and output locations ----
BATCH_SIZE = 512
INITIAL_LR = 3e-3
SAVE_DIR = "/content/job"
os.makedirs(SAVE_DIR, exist_ok=True)

# One independent training run is performed per entry.
EPOCH_BUDGETS = [15, 150, 300]

RESULTS_JSON = os.path.join(SAVE_DIR, "acc_results.json")
ACCURACY_PNG = os.path.join(SAVE_DIR, "accuracy_vs_epochs.png")

# Seed the Python, NumPy and TensorFlow RNGs.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("TF version:", tf.__version__)
print("GPUs:", tf.config.list_physical_devices("GPU"))

# NOTE(review): the float16 policy is set unconditionally, even when the GPU
# list printed above is empty (the recorded run shows `GPUs: []`) — confirm
# that mixed precision is wanted on CPU-only runs.
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")
print("Mixed precision enabled!")

# Models are built and compiled inside this strategy's scope in the training loop.
strategy = tf.distribute.MirroredStrategy()
print("Strategy replicas:", strategy.num_replicas_in_sync)

def is_id_like(n):
    """Tell whether a column name looks like an identifier column.

    The name is stringified, lower-cased and stripped; it matches when it is
    exactly "id", ends with "_id", or starts with "id_".
    """
    label = str(n).lower().strip()
    if label == "id":
        return True
    return label.endswith("_id") or label.startswith("id_")

def add_datetime_parts(df, target):
    """Replace datetime-like columns with numeric calendar-part columns.

    Works on a copy of `df`. Every column other than `target` that already has
    a datetime dtype (naive or tz-aware), or that is object-typed and parses
    cleanly with pd.to_datetime, is expanded into `<col>_year`, `<col>_month`,
    `<col>_day`, `<col>_hour` and `<col>_dow` (day of week) and then dropped.

    Args:
        df: input frame; it is not modified.
        target: name of the label column, which is never converted.

    Returns:
        A new DataFrame with the datetime columns replaced by their parts.
    """
    df = df.copy()
    dtcols = []
    for c in df.columns:
        if c == target:
            continue
        if df[c].dtype == object:
            try:
                converted = pd.to_datetime(df[c], errors="raise")
            except Exception:
                # Not a date column. `Exception` (not a bare except) still
                # lets KeyboardInterrupt/SystemExit propagate.
                continue
            # Only accept the parse if it produced a real datetime dtype;
            # otherwise the `.dt` accessor below would fail.
            if not pd.api.types.is_datetime64_any_dtype(converted):
                continue
            df[c] = converted
            dtcols.append(c)
        elif pd.api.types.is_datetime64_any_dtype(df[c]):
            # Covers tz-aware columns too, on which np.issubdtype raises TypeError.
            dtcols.append(c)
    for c in dtcols:
        df[c+"_year"]  = df[c].dt.year
        df[c+"_month"] = df[c].dt.month
        df[c+"_day"]   = df[c].dt.day
        df[c+"_hour"]  = df[c].dt.hour
        df[c+"_dow"]   = df[c].dt.dayofweek
    # Drop the source columns once every part has been derived.
    return df.drop(columns=dtcols)

def auto_feature_crosses(Xdf, max_pairs=20):
    """Append pairwise product columns for the highest-variance numeric columns.

    Numeric columns are ranked by variance (descending); the first
    `max_pairs + 2` are paired in combination order and the first `max_pairs`
    pairs are multiplied into new columns named "<a>*<b>".

    Note: `Xdf` is modified in place and the same object is returned.

    Returns:
        (Xdf, pairs), where `pairs` lists the (a, b) column-name tuples used;
        the list is empty when fewer than two numeric columns exist.
    """
    numeric_cols = list(Xdf.select_dtypes(include=[np.number]).columns)
    if len(numeric_cols) < 2:
        return Xdf, []

    by_variance = Xdf[numeric_cols].var().sort_values(ascending=False)
    candidates = by_variance.index[:max_pairs+2]
    all_pairs = list(itertools.combinations(candidates, 2))
    pairs = all_pairs[:max_pairs]

    for left, right in pairs:
        Xdf[f"{left}*{right}"] = Xdf[left] * Xdf[right]
    return Xdf, pairs

def se_block(x, ratio=8, name="se"):
    """Rescale `x` element-wise by a learned sigmoid gate computed from `x`.

    The gate path is: reshape to (1, channels) -> 1D global average pooling ->
    Dense(max(1, channels // ratio), relu) -> Dense(channels, sigmoid) ->
    reshape to (channels,). The gate is then multiplied with `x`.
    NOTE(review): "se" presumably stands for squeeze-and-excitation — confirm.

    Args:
        x: Keras tensor; its last dimension is taken as the channel count.
        ratio: divisor for the width of the first Dense layer.
        name: prefix used for every layer name created here.
    """
    n_channels = int(x.shape[-1])
    bottleneck_units = max(1, n_channels // ratio)

    gate = layers.Reshape((1, n_channels), name=f"{name}_reshape_in")(x)
    gate = layers.GlobalAveragePooling1D(name=f"{name}_gap")(gate)
    gate = layers.Dense(bottleneck_units, activation="relu", name=f"{name}_fc1")(gate)
    gate = layers.Dense(n_channels, activation="sigmoid", name=f"{name}_fc2")(gate)
    gate = layers.Reshape((n_channels,), name=f"{name}_reshape_out")(gate)

    scaled = layers.Multiply(name=f"{name}_scale")([x, gate])
    return scaled

def make_cosine_lr_fn(initial_lr, decay_epochs=40, alpha=0.0002):
    """Build a per-epoch cosine-decay learning-rate function.

    Args:
        initial_lr: learning rate at epoch 0.
        decay_epochs: number of epochs over which the rate decays; from that
            epoch on it stays at the floor `initial_lr * alpha`. A value of 0
            or less means the rate is at the floor from the start.
        alpha: floor, as a fraction of `initial_lr`.

    Returns:
        A function mapping an epoch index to a learning rate.
    """
    def lr_fn(epoch):
        if decay_epochs <= 0:
            # decay_epochs == 0 used to divide by zero. Negative values already
            # evaluated to the floor, so zero is treated the same way.
            return initial_lr * alpha
        t = min(epoch, decay_epochs)
        cos_val = 0.5 * (1 + math.cos(math.pi * t / decay_epochs))
        return initial_lr * (alpha + (1 - alpha) * cos_val)
    return lr_fn

# ---- Load the feature table ----
print("Loading:", CSV_PATH)
df = pd.read_csv(CSV_PATH, low_memory=False)
print("Loaded shape:", df.shape)

# Drop identifier-like columns (as judged by is_id_like).
df = df.drop(columns=[c for c in df.columns if is_id_like(c)], errors="ignore")

# Drop columns with at most one distinct value (NaN counts as a value).
nunique = df.nunique(dropna=False)
df = df.drop(columns=nunique[nunique <= 1].index.tolist())

# Expand date-like columns into calendar parts.
# NOTE(review): this runs after the constant-column pruning above, so a derived
# part that turns out constant is not pruned.
df = add_datetime_parts(df, TARGET)

if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found in dataset")

# Encode labels as 0..K-1 following the sorted unique label values.
y_raw = df[TARGET].values
classes = np.unique(y_raw)
class_map = {str(c): i for i, c in enumerate(classes)}
y = np.array([class_map[str(v)] for v in y_raw], dtype=int)

X = df.drop(columns=[TARGET]).copy()

# Integer-code any remaining object columns by the sorted order of their
# string values.
for c in X.columns:
    if X[c].dtype == object:
        _, inv = np.unique(X[c].astype(str), return_inverse=True)
        X[c] = inv.astype(np.float32)

# Add pairwise product features.
X, used_pairs = auto_feature_crosses(X, MAX_CROSS_PAIRS)
print("Added feature crosses:", used_pairs)

# Two stratified splits that both use VAL_SIZE: the first holds out the test
# set, the second carves the validation set out of the remainder.
X_temp, X_test, y_temp, y_test = train_test_split(
    X.values, y, test_size=VAL_SIZE, random_state=SEED, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=VAL_SIZE, random_state=SEED, stratify=y_temp
)

# Fit the scaler on the training split only and reuse it for val/test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.astype(np.float32))
X_val   = scaler.transform(X_val.astype(np.float32))
X_test  = scaler.transform(X_test.astype(np.float32))

# Dimensions read by build_widedeep.
input_dim = X_train.shape[1]
num_classes = len(classes)
print("Input dim:", input_dim, "Num classes:", num_classes)

def build_widedeep():
    """Build the two-branch classifier over the module-level `input_dim` features.

    Wide branch: a single Dense layer from the inputs to `num_classes` logits.
    Deep branch: for each width in HIDDEN_SIZES, Dense (he_normal) -> GELU ->
    Dropout -> se_block, followed by a Dense layer to `num_classes` logits.
    The two logit tensors are added and passed through a float32 softmax.

    Returns:
        An uncompiled keras.Model.
    """
    features = keras.Input(shape=(input_dim,))

    # Wide branch: linear logits straight from the inputs.
    wide_logits = layers.Dense(num_classes)(features)

    # Deep branch.
    hidden = features
    for depth, units in enumerate(HIDDEN_SIZES, 1):
        hidden = layers.Dense(units, kernel_initializer="he_normal")(hidden)
        hidden = layers.Activation("gelu")(hidden)
        hidden = layers.Dropout(DROPOUT_RATE)(hidden)
        hidden = se_block(hidden, ratio=8, name=f"se_{depth}")
    deep_logits = layers.Dense(num_classes)(hidden)

    summed = layers.Add()([wide_logits, deep_logits])
    # The softmax output dtype is pinned to float32 explicitly.
    probabilities = layers.Activation("softmax", dtype="float32")(summed)
    return keras.Model(features, probabilities)

# Test accuracy per epoch budget.
results = {}

# The one-hot labels do not depend on the epoch budget, so build them once.
y_train_oh = keras.utils.to_categorical(y_train, num_classes)
y_val_oh   = keras.utils.to_categorical(y_val, num_classes)

for epochs in EPOCH_BUDGETS:
    print("\n" + "="*80)
    print(f"Training for {epochs} epochs...")

    # A fresh model and optimizer per budget: each run starts from scratch.
    with strategy.scope():
        model = build_widedeep()
        opt = keras.optimizers.Adam(learning_rate=INITIAL_LR)
        model.compile(
            optimizer=opt,
            loss=keras.losses.CategoricalCrossentropy(label_smoothing=LABEL_SMOOTH),
            metrics=["accuracy"]
        )

    lr_schedule = make_cosine_lr_fn(INITIAL_LR)
    cbs = [
        callbacks.ModelCheckpoint(
            os.path.join(SAVE_DIR, f"model_{epochs}.keras"),
            save_best_only=True,
            monitor="val_accuracy",
            mode="max",
            verbose=1,
        ),
        # The schedule function already takes the epoch index; no wrapper needed.
        callbacks.LearningRateScheduler(lr_schedule, verbose=1)
    ]

    hist = model.fit(
        X_train, y_train_oh,
        validation_data=(X_val, y_val_oh),
        epochs=epochs,
        batch_size=BATCH_SIZE,
        callbacks=cbs,
        verbose=2
    )

    # NOTE(review): this evaluates the weights from the final epoch; the best
    # checkpoint saved above is not reloaded here — confirm that is intended.
    preds = model.predict(X_test).argmax(axis=1)
    acc = accuracy_score(y_test, preds)

    print(f"Test accuracy after {epochs} epochs → {acc:.6f}")
    results[epochs] = float(acc)

with open(RESULTS_JSON, "w") as f:
    json.dump(results, f, indent=2)

# Dedicated names for the plot series so the label array `y` built earlier in
# this cell is not overwritten.
epoch_axis = sorted(results.keys())
acc_axis = [results[k] for k in epoch_axis]

plt.figure(figsize=(10,6))
plt.plot(epoch_axis, acc_axis, marker="o")
for budget, score in zip(epoch_axis, acc_axis):
    plt.text(budget, score, f"{score:.4f}", ha="center", va="bottom")
plt.title("SE + WideDeep Accuracy vs Epoch Budgets")
plt.xlabel("Epochs")
plt.ylabel("Test Accuracy")
plt.grid(True)
plt.savefig(ACCURACY_PNG, dpi=200)
plt.close()

print("Saved results JSON:", RESULTS_JSON)
print("Saved accuracy plot:", ACCURACY_PNG)

TF version: 2.19.0
GPUs: []
Mixed precision enabled!
Strategy replicas: 1
Loading: /content/final_ml_dataset_8000.csv
Loaded shape: (8000, 11)
Added feature crosses: [('total_tweets_estimate', 'followers'), ('total_tweets_estimate', 'stress_keywords_freq_total'), ('total_tweets_estimate', 'stress_keywords_freq_tweets'), ('total_tweets_estimate', 'signup_date_day'), ('total_tweets_estimate', 'signup_date_month'), ('total_tweets_estimate', 'signup_date_year'), ('total_tweets_estimate', 'past_month_activity'), ('total_tweets_estimate', 'signup_date_dow'), ('total_tweets_estimate', 'neg_tweet_ratio'), ('total_tweets_estimate', 'avg_sentiment'), ('total_tweets_estimate', 'tweet_activity_ratio'), ('total_tweets_estimate', 'signup_date_hour'), ('followers', 'stress_keywords_freq_total'), ('followers', 'stress_keywords_freq_tweets'), ('followers', 'signup_date_day'), ('followers', 'signup_date_month'), ('followers', 'signup_date_year'), ('followers', 'past_month_activity'), ('followers', 'sign

In [None]:
import random, math, time
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import gc

# Seed both RNGs: the generator code in this cell draws from `random` and `np.random`.
random.seed(42)
np.random.seed(42)

# Population size, inclusive per-user tweet-count range, default look-back
# window (days) for tweet timestamps, and the probability that a user is
# flagged as at-risk.
N_USERS = 50000
MIN_TWEETS_PER_USER = 8
MAX_TWEETS_PER_USER = 25
TWEET_WINDOW_DAYS = 730
SELF_HARM_PREVALENCE = 0.12

# Vocabulary sampled by the "stress" branch of make_tweet_text.
stress_keywords = [
    "stress", "anxiety", "depressed", "depression", "tired", "hopeless", "alone",
    "worthless", "panic", "overwhelmed", "exhausted", "helpless", "sad", "numb",
    "empty", "hurt", "broken", "struggle", "suicidal", "give up"
]

# Filler words; make_tweet_text pairs two of them for a neutral tweet and may
# append one after a stress keyword.
neutral_words = [
    "work", "coffee", "movie", "football", "game", "music", "lecture", "project",
    "lunch", "travel", "happy", "love", "family", "friend", "weather", "tech",
    "code", "study", "shopping", "birthday"
]

# Whole-sentence tweet bodies used by the positive branch.
positive_phrases = [
    "Had a great day!", "Feeling good today.", "Loved the movie I watched.",
    "Excited for the weekend.", "Grateful for my friends."
]

# Whole-sentence tweet bodies used by the negative branch (and as stress add-ons).
negative_phrases = [
    "This day has been rough.", "Feeling down.", "Really tired of everything.",
    "Not my day.", "Things are hard right now."
]

# Optional trailing hashtag/emoticon choices.
suffixes = ["#life", "#mood", "#work", ":-(", ":)"]

# NOTE(review): nothing visible in this notebook export references `templates`;
# make_tweet_text joins prefix/middle/suffix directly.
templates = [
    "{middle}", "{prefix} {middle}", "{middle} {suffix}", "{prefix} {middle} {suffix}"
]

def rand_date_within(days_back=TWEET_WINDOW_DAYS):
    """Return a random ISO-8601 UTC timestamp string ending in "Z".

    The timestamp is the current UTC time minus 0..`days_back` whole days and
    0..86399 seconds (both bounds inclusive).
    """
    # Function-scope import: the cell imports only `datetime` and `timedelta`.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware "now"
    # and drop tzinfo so isoformat() keeps the naive layout that the literal
    # "Z" suffix is appended to.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    d = random.randint(0, days_back)
    s = random.randint(0, 86399)
    return (now - timedelta(days=d, seconds=s)).isoformat() + "Z"

def make_tweet_text(is_at_risk):
    """Build one synthetic tweet; at-risk users skew towards stress/negative text.

    The tweet is an optional prefix, one body chosen by a single random roll
    (stress / negative / positive / neutral) and an optional suffix. Empty
    pieces are dropped before joining with spaces, and the result is cut to
    280 characters.

    Args:
        is_at_risk: truthy for users whose tweets should skew towards the
            stress and negative branches.

    Returns:
        The tweet text as a string.
    """
    stress_prob = 0.30 if is_at_risk else 0.06
    negative_prob = 0.28 if is_at_risk else 0.08
    positive_prob = 0.08 if is_at_risk else 0.30

    if random.random() < 0.35:
        # "" is a valid draw, which leaves the tweet without a prefix.
        prefix = random.choice(["FYI", "Update:", "Note:", ""])
    else:
        prefix = ""
    mid_roll = random.random()
    if mid_roll < stress_prob:
        kw = random.choice(stress_keywords)
        addon = random.choice(negative_phrases) if random.random() < 0.6 else random.choice(neutral_words)
        middle = f"{kw} {addon}"
    elif mid_roll < stress_prob + negative_prob:
        middle = random.choice(negative_phrases)
    elif mid_roll < stress_prob + negative_prob + positive_prob:
        middle = random.choice(positive_phrases)
    else:
        # Neutral branch: the remaining probability mass.
        middle = f"{random.choice(neutral_words)} {random.choice(neutral_words)}"
    suffix = random.choice(suffixes) if random.random() < 0.22 else ""
    # The list is built here in one go; the earlier empty-list initialisation
    # was a dead store and has been removed.
    parts = [p for p in [prefix, middle, suffix] if p]
    text = " ".join(parts).strip()
    return text[:280]

# Cell-local import: the cell's import line only brings in `datetime` and `timedelta`.
from datetime import timezone

# Take "now" once, via the non-deprecated API, so every signup date is measured
# from the same instant instead of calling datetime.utcnow() per user.
_generation_time = datetime.now(timezone.utc).replace(tzinfo=None)

# One record per synthetic user. The RNG call order inside the loop is kept
# as-is so the seeded draws are unchanged.
users = []
for uid in range(1, N_USERS+1):
    self_harm_flag = int(np.random.choice([0,1], p=[1-SELF_HARM_PREVALENCE, SELF_HARM_PREVALENCE]))
    signup_days_ago = random.randint(30, 3650)
    signup_date = (_generation_time - timedelta(days=signup_days_ago)).date().isoformat()
    followers = int(np.random.exponential(scale=40))
    total_tweets_estimate = random.randint(20, 10000)
    users.append({
        "user_id": uid,
        "self_harm_flag": self_harm_flag,
        "signup_date": signup_date,
        "followers": followers,
        "total_tweets_estimate": total_tweets_estimate
    })

users_df = pd.DataFrame(users)
print("Users created:", users_df.shape)

# Generate every user's tweets as a flat list of records, numbering tweets
# globally from 1 and reporting progress every 5000 users.
tweets_records = []
tweet_id = 1
total_estimated = 0
for idx, row in enumerate(users_df.itertuples(index=False)):
    uid = int(row.user_id)
    is_at_risk = bool(row.self_harm_flag)
    n_tweets = random.randint(MIN_TWEETS_PER_USER, MAX_TWEETS_PER_USER)
    total_estimated += n_tweets
    for _ in range(n_tweets):
        # Draw the fields in record order so the seeded RNG streams are
        # consumed in the same sequence as before.
        created_at = rand_date_within()
        text = make_tweet_text(is_at_risk)
        is_reply = int(random.random() < 0.12)
        is_retweet = int(random.random() < 0.08)
        like_count = int(np.random.poisson(1.5))
        retweet_count = int(np.random.poisson(0.4))
        tweets_records.append({
            "tweet_id": tweet_id,
            "user_id": uid,
            "created_at": created_at,
            "text": text,
            "is_reply": is_reply,
            "is_retweet": is_retweet,
            "like_count": like_count,
            "retweet_count": retweet_count
        })
        tweet_id += 1
    users_done = idx + 1
    if users_done % 5000 == 0:
        print(f"Generated tweets for {users_done} users ...")

tweets_df = pd.DataFrame(tweets_records)
print("Generated tweets:", tweets_df.shape)

# Release the list of dicts now that the DataFrame holds the data.
del tweets_records
gc.collect()

users_path = "/content/synthetic_users_50000.csv"
tweets_path = "/content/synthetic_tweets_50000_users.csv"
users_df.to_csv(users_path, index=False)
tweets_df.to_csv(tweets_path, index=False)
print("Saved:", users_path, tweets_path)
print("Total users:", len(users_df), "Total tweets:", len(tweets_df))


  signup_date = (datetime.utcnow() - timedelta(days=signup_days_ago)).date().isoformat()


Users created: (50000, 5)


  now = datetime.utcnow()


Generated tweets for 5000 users ...
Generated tweets for 10000 users ...
Generated tweets for 15000 users ...
Generated tweets for 20000 users ...
Generated tweets for 25000 users ...
Generated tweets for 30000 users ...
Generated tweets for 35000 users ...
Generated tweets for 40000 users ...
Generated tweets for 45000 users ...
Generated tweets for 50000 users ...
Generated tweets: (824699, 8)
Saved: /content/synthetic_users_50000.csv /content/synthetic_tweets_50000_users.csv
Total users: 50000 Total tweets: 824699


In [None]:
!pip install -q vaderSentiment

import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import timedelta

# Read the users table and the tweets table from their CSV files.
users, tweets = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/synthetic_users_50000.csv",
        "/content/synthetic_tweets_50000_users.csv",
    )
)
print("Loaded users:", users.shape, "tweets:", tweets.shape)

# Timestamps that cannot be parsed become NaT instead of raising.
parsed_created_at = pd.to_datetime(tweets["created_at"], errors="coerce")
tweets["created_at"] = parsed_created_at

# Single analyzer instance reused for every tweet by `safe_sentiment`.
analyzer = SentimentIntensityAnalyzer()

def safe_sentiment(text):
    """Return the VADER compound polarity score of `text`, or 0.0 on failure.

    The input is coerced with `str()` before scoring, so non-string values are
    scored by their string form rather than raising.
    """
    try:
        return analyzer.polarity_scores(str(text))["compound"]
    except Exception:
        # Best-effort scoring: one bad row must not abort the whole `apply`.
        # Catching `Exception` rather than using a bare `except` lets
        # KeyboardInterrupt and SystemExit propagate, so a long run can still
        # be interrupted.
        return 0.0

print("Computing sentiment scores...")
tweets["sentiment_score"] = tweets["text"].apply(safe_sentiment)

# Per-user mean of the compound sentiment score.
avg_sentiment = (
    tweets.groupby("user_id")["sentiment_score"]
    .mean()
    .rename("avg_sentiment")
    .reset_index()
)

# A tweet is flagged negative when its compound score is at or below -0.05;
# the mean of that boolean flag is the share of negative tweets per user.
tweets["is_negative"] = tweets["sentiment_score"].le(-0.05)
neg_tweet_ratio = (
    tweets.groupby("user_id")["is_negative"]
    .mean()
    .rename("neg_tweet_ratio")
    .reset_index()
)

# Lower-cased vocabulary that `count_stress_words` looks for as substrings.
stress_keywords = [
    keyword.lower()
    for keyword in (
        "stress", "tired", "depressed", "anxiety", "worthless",
        "panic", "alone", "sad", "hopeless", "fail", "overwhelmed",
        "empty", "hurt", "struggle", "helpless", "broken", "give up", "suicidal",
    )
]

def count_stress_words(text):
    """Count occurrences of the stress keywords in `text`, ignoring case.

    Matching uses `str.count` on the lower-cased string form of `text`, so a
    keyword embedded in a longer word is counted too (e.g. "sad" in "sadly").
    """
    lowered = str(text).lower()
    total = 0
    for keyword in stress_keywords:
        total += lowered.count(keyword)
    return total

# Per-tweet keyword counts and a flag for tweets with at least one keyword.
tweets["stress_word_count"] = tweets["text"].apply(count_stress_words)
tweets["has_stress_word"] = tweets["stress_word_count"].gt(0)

# Total keyword occurrences per user.
stress_keywords_total = (
    tweets.groupby("user_id")["stress_word_count"]
    .sum()
    .rename("stress_keywords_freq_total")
    .reset_index()
)

# Number of tweets per user that contain at least one keyword.
stress_keywords_tweetcount = (
    tweets.groupby("user_id")["has_stress_word"]
    .sum()
    .rename("stress_keywords_freq_tweets")
    .reset_index()
)

# "Recent" means within 30 days of the newest tweet timestamp in the data.
latest_time = tweets["created_at"].max()
cutoff_date = latest_time - timedelta(days=30)
tweets["is_recent"] = tweets["created_at"].ge(cutoff_date)
past_month_activity = (
    tweets.groupby("user_id")["is_recent"]
    .sum()
    .rename("past_month_activity")
    .reset_index()
)

from functools import reduce
# Left-join each per-user feature table onto `users`, one table at a time.
frames = [users, avg_sentiment, neg_tweet_ratio, stress_keywords_total, stress_keywords_tweetcount, past_month_activity]
final_df = frames[0]
for feature_table in frames[1:]:
    final_df = pd.merge(final_df, feature_table, on="user_id", how="left")

# The left join leaves NaN for users missing from a feature table; default
# float features to 0.0 and count features to integer 0.
for float_col in ("avg_sentiment", "neg_tweet_ratio"):
    final_df[float_col] = final_df[float_col].fillna(0.0)
for count_col in (
    "stress_keywords_freq_total",
    "stress_keywords_freq_tweets",
    "past_month_activity",
):
    final_df[count_col] = final_df[count_col].fillna(0).astype(int)

# Zero totals are replaced by NaN so the division cannot divide by zero;
# the resulting NaN ratios are then reported as 0.0.
nonzero_totals = final_df["total_tweets_estimate"].replace(0, np.nan)
final_df["tweet_activity_ratio"] = (final_df["past_month_activity"] / nonzero_totals).fillna(0.0)

final_out = "/content/final_ml_dataset_50000.csv"
final_df.to_csv(final_out, index=False)
print("Saved final ML dataset:", final_out)
print("Final shape:", final_df.shape)
final_df.head()

Loaded users: (50000, 5) tweets: (824699, 8)
Computing sentiment scores...
Saved final ML dataset: /content/final_ml_dataset_50000.csv
Final shape: (50000, 11)


Unnamed: 0,user_id,self_harm_flag,signup_date,followers,total_tweets_estimate,avg_sentiment,neg_tweet_ratio,stress_keywords_freq_total,stress_keywords_freq_tweets,past_month_activity,tweet_activity_ratio
0,1,0,2018-08-30,120,1844,0.29905,0.083333,1,1,0,0.0
1,2,0,2025-07-21,36,4526,0.146156,0.24,1,1,0,0.0
2,3,0,2023-02-01,6,3677,0.348593,0.285714,3,3,1,0.000272
3,4,0,2024-04-08,80,1699,0.289161,0.055556,0,0,1,0.000589
4,5,0,2018-03-31,49,8955,0.225477,0.153846,1,1,1,0.000112


In [None]:
!pip install -q xgboost

import os, json, math, itertools, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# ---- Experiment configuration for the SE + wide-and-deep classifier ----
CSV_PATH = "/content/final_ml_dataset_50000.csv"
TARGET = "self_harm_flag"  # label column
VAL_SIZE = 0.20            # passed as `test_size` to both train_test_split calls below
SEED = 42

# Model hyper-parameters.
HIDDEN_SIZES = [512, 256, 128]  # units of each hidden Dense layer, in order
DROPOUT_RATE = 0.10
LABEL_SMOOTH = 0.02
MAX_CROSS_PAIRS = 20            # upper bound on generated feature crosses

# Optimisation settings and output location.
BATCH_SIZE = 1024
INITIAL_LR = 3e-3
SAVE_DIR = "/content/job"
os.makedirs(SAVE_DIR, exist_ok=True)

# One independent training run is performed for each epoch budget.
EPOCH_BUDGETS = [10, 50, 100]

RESULTS_JSON = os.path.join(SAVE_DIR, "acc_results.json")
ACCURACY_PNG = os.path.join(SAVE_DIR, "accuracy_vs_epochs.png")

# Seed Python, NumPy and TensorFlow random generators.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("TF version:", tf.__version__)
print("GPUs:", tf.config.list_physical_devices("GPU"))

# NOTE(review): the policy is set unconditionally; the recorded run of this
# cell lists no GPUs, so confirm float16 compute is wanted on CPU.
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")
print("Mixed precision enabled!")

# Models are built and compiled inside this strategy's scope further down.
strategy = tf.distribute.MirroredStrategy()
print("Strategy replicas:", strategy.num_replicas_in_sync)

def is_id_like(n):
    """Return True when a column name looks like an identifier column.

    The name is lower-cased and stripped; it matches when it is exactly
    "id", ends with "_id", or starts with "id_".
    """
    label = str(n).lower().strip()
    if label == "id":
        return True
    return label.endswith("_id") or label.startswith("id_")

def add_datetime_parts(df, target):
    """Replace datetime-like columns with numeric calendar-part columns.

    Works on a copy of `df`. Every object column (except `target`) that
    `pd.to_datetime` can parse in full, and every column that already has a
    datetime dtype, is replaced by `<col>_year`, `<col>_month`, `<col>_day`,
    `<col>_hour` and `<col>_dow` (day of week). Other columns are untouched.

    Args:
        df: Input DataFrame; it is not modified.
        target: Name of the label column, which is never converted.

    Returns:
        A new DataFrame with the datetime columns expanded and dropped.
    """
    df = df.copy()
    dtcols = []
    for c in df.columns:
        if c == target:
            continue
        if df[c].dtype == object:
            try:
                parsed = pd.to_datetime(df[c], errors="raise")
            except (ValueError, TypeError, OverflowError):
                # Not a date column. Only parsing failures are caught, so
                # KeyboardInterrupt and real bugs are no longer swallowed.
                continue
            if not pd.api.types.is_datetime64_any_dtype(parsed):
                # Parsing "succeeded" without producing a datetime dtype;
                # the `.dt` accessor below would fail, so keep the column.
                continue
            df[c] = parsed
            dtcols.append(c)
        elif pd.api.types.is_datetime64_any_dtype(df[c]):
            # The pandas helper also accepts extension dtypes (categorical,
            # timezone-aware), where `np.issubdtype` raises TypeError.
            dtcols.append(c)
    for c in dtcols:
        df[c+"_year"]  = df[c].dt.year
        df[c+"_month"] = df[c].dt.month
        df[c+"_day"]   = df[c].dt.day
        df[c+"_hour"]  = df[c].dt.hour
        df[c+"_dow"]   = df[c].dt.dayofweek
        df.drop(columns=[c], inplace=True)
    return df

def auto_feature_crosses(Xdf, max_pairs=20):
    """Append pairwise product features of the highest-variance numeric columns.

    Numeric columns are ranked by variance (descending); the first
    `max_pairs + 2` of them are combined pairwise and the first `max_pairs`
    combinations are added to `Xdf` in place as columns named "a*b".

    Returns:
        Tuple `(Xdf, pairs)`: the same DataFrame object that was passed in and
        the list of column-name pairs that were crossed (empty when fewer
        than two numeric columns exist).
    """
    numeric_cols = Xdf.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) < 2:
        return Xdf, []

    ranked = Xdf[numeric_cols].var().sort_values(ascending=False).index
    candidates = ranked[:max_pairs+2]
    pairs = list(itertools.combinations(candidates, 2))[:max_pairs]
    for left, right in pairs:
        Xdf[f"{left}*{right}"] = Xdf[left].mul(Xdf[right])
    return Xdf, pairs

def se_block(x, ratio=8, name="se"):
    """Scale the features of `x` by a learned sigmoid gate.

    `x` is reshaped to (1, channels), average-pooled over that length-1 axis,
    passed through a ReLU Dense layer of `channels // ratio` units (at least
    one) and a sigmoid Dense layer of `channels` units; the result multiplies
    `x` element-wise. Every layer name is prefixed with `name`.
    """
    width = int(x.shape[-1])
    bottleneck = max(1, width // ratio)

    gate = layers.Reshape((1, width), name=f"{name}_reshape_in")(x)
    gate = layers.GlobalAveragePooling1D(name=f"{name}_gap")(gate)
    gate = layers.Dense(bottleneck, activation="relu", name=f"{name}_fc1")(gate)
    gate = layers.Dense(width, activation="sigmoid", name=f"{name}_fc2")(gate)
    gate = layers.Reshape((width,), name=f"{name}_reshape_out")(gate)
    return layers.Multiply(name=f"{name}_scale")([x, gate])

def make_cosine_lr_fn(initial_lr, decay_epochs=40, alpha=0.0002):
    """Build an `epoch -> learning rate` function with cosine decay.

    The rate starts at `initial_lr` for epoch 0, follows half a cosine period
    over `decay_epochs` epochs, and stays at `initial_lr * alpha` afterwards.
    """
    def lr_fn(epoch):
        # Clamp so that epochs past the decay window reuse the final value.
        clamped = decay_epochs if decay_epochs < epoch else epoch
        cos_val = 0.5 * (1 + math.cos(math.pi * clamped / decay_epochs))
        return initial_lr * (alpha + (1 - alpha) * cos_val)

    return lr_fn

# ---- Load the dataset and turn it into scaled train/val/test arrays ----
print("Loading:", CSV_PATH)
df = pd.read_csv(CSV_PATH, low_memory=False)
print("Loaded shape:", df.shape)

# Identifier-like columns are dropped by name (see `is_id_like`).
df = df.drop(columns=[c for c in df.columns if is_id_like(c)], errors="ignore")

# Drop columns holding a single distinct value (NaN counts as a value).
nunique = df.nunique(dropna=False)
df = df.drop(columns=nunique[nunique <= 1].index.tolist())

# Expand date-like columns into numeric parts. This runs after the constant
# column filter above, so parts created here are not filtered by it.
df = add_datetime_parts(df, TARGET)

if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found")

# Map the distinct target values to consecutive integer class indices.
y_raw = df[TARGET].values
classes = np.unique(y_raw)
class_map = {str(c): i for i, c in enumerate(classes)}
y = np.array([class_map[str(v)] for v in y_raw], dtype=int)

X = df.drop(columns=[TARGET]).copy()

# Remaining object columns are replaced by integer codes of their string form.
for c in X.columns:
    if X[c].dtype == object:
        _, inv = np.unique(X[c].astype(str), return_inverse=True)
        X[c] = inv.astype(np.float32)

X, used_pairs = auto_feature_crosses(X, MAX_CROSS_PAIRS)
print("Feature crosses added:", used_pairs)

# First hold out VAL_SIZE of all rows as the test set, then VAL_SIZE of the
# remainder as the validation set; both splits are stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X.values, y, test_size=VAL_SIZE, random_state=SEED, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=VAL_SIZE, random_state=SEED, stratify=y_temp
)

# The scaler is fitted on the training split only and reused for val/test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.astype(np.float32))
X_val   = scaler.transform(X_val.astype(np.float32))
X_test  = scaler.transform(X_test.astype(np.float32))

input_dim = X_train.shape[1]
num_classes = len(classes)
print("Input dim:", input_dim, "Num classes:", num_classes)

def build_widedeep():
    """Build the wide-and-deep softmax classifier from the notebook globals.

    The wide path is a single Dense layer from the inputs to `num_classes`
    logits. The deep path stacks, for every size in `HIDDEN_SIZES`, a Dense
    layer, GELU activation, dropout and an `se_block`, followed by a Dense
    layer to `num_classes` logits. Both logit tensors are added and passed
    through a float32 softmax.
    """
    inputs = keras.Input(shape=(input_dim,))
    wide_logits = layers.Dense(num_classes)(inputs)

    deep = inputs
    for block_no, units in enumerate(HIDDEN_SIZES, start=1):
        deep = layers.Dense(units, kernel_initializer="he_normal")(deep)
        deep = layers.Activation("gelu")(deep)
        deep = layers.Dropout(DROPOUT_RATE)(deep)
        deep = se_block(deep, ratio=8, name=f"se_{block_no}")
    deep_logits = layers.Dense(num_classes)(deep)

    summed_logits = layers.Add()([wide_logits, deep_logits])
    probabilities = layers.Activation("softmax", dtype="float32")(summed_logits)
    return keras.Model(inputs, probabilities)

# Train one fresh model per epoch budget and record its test accuracy.
results = {}
for epochs in EPOCH_BUDGETS:
    print("\n" + "="*80)
    print(f"Training for {epochs} epochs...")

    # Model and optimizer are created inside the distribution strategy scope.
    with strategy.scope():
        model = build_widedeep()
        opt = keras.optimizers.Adam(learning_rate=INITIAL_LR)
        model.compile(
            optimizer=opt,
            loss=keras.losses.CategoricalCrossentropy(label_smoothing=LABEL_SMOOTH),
            metrics=["accuracy"]
        )

    # One-hot encode the integer labels for the categorical cross-entropy loss.
    y_train_oh = keras.utils.to_categorical(y_train, num_classes)
    y_val_oh   = keras.utils.to_categorical(y_val, num_classes)

    lr_schedule = make_cosine_lr_fn(INITIAL_LR)
    cbs = [
        # Keeps the epoch with the highest validation accuracy on disk.
        callbacks.ModelCheckpoint(
            os.path.join(SAVE_DIR, f"model_{epochs}.keras"),
            save_best_only=True,
            monitor="val_accuracy",
            mode="max",
            verbose=1,
        ),
        # The learning rate is a function of the epoch index only.
        callbacks.LearningRateScheduler(lambda ep: lr_schedule(ep), verbose=1)
    ]

    hist = model.fit(
        X_train, y_train_oh,
        validation_data=(X_val, y_val_oh),
        epochs=epochs,
        batch_size=BATCH_SIZE,
        callbacks=cbs,
        verbose=2
    )

    # NOTE(review): the saved best checkpoint is not reloaded here, so the
    # test accuracy is measured on the model as `fit` left it — confirm this
    # is intended.
    preds = model.predict(X_test).argmax(axis=1)
    acc = accuracy_score(y_test, preds)
    print(f"Test accuracy after {epochs} epochs → {acc:.6f}")
    results[epochs] = float(acc)

# Persist the per-budget test accuracies.
with open(RESULTS_JSON, "w") as f:
    json.dump(results, f, indent=2)

# Plot test accuracy against the epoch budget. The plotted series use their
# own names so they do not overwrite the global label array `y` (or `x`)
# that the data-preparation code defines.
budget_axis = sorted(results.keys())
accuracy_axis = [results[k] for k in budget_axis]

plt.figure(figsize=(10,6))
plt.plot(budget_axis, accuracy_axis, marker="o")
for budget_value, accuracy_value in zip(budget_axis, accuracy_axis):
    # Label each point with its accuracy.
    plt.text(budget_value, accuracy_value, f"{accuracy_value:.4f}", ha="center", va="bottom")
plt.title("SE + WideDeep Accuracy vs Epoch Budgets")
plt.xlabel("Epochs")
plt.ylabel("Test Accuracy")
plt.grid(True)
plt.savefig(ACCURACY_PNG, dpi=200)
plt.close()

print("Saved results JSON:", RESULTS_JSON)
print("Saved accuracy plot:", ACCURACY_PNG)

TF version: 2.19.0
GPUs: []
Mixed precision enabled!
Strategy replicas: 1
Loading: /content/final_ml_dataset_50000.csv
Loaded shape: (50000, 11)
Feature crosses added: [('total_tweets_estimate', 'followers'), ('total_tweets_estimate', 'signup_date_day'), ('total_tweets_estimate', 'signup_date_month'), ('total_tweets_estimate', 'signup_date_year'), ('total_tweets_estimate', 'stress_keywords_freq_total'), ('total_tweets_estimate', 'signup_date_dow'), ('total_tweets_estimate', 'stress_keywords_freq_tweets'), ('total_tweets_estimate', 'past_month_activity'), ('total_tweets_estimate', 'avg_sentiment'), ('total_tweets_estimate', 'neg_tweet_ratio'), ('total_tweets_estimate', 'tweet_activity_ratio'), ('total_tweets_estimate', 'signup_date_hour'), ('followers', 'signup_date_day'), ('followers', 'signup_date_month'), ('followers', 'signup_date_year'), ('followers', 'stress_keywords_freq_total'), ('followers', 'signup_date_dow'), ('followers', 'stress_keywords_freq_tweets'), ('followers', 'past_m

In [None]:
import os, json, random, math
import numpy as np
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks, regularizers

# Hyper-parameters for the ensemble runs; names also defined by the earlier
# training cell (LABEL_SMOOTH, BATCH_SIZE, INITIAL_LR) are overridden here.
LABEL_SMOOTH = 0.0
BATCH_SIZE = 256
INITIAL_LR = 1e-3
L2_REG = 1e-5                  # L2 penalty applied to the Dense kernels
ENSEMBLE_RUNS = 3              # number of differently seeded models to average
EARLY_STOPPING_PATIENCE = 12   # epochs without val_accuracy improvement

# Switch the global Keras dtype policy to float32; a failure is reported
# but does not stop the cell.
try:
    from tensorflow.keras import mixed_precision
    mixed_precision.set_global_policy("float32")
    print("Using float32 policy (mixed precision disabled).")
except Exception as e:
    print("Could not change precision policy:", e)

def build_widedeep_with_bn(input_dim, num_classes, hidden_sizes, dropout_rate, l2_reg):
    """Build a wide-and-deep softmax classifier with batch normalisation.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes.
        hidden_sizes: Units of each hidden Dense layer of the deep path.
        dropout_rate: Dropout rate applied in every hidden block.
        l2_reg: L2 factor for the kernels of the wide, hidden and output
            Dense layers.

    Returns:
        An uncompiled `keras.Model` whose output is a float32 softmax over the
        sum of the wide and deep logits.
    """
    def penalised_dense(units, **extra):
        # Every Dense layer built here carries its own L2 regularizer.
        return layers.Dense(units, kernel_regularizer=regularizers.l2(l2_reg), **extra)

    inputs = keras.Input(shape=(input_dim,))
    wide_logits = penalised_dense(num_classes)(inputs)

    deep = inputs
    for block_no, units in enumerate(hidden_sizes, start=1):
        deep = penalised_dense(units, kernel_initializer="he_normal")(deep)
        deep = layers.BatchNormalization()(deep)
        deep = layers.Activation("swish")(deep)
        deep = layers.Dropout(dropout_rate)(deep)
        deep = se_block(deep, ratio=8, name=f"se_{block_no}")
    deep_logits = penalised_dense(num_classes)(deep)

    summed_logits = layers.Add()([wide_logits, deep_logits])
    probabilities = layers.Activation("softmax", dtype="float32")(summed_logits)
    return keras.Model(inputs, probabilities)

def train_one_run(seed, epochs):
    """Train one seeded model and evaluate it on the test split.

    Uses the notebook-level data splits (`X_train`, `y_train`, `X_val`,
    `y_val`, `X_test`, `y_test`), hyper-parameters and `strategy`.

    Args:
        seed: Seed applied to `random`, NumPy and TensorFlow before the model
            is built; it is also part of the checkpoint file name.
        epochs: Maximum number of epochs; early stopping may end sooner.

    Returns:
        Tuple `(preds_proba, acc)`: predicted class probabilities for
        `X_test` and the corresponding test accuracy.
    """
    print(f"\n--- Training seed={seed} ---")
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    with strategy.scope():
        model = build_widedeep_with_bn(
            input_dim=input_dim,
            num_classes=num_classes,
            hidden_sizes=HIDDEN_SIZES,
            dropout_rate=DROPOUT_RATE,
            l2_reg=L2_REG
        )
        opt = keras.optimizers.Adam(learning_rate=INITIAL_LR)
        model.compile(
            optimizer=opt,
            loss=keras.losses.CategoricalCrossentropy(label_smoothing=LABEL_SMOOTH),
            metrics=["accuracy"]
        )

    ckpt_path = os.path.join(SAVE_DIR, f"best_seed_{seed}.keras")
    cb_list = [
        callbacks.ModelCheckpoint(ckpt_path, save_best_only=True, monitor="val_accuracy", mode="max", verbose=1),
        callbacks.EarlyStopping(monitor="val_accuracy", patience=EARLY_STOPPING_PATIENCE, mode="max", restore_best_weights=True, verbose=1),
        callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=6, verbose=1, min_lr=1e-6)
    ]

    # One-hot encode the integer labels for the categorical cross-entropy loss.
    y_train_oh = keras.utils.to_categorical(y_train, num_classes)
    y_val_oh   = keras.utils.to_categorical(y_val, num_classes)

    model.fit(
        X_train, y_train_oh,
        validation_data=(X_val, y_val_oh),
        epochs=epochs,
        batch_size=BATCH_SIZE,
        callbacks=cb_list,
        verbose=2
    )

    # Evaluate the checkpointed best epoch. Reloading stays best-effort (the
    # in-memory weights are used on failure), but the failure is reported
    # rather than silently ignored, so the accuracy is not misread.
    try:
        model.load_weights(ckpt_path)
    except Exception as e:
        print(f"Could not reload checkpoint {ckpt_path}: {e}")

    preds_proba = model.predict(X_test, batch_size=1024)
    preds = preds_proba.argmax(axis=1)
    acc = accuracy_score(y_test, preds)
    print(f"Seed {seed} test accuracy: {acc:.6f}")
    return preds_proba, acc

# Train ENSEMBLE_RUNS seeded models at the largest epoch budget and score the
# average of their predicted probabilities on the test split.
final_results = {}
epochs_to_run = max(EPOCH_BUDGETS)
for epochs in [epochs_to_run]:
    print("\n" + "="*60)
    print(f"Running ensemble for budget {epochs} epochs (each run may early stop)")
    all_preds = []
    accs = []
    for r in range(ENSEMBLE_RUNS):
        # Seeds are spaced 7 apart, starting from SEED.
        seed = SEED + r*7
        preds_proba, acc = train_one_run(seed, epochs)
        all_preds.append(preds_proba)
        accs.append(acc)

    # Average the class probabilities over the runs, then take the arg-max.
    avg_preds = np.mean(np.stack(all_preds, axis=0), axis=0)
    final_preds = avg_preds.argmax(axis=1)
    final_acc = accuracy_score(y_test, final_preds)
    print(f"\nEnsembled test accuracy (avg of {ENSEMBLE_RUNS} runs): {final_acc:.6f}")
    final_results[epochs] = {
        "per_run_accs": accs,
        "ensemble_acc": float(final_acc),
    }

# Written next to the other artefacts in SAVE_DIR.
with open(os.path.join(SAVE_DIR, "improved_results.json"), "w") as f:
    json.dump(final_results, f, indent=2)

print("Done. Results saved to", SAVE_DIR)


Using float32 policy (mixed precision disabled).

Running ensemble for budget 100 epochs (each run may early stop)

--- Training seed=42 ---
Epoch 1/100

Epoch 1: val_accuracy improved from -inf to 0.95350, saving model to /content/job/best_seed_42.keras
125/125 - 16s - 132ms/step - accuracy: 0.9376 - loss: 0.2081 - val_accuracy: 0.9535 - val_loss: 0.1795 - learning_rate: 1.0000e-03
Epoch 2/100

Epoch 2: val_accuracy improved from 0.95350 to 0.96613, saving model to /content/job/best_seed_42.keras
125/125 - 9s - 75ms/step - accuracy: 0.9624 - loss: 0.1264 - val_accuracy: 0.9661 - val_loss: 0.1150 - learning_rate: 1.0000e-03
Epoch 3/100

Epoch 3: val_accuracy improved from 0.96613 to 0.96863, saving model to /content/job/best_seed_42.keras
125/125 - 10s - 81ms/step - accuracy: 0.9625 - loss: 0.1221 - val_accuracy: 0.9686 - val_loss: 0.1087 - learning_rate: 1.0000e-03
Epoch 4/100

Epoch 4: val_accuracy did not improve from 0.96863
125/125 - 4s - 28ms/step - accuracy: 0.9625 - loss: 0.120

In [None]:
!pip install -q xgboost

import os, json, math, itertools, random, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks, regularizers

# ---- Experiment configuration ----
# NOTE(review): this path is relative, unlike the absolute /content/... paths
# used for SAVE_DIR below — it resolves against the working directory.
CSV_PATH = "final_ml_dataset_50000.csv"
TARGET = "self_harm_flag"  # label column
VAL_SIZE = 0.20            # passed as `test_size` to both train_test_split calls below
SEED = 42

# Architecture.
HIDDEN_SIZES = [512, 256, 128]  # units of each hidden Dense layer, in order
DROPOUT_RATE = 0.10

# Optimisation and ensembling.
LABEL_SMOOTH = 0.0
BATCH_SIZE = 256
INITIAL_LR = 1e-3
L2_REG = 1e-5                  # L2 penalty applied to the Dense kernels
ENSEMBLE_RUNS = 3              # seeded runs averaged per epoch budget
EARLY_STOPPING_PATIENCE = 12   # epochs without val_accuracy improvement
USE_MIXED_PRECISION = False    # selects the global Keras dtype policy below
EPOCH_BUDGETS = [15, 150, 300]

# Output locations.
SAVE_DIR = "/content/job"
os.makedirs(SAVE_DIR, exist_ok=True)
RESULTS_JSON = os.path.join(SAVE_DIR, "acc_results.json")
ACCURACY_PNG = os.path.join(SAVE_DIR, "accuracy_vs_epochs.png")

# Seed Python, NumPy and TensorFlow random generators.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("TF version:", tf.__version__)
print("GPUs:", tf.config.list_physical_devices("GPU"))

from tensorflow.keras import mixed_precision
# Select the global Keras dtype policy. Setting it is best-effort in both
# branches: a failure is reported and the cell continues with whatever policy
# is currently active.
if USE_MIXED_PRECISION:
    try:
        mixed_precision.set_global_policy("mixed_float16")
        print("Mixed precision enabled.")
    except Exception as e:
        print("Could not enable mixed precision:", e)
else:
    try:
        mixed_precision.set_global_policy("float32")
        print("Using float32 policy (stable numerics).")
    except Exception as e:
        # Report the failure like the branch above instead of passing
        # silently, so a leftover non-float32 policy does not go unnoticed.
        print("Could not set float32 policy:", e)

# Models are built and compiled inside this strategy's scope further down.
strategy = tf.distribute.MirroredStrategy()
print("Strategy replicas:", strategy.num_replicas_in_sync)

def is_id_like(n):
    """Return True when a column name looks like an identifier column.

    After lower-casing and stripping, the name matches when it equals "id",
    ends with "_id", or starts with "id_".
    """
    normalized = str(n).lower().strip()
    checks = (
        normalized == "id",
        normalized.endswith("_id"),
        normalized.startswith("id_"),
    )
    return any(checks)

def add_datetime_parts(df, target):
    """Replace datetime-like columns with numeric calendar-part columns.

    Works on a copy of `df`. Every object column (except `target`) that
    `pd.to_datetime` can parse in full, and every column that already has a
    datetime dtype, is replaced by `<col>_year`, `<col>_month`, `<col>_day`,
    `<col>_hour` and `<col>_dow` (day of week). Other columns are untouched.

    Args:
        df: Input DataFrame; it is not modified.
        target: Name of the label column, which is never converted.

    Returns:
        A new DataFrame with the datetime columns expanded and dropped.
    """
    df = df.copy()
    dtcols = []
    for c in df.columns:
        if c == target:
            continue
        if df[c].dtype == object:
            try:
                parsed = pd.to_datetime(df[c], errors="raise")
            except (ValueError, TypeError, OverflowError):
                # Not a date column. Only parsing failures are caught, so
                # KeyboardInterrupt and real bugs are no longer swallowed.
                continue
            if not pd.api.types.is_datetime64_any_dtype(parsed):
                # Parsing "succeeded" without producing a datetime dtype;
                # the `.dt` accessor below would fail, so keep the column.
                continue
            df[c] = parsed
            dtcols.append(c)
        elif pd.api.types.is_datetime64_any_dtype(df[c]):
            # The pandas helper also accepts extension dtypes (categorical,
            # timezone-aware), where `np.issubdtype` raises TypeError.
            dtcols.append(c)
    for c in dtcols:
        df[c+"_year"]  = df[c].dt.year
        df[c+"_month"] = df[c].dt.month
        df[c+"_day"]   = df[c].dt.day
        df[c+"_hour"]  = df[c].dt.hour
        df[c+"_dow"]   = df[c].dt.dayofweek
        df.drop(columns=[c], inplace=True)
    return df

def auto_feature_crosses(Xdf, max_pairs=20):
    """Append pairwise product features of the highest-variance numeric columns.

    Numeric columns are ranked by variance (descending); the first
    `max_pairs + 2` of them are combined pairwise and the first `max_pairs`
    combinations are added to `Xdf` in place as columns named "a*b".

    Returns:
        Tuple `(Xdf, pairs)`: the same DataFrame object that was passed in and
        the list of column-name pairs that were crossed (empty when fewer
        than two numeric columns exist).
    """
    numeric_cols = Xdf.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) < 2:
        return Xdf, []

    ranked = Xdf[numeric_cols].var().sort_values(ascending=False).index
    candidates = ranked[:max_pairs+2]
    pairs = list(itertools.combinations(candidates, 2))[:max_pairs]
    for left, right in pairs:
        Xdf[f"{left}*{right}"] = Xdf[left].mul(Xdf[right])
    return Xdf, pairs

def se_block(x, ratio=8, name="se"):
    """Scale the features of `x` by a learned sigmoid gate.

    `x` is reshaped to (1, channels), average-pooled over that length-1 axis,
    passed through a ReLU Dense layer of `channels // ratio` units (at least
    one) and a sigmoid Dense layer of `channels` units; the result multiplies
    `x` element-wise. Every layer name is prefixed with `name`.
    """
    width = int(x.shape[-1])
    bottleneck = max(1, width // ratio)

    gate = layers.Reshape((1, width), name=f"{name}_reshape_in")(x)
    gate = layers.GlobalAveragePooling1D(name=f"{name}_gap")(gate)
    gate = layers.Dense(bottleneck, activation="relu", name=f"{name}_fc1")(gate)
    gate = layers.Dense(width, activation="sigmoid", name=f"{name}_fc2")(gate)
    gate = layers.Reshape((width,), name=f"{name}_reshape_out")(gate)
    return layers.Multiply(name=f"{name}_scale")([x, gate])

def make_cosine_lr_fn(initial_lr, decay_epochs=40, alpha=0.0002):
    """Build an `epoch -> learning rate` function with cosine decay.

    The rate starts at `initial_lr` for epoch 0, follows half a cosine period
    over `decay_epochs` epochs, and stays at `initial_lr * alpha` afterwards.
    """
    def lr_fn(epoch):
        # Clamp so that epochs past the decay window reuse the final value.
        clamped = decay_epochs if decay_epochs < epoch else epoch
        cos_val = 0.5 * (1 + math.cos(math.pi * clamped / decay_epochs))
        return initial_lr * (alpha + (1 - alpha) * cos_val)

    return lr_fn

# ---- Load the dataset and turn it into scaled train/val/test arrays ----
print("Loading:", CSV_PATH)
df = pd.read_csv(CSV_PATH, low_memory=False)
print("Loaded shape:", df.shape)

# Drop identifier-like columns by name, then columns holding a single
# distinct value (NaN counts as a value).
df = df.drop(columns=[c for c in df.columns if is_id_like(c)], errors="ignore")
nunique = df.nunique(dropna=False)
df = df.drop(columns=nunique[nunique <= 1].index.tolist())

# Expand date-like columns into numeric parts. This runs after the constant
# column filter above, so parts created here are not filtered by it.
df = add_datetime_parts(df, TARGET)

if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found in dataset")

# Map the distinct target values to consecutive integer class indices.
y_raw = df[TARGET].values
classes = np.unique(y_raw)
class_map = {str(c): i for i, c in enumerate(classes)}
y = np.array([class_map[str(v)] for v in y_raw], dtype=int)

X = df.drop(columns=[TARGET]).copy()

# Remaining object columns are replaced by integer codes of their string form.
for c in X.columns:
    if X[c].dtype == object:
        _, inv = np.unique(X[c].astype(str), return_inverse=True)
        X[c] = inv.astype(np.float32)

X, used_pairs = auto_feature_crosses(X, max_pairs=20)
print("Added feature crosses:", used_pairs)

# First hold out VAL_SIZE of all rows as the test set, then VAL_SIZE of the
# remainder as the validation set; both splits are stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X.values, y, test_size=VAL_SIZE, random_state=SEED, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=VAL_SIZE, random_state=SEED, stratify=y_temp
)

# The scaler is fitted on the training split only and reused for val/test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.astype(np.float32))
X_val   = scaler.transform(X_val.astype(np.float32))
X_test  = scaler.transform(X_test.astype(np.float32))

input_dim = X_train.shape[1]
num_classes = len(classes)
print("Input dim:", input_dim, "Num classes:", num_classes)
print("Train/Val/Test shapes:", X_train.shape, X_val.shape, X_test.shape)

def build_widedeep_with_bn(input_dim, num_classes, hidden_sizes, dropout_rate, l2_reg):
    """Build a wide-and-deep softmax classifier with batch normalisation.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes.
        hidden_sizes: Units of each hidden Dense layer of the deep path.
        dropout_rate: Dropout rate applied in every hidden block.
        l2_reg: L2 factor for the kernels of the wide, hidden and output
            Dense layers.

    Returns:
        An uncompiled `keras.Model` whose output is a float32 softmax over the
        sum of the wide and deep logits.
    """
    def penalised_dense(units, **extra):
        # Every Dense layer built here carries its own L2 regularizer.
        return layers.Dense(units, kernel_regularizer=regularizers.l2(l2_reg), **extra)

    inputs = keras.Input(shape=(input_dim,))
    wide_logits = penalised_dense(num_classes)(inputs)

    deep = inputs
    for block_no, units in enumerate(hidden_sizes, start=1):
        deep = penalised_dense(units, kernel_initializer="he_normal")(deep)
        deep = layers.BatchNormalization()(deep)
        deep = layers.Activation("swish")(deep)
        deep = layers.Dropout(dropout_rate)(deep)
        deep = se_block(deep, ratio=8, name=f"se_{block_no}")
    deep_logits = penalised_dense(num_classes)(deep)

    summed_logits = layers.Add()([wide_logits, deep_logits])
    probabilities = layers.Activation("softmax", dtype="float32")(summed_logits)
    return keras.Model(inputs, probabilities)

def train_one_run(seed, epochs, run_name="run"):
    """Train one seeded model and evaluate it on the test split.

    Uses the notebook-level data splits (`X_train`, `y_train`, `X_val`,
    `y_val`, `X_test`, `y_test`), hyper-parameters and `strategy`.

    Args:
        seed: Seed applied to `random`, NumPy and TensorFlow before the model
            is built.
        epochs: Maximum number of epochs; early stopping may end sooner.
        run_name: Label used in the log line and, together with `seed`, in
            the checkpoint file name.

    Returns:
        Tuple `(preds_proba, acc, hist)`: predicted class probabilities for
        `X_test`, the corresponding test accuracy, and the object returned
        by `model.fit`.
    """
    print(f"\n--- Training seed={seed} ({run_name}) ---")
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    with strategy.scope():
        model = build_widedeep_with_bn(
            input_dim=input_dim,
            num_classes=num_classes,
            hidden_sizes=HIDDEN_SIZES,
            dropout_rate=DROPOUT_RATE,
            l2_reg=L2_REG
        )
        opt = keras.optimizers.Adam(learning_rate=INITIAL_LR)
        model.compile(
            optimizer=opt,
            loss=keras.losses.CategoricalCrossentropy(label_smoothing=LABEL_SMOOTH),
            metrics=["accuracy"]
        )

    # One-hot encode the integer labels for the categorical cross-entropy loss.
    y_train_oh = keras.utils.to_categorical(y_train, num_classes)
    y_val_oh   = keras.utils.to_categorical(y_val, num_classes)

    lr_schedule = make_cosine_lr_fn(INITIAL_LR, decay_epochs=40)
    ckpt_path = os.path.join(SAVE_DIR, f"best_{run_name}_seed{seed}.keras")
    # NOTE(review): LearningRateScheduler sets the learning rate from
    # `lr_schedule` at the start of every epoch, so reductions made by
    # ReduceLROnPlateau are presumably overwritten before they take effect —
    # confirm which of the two schedules is intended.
    cb_list = [
        callbacks.ModelCheckpoint(ckpt_path, save_best_only=True, monitor="val_accuracy", mode="max", verbose=1),
        # `mode="max"` is stated explicitly, matching ModelCheckpoint above.
        callbacks.EarlyStopping(monitor="val_accuracy", patience=EARLY_STOPPING_PATIENCE, mode="max", restore_best_weights=True, verbose=1),
        callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=6, verbose=1, min_lr=1e-6),
        callbacks.LearningRateScheduler(lambda ep: lr_schedule(ep), verbose=0)
    ]

    hist = model.fit(
        X_train, y_train_oh,
        validation_data=(X_val, y_val_oh),
        epochs=epochs,
        batch_size=BATCH_SIZE,
        callbacks=cb_list,
        verbose=2
    )

    # Evaluate the checkpointed best epoch. Reloading stays best-effort (the
    # in-memory weights are used on failure), but the failure is reported
    # rather than silently ignored, so the accuracy is not misread.
    try:
        model.load_weights(ckpt_path)
    except Exception as e:
        print(f"Could not reload checkpoint {ckpt_path}: {e}")

    preds_proba = model.predict(X_test, batch_size=1024)
    preds = preds_proba.argmax(axis=1)
    acc = accuracy_score(y_test, preds)
    print(f"Seed {seed} test accuracy: {acc:.6f}")
    return preds_proba, acc, hist

# For every epoch budget, train the seeded runs and score both the individual
# models and the average of their predicted probabilities on the test split.
final_results = {}
for epochs in EPOCH_BUDGETS:
    print("\n" + "="*80)
    print(f"Running experiments for budget {epochs} epochs (early stopping may stop sooner)")

    per_run_preds = []
    per_run_accs = []
    per_run_hists = []

    # At least one run is always performed.
    runs = ENSEMBLE_RUNS if ENSEMBLE_RUNS >= 1 else 1
    for r in range(runs):
        # Seeds are spaced 7 apart, starting from SEED.
        seed = SEED + r * 7
        preds_proba, acc, hist = train_one_run(seed, epochs, run_name=f"e{epochs}_r{r}")
        per_run_preds.append(preds_proba)
        per_run_accs.append(acc)
        per_run_hists.append(hist)

    if runs > 1:
        # Average the class probabilities over the runs, then take the arg-max.
        avg_preds = np.mean(np.stack(per_run_preds, axis=0), axis=0)
        final_preds = avg_preds.argmax(axis=1)
        ensemble_acc = accuracy_score(y_test, final_preds)
        print(f"Ensemble (n={runs}) test accuracy: {ensemble_acc:.6f}")
    else:
        ensemble_acc = per_run_accs[0]
        print(f"Single-run test accuracy: {ensemble_acc:.6f}")

    # Plain floats keep the results JSON-serialisable.
    final_results[epochs] = {
        "per_run_accs": [float(a) for a in per_run_accs],
        "ensemble_acc": float(ensemble_acc)
    }

# Persist the per-budget accuracies.
with open(RESULTS_JSON, "w") as f:
    json.dump(final_results, f, indent=2)

# Plot ensemble test accuracy against the epoch budget. The plotted series
# use their own names so they do not overwrite the global label array `y`
# (or `x`) that the data-preparation code defines.
budget_axis = sorted(final_results.keys())
accuracy_axis = [final_results[k]["ensemble_acc"] for k in budget_axis]

plt.figure(figsize=(8,5))
plt.plot(budget_axis, accuracy_axis, marker="o")
for budget_value, accuracy_value in zip(budget_axis, accuracy_axis):
    # Label each point with its accuracy.
    plt.text(budget_value, accuracy_value, f"{accuracy_value:.4f}", ha="center", va="bottom")
plt.title("SE + WideDeep (improved) Accuracy vs Epoch Budgets")
plt.xlabel("Epochs")
plt.ylabel("Test Accuracy")
plt.grid(True)
plt.savefig(ACCURACY_PNG, dpi=200)
plt.close()

print("Saved results JSON:", RESULTS_JSON)
print("Saved accuracy plot:", ACCURACY_PNG)
print("Finished. Results saved at:", SAVE_DIR)

TF version: 2.19.0
GPUs: []
Using float32 policy (stable numerics).
Strategy replicas: 1
Loading: final_ml_dataset_50000.csv
Loaded shape: (50000, 11)
Added feature crosses: [('total_tweets_estimate', 'followers'), ('total_tweets_estimate', 'signup_date_day'), ('total_tweets_estimate', 'signup_date_month'), ('total_tweets_estimate', 'signup_date_year'), ('total_tweets_estimate', 'stress_keywords_freq_total'), ('total_tweets_estimate', 'signup_date_dow'), ('total_tweets_estimate', 'stress_keywords_freq_tweets'), ('total_tweets_estimate', 'past_month_activity'), ('total_tweets_estimate', 'avg_sentiment'), ('total_tweets_estimate', 'neg_tweet_ratio'), ('total_tweets_estimate', 'tweet_activity_ratio'), ('total_tweets_estimate', 'signup_date_hour'), ('followers', 'signup_date_day'), ('followers', 'signup_date_month'), ('followers', 'signup_date_year'), ('followers', 'stress_keywords_freq_total'), ('followers', 'signup_date_dow'), ('followers', 'stress_keywords_freq_tweets'), ('followers', '