## Question 3 Kaggle Competition

In [6]:
import numpy as np, pandas as pd
import emoji
from scipy import sparse
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

## 1. Load Data & Drop Columns

In [7]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

columns_drop = [
    'id',
    'created_at',
    'profile_background_image_url',
    'profile_image_url',
]

train = train.drop(columns=columns_drop, errors='ignore')
test = test.drop(columns=columns_drop, errors='ignore')

## 2. Data Engineeering

In [8]:
# Emoji
def count_emojis(text):
    return sum(1 for char in str(text) if emoji.is_emoji(char))

def has_emoji(text):
    return int(any(emoji.is_emoji(char) for char in str(text)))

# Numeric features
def numeric_block(df):
    num = df[['followers_count',
              'friends_count',
              'favourites_count',        
              'statuses_count',
              'average_tweets_per_day',
              'account_age_days']].copy()

    num['followers_to_friends']   = df['followers_count'] / (df['friends_count'] + 1)
    num['favourites_per_status']  = df['favourites_count'] / (df['statuses_count'] + 1)
    num['tweets_per_day']         = df['statuses_count'] / (df['account_age_days'] + 1)

    for col in ['default_profile', 'default_profile_image', 'geo_enabled', 'verified']:
        num[col] = df[col].astype(int)

    return num

In [9]:
# Screen name
def sn_feats(series):
    f = pd.DataFrame(index=series.index)
    f['sn_len']           = series.str.len()
    f['sn_digits']        = series.str.count(r'\d')
    f['sn_digit_ratio']   = f['sn_digits'] / f['sn_len'].clip(lower=1)
    f['sn_ends_year']     = series.str.contains(r'\d{4}$').astype(int)
    f['sn_has_underscore']= series.str.contains('_').astype(int)
    return f.astype(np.float32)

# Fill NA
for col in ['screen_name', 'description', 'location', 'lang']:
    train[col] = train[col].fillna('')
    test[col]  = test[col].fillna('')

# Num features
X_num = numeric_block(train)
X_num_test = numeric_block(test)

# Text & Emoji
for df, store in [(train, X_num), (test, X_num_test)]:
    store['desc_len']         = df['description'].str.len()
    store['desc_has_url']     = df['description'].str.contains('http').astype(int)
    store['loc_len']          = df['location'].str.len()
    store['desc_has_emoji']   = df['description'].apply(has_emoji)
    store['desc_emoji_count'] = df['description'].apply(count_emojis)

In [10]:
# Align column names
num_cols = X_num.columns.tolist()
X_num_test = X_num_test.reindex(columns=num_cols, fill_value=0)

# Language one hot
COMMON = train['lang'].value_counts().index[:15]
train['lang'] = train['lang'].where(train['lang'].isin(COMMON), 'other')
test['lang']  = test['lang'].where(test['lang'].isin(COMMON), 'other')

ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
lang_train = ohe.fit_transform(train[['lang']])
lang_test  = ohe.transform(test[['lang']])

# Screen name 
X_sn_train = sn_feats(train['screen_name'])
X_sn_test  = sn_feats(test['screen_name'])

# TF IDF
tfidf_desc = TfidfVectorizer(max_features=8000, stop_words='english')
tfidf_loc  = TfidfVectorizer(max_features=2000, stop_words='english')

desc_train = tfidf_desc.fit_transform(train['description'])
desc_test  = tfidf_desc.transform(test['description'])

loc_train = tfidf_loc.fit_transform(train['location'])
loc_test  = tfidf_loc.transform(test['location'])

### 2.1 Converting Sparse

In [11]:
X_num_sp = sparse.csr_matrix(X_num.values.astype(np.float32))
X_num_sp_test = sparse.csr_matrix(X_num_test.values.astype(np.float32))

X_sn_sp = sparse.csr_matrix(X_sn_train.values.astype(np.float32))
X_sn_sp_test = sparse.csr_matrix(X_sn_test.values.astype(np.float32))

# ====== Final merged sparse matrices ======
X_full = sparse.hstack([X_num_sp, lang_train, X_sn_sp, desc_train, loc_train]).tocsr()
X_test_f = sparse.hstack([X_num_sp_test, lang_test, X_sn_sp_test, desc_test, loc_test]).tocsr()

print("Train matrix shape:", X_full.shape)
print("Test matrix shape:", X_test_f.shape)

# matrix shape: (num of samples, features)

Train matrix shape: (26206, 10039)
Test matrix shape: (11232, 10039)


## 3. Splitting

In [12]:
X_train = train.drop("target", axis=1)
y_train = train["target"] 

In [13]:
# Train-validation split
X_tr, X_val, y_tr, y_val = train_test_split(
    X_full, y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=2025
)

In [14]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Define the class weights to handle imbalance
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# parameters
params = {
    'n_estimators': 400,
    'learning_rate': 0.0137,
    'max_depth': 9,
    'min_child_weight': 5.022,
    'subsample': 0.6271,
    'colsample_bytree': 0.577,
    'gamma': 0.6022,
    'reg_lambda': 0.01326,
    'reg_alpha': 0.01161,
    'scale_pos_weight': pos_weight,
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': 2025
}


# Initialize and train model
final_model = XGBClassifier(**params)
final_model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],  # For AUC monitoring (won’t early stop)
    verbose=False
)

# Predict on test set
y_pred = final_model.predict(X_test_f)


In [15]:
# ---- probability of class 1 (bot) ----
y_prob = final_model.predict_proba(X_test_f)[:, 1]   # float in [0,1]

submission = pd.DataFrame({
    "index": range(len(y_prob)),
    "target": y_prob          # <-- probabilities, not labels
})
submission.to_csv("XGBFinal_testingveeer400.csv", index=False)
print("submission.csv with probabilities saved ✅")


submission.csv with probabilities saved ✅


In [16]:
from sklearn.metrics import roc_auc_score

print("Train AUC:", roc_auc_score(y_tr, final_model.predict_proba(X_tr)[:, 1]))
print("Val AUC:", roc_auc_score(y_val, final_model.predict_proba(X_val)[:, 1]))


Train AUC: 0.9769469451493473
Val AUC: 0.9478142359849393


## Conclusion
#### 1. Data Engineering
- Emoji features: counts number, flag presence of least 1 emoji
- Numeric features: basic Twitter stats and derived some into ratio and rates, converted boolean flags to integers
- Screen Name features: length, digit count, ratio, underscores and if end with 4-digit year
- Missing text fields: replace nulls with empty str
- Text base features: description etc lengths, presence of url, emoji
- Language encoding: OneHotEncoder for categorical representation, top 15 langs
- TF-IDF vetorization
#### 2. Model Selection 
- Choose XGBoostClassifier as model due to its:
1. high predictive power on mixed num, cat and sparse data
2. handles sparse and high dimensional features since we data engineered quite a bit
3. feature selection via tree splits, reduce impact of noise
4. can tune hyperparameter eg. max_depth to prevent overfitting (see test cases below)
5. good in Kaggle competitions :D
#### 3. Notes
- The two models submitted on Kaggle are both based on XGBClassifiers from this code, but tweaked slightly different
- The first version had a higher score on Kaggle but after checking seems to be overfitting when comparing Train AUC and Val AUC
- After few tweaks of hyperparams it seems a bit better however the score on Kaggle had lowered ever so slightly, but lower chance of overfit

In [17]:
# 1. depth 9, 1000 estimators seems to be overfitting
#Train AUC: 0.984812218087364
# Val AUC: 0.9500941101345795

In [18]:
# 2. depth 9, 500 estimators better
# Train AUC: 0.9790576558862584
# Val AUC: 0.9487161417866146


In [19]:
# 3. depth 9, 300 estimators underfitting
# Train AUC: 0.975604714334724
# Val AUC: 0.9472339786171974

In [20]:
# 4. depth 9, 400 estimators sweet spot??
# Train AUC: 0.9776614672754542
# Val AUC: 0.9481287802265012