In [95]:
import polars as pl
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
import pandas as pd
from sentence_transformers import SentenceTransformer

In [None]:
# Load dataset using pandas
df = pd.read_csv("test_data cs 1.csv")
pd.set_option('future.no_silent_downcasting', True)

# Sample 1000 rows without replacement
df = df.sample(n=100, replace=False, random_state=42)

# Drop 'screen_name' 'text' (not useful for modeling)
df = df.drop(columns=["screen_name", "text"])

# Encode 'verified' (target variable)
df["verified"] = df["verified"].replace({"f": 0, "t": 1})

# Convert 'created_at' to datetime format and extract features
df["created_at"] = pd.to_datetime(df["created_at"], format="%a %b %d %H:%M:%S %Y")
df["hour"] = df["created_at"].dt.hour
df["weekday"] = df["created_at"].dt.weekday
df["month"] = df["created_at"].dt.month
df["year"] = df["created_at"].dt.year
df = df.drop(columns=["created_at"])  # Drop raw datetime column

# Frequency Encoding for categorical variables
categorical_cols = ["user_lang", "lang", "time_zone", "location", "source"]
for col in categorical_cols:
    freq_map = df[col].value_counts(normalize=True).to_dict()
    df[col] = df[col].replace(freq_map)

# Ordinal Encoding for 'label' (eyewitness type)
eyewitness_order = {"don't know": -1, "non-eyewitness": 0, "indirect-eyewitness": 1, "direct-eyewitness": 2}
df["label"] = df["label"].replace(eyewitness_order)


# Standardization of numerical features
num_cols = ["followers_count", "statuses_count", "friends_count", "favourites_count",
            "listed_count", "cred_score", "eye_truth", "compare_text"]
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Print column names
print(df.head)



<bound method NDFrame.head of       user_lang  lang time_zone location verified  friends_count  \
81630       0.3  0.24      0.48     0.54        0      -0.089747   
7201       0.24  0.24      0.48     0.46        0       0.043321   
68170      0.24  0.12      0.52     0.54        0       0.124640   
85236      0.18  0.15      0.48     0.46        0       0.612556   
82466      0.18  0.15      0.52     0.54        0       0.346420   
...         ...   ...       ...      ...      ...            ...   
29321      0.24  0.19      0.48     0.54        0      -0.030606   
18664       0.3   0.3      0.52     0.46        0       0.412954   
7875       0.18   0.3      0.48     0.46        0       0.509059   
40010      0.13  0.24      0.52     0.54        0      -0.082354   
94809      0.13  0.12      0.48     0.54        0       0.095070   

       compare_text source  favourites_count  listed_count  statuses_count  \
81630     -1.714341   0.03         -0.655589     -0.927414       -0.658560 

Train/test split


In [120]:

X = df.drop("verified", axis=1)
y = df["verified"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42)

GBC using all features of the dataset

In [None]:
gbc = GradientBoostingClassifier(random_state=42)

gbc.fit(X_train, y_train)
preds = gbc.predict(X_test)

f1_score_all = round(f1_score(y_test, preds, average='weighted'), 3)
print(f"F1 Score: {f1_score_all}")

ValueError: Classification metrics can't handle a mix of unknown and binary targets

In [74]:
X_train_V = X_train.copy()
print(X_train_V)

    user_lang   lang time_zone location source label  hour  weekday  month  \
548     0.216  0.202      0.49    0.436  0.053     0    12        4      8   
652     0.193   0.31      0.51    0.436  0.036    -1    13        4      8   
43      0.181  0.102      0.51    0.436  0.047    -1    13        4      8   
48      0.193   0.31      0.49    0.564  0.054    -1    13        4      8   
701     0.216  0.102      0.51    0.564  0.054     1    13        4      8   
..        ...    ...       ...      ...    ...   ...   ...      ...    ...   
902     0.193  0.128      0.49    0.436  0.056     0    13        4      8   
938     0.193  0.202      0.49    0.564  0.056    -1    13        4      8   
538     0.181  0.258      0.49    0.564  0.053     1    12        4      8   
111     0.181  0.128      0.51    0.564  0.056    -1    13        4      8   
806     0.216  0.258      0.51    0.436  0.056    -1    13        4      8   

     year  ...  zo8eoi3qjs9s  zxcbwxq9tn  followers_count  stat