In [1]:
# (Re)load IPython's autoreload extension and use mode 2, which re-imports
# edited modules before each cell runs, so changes under `src/` are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
import os
from src.utils import load_checkpoint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from io import StringIO
import numpy as np

# Load the extracted dataset checkpoint; the helper prints the path and the
# shape of the frame it loaded.
CHECKPOINT_PATH = "src/extract/dataframe_checkpoint_20.pickle"
df = load_checkpoint(CHECKPOINT_PATH)

Loading checkpoint from src/extract/dataframe_checkpoint_20.pickle
Loaded 19262 rows and 3 columns.
Loaded 19262 rows and 3 columns.


In [5]:
# features - ast tfidf vectorizer
# model - logistic regression
# Convert x_ast to string using .show()
def ast_to_str(ast_obj):
    """Return the text that ``ast_obj.show(buf=...)`` writes, as one string.

    ``ast_obj`` only needs a ``show`` method accepting a writable text
    buffer through the ``buf`` keyword.
    """
    with StringIO() as sink:
        ast_obj.show(buf=sink)
        # Read the contents before the context manager closes the buffer.
        return sink.getvalue()

# Render every `x_ast` object to text once and keep it as a column.
df['ast_str'] = df['x_ast'].map(ast_to_str)

# Text features and labels.
X, y = df['ast_str'], df['target']



In [None]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Candidate text inputs, keyed by the DataFrame column they come from.
inputs = {column: df[column] for column in ('ast_str', 'x_string')}

results = {}


def _make_tfidf(analyzer):
    """Build a TF-IDF vectorizer over 3- to 6-grams seen in at least 5 documents."""
    return TfidfVectorizer(analyzer=analyzer, max_features=None, min_df=5, ngram_range=(3, 6))


# One independent vectorizer per (input column, analyzer) pair being compared.
vectorizers = {
    combo: _make_tfidf(combo[1])
    for combo in (('x_string', 'word'), ('x_string', 'char'), ('ast_str', 'word'))
}

# Fit each vectorizer on its column and keep the resulting sparse matrix.
# The loop names are kept as-is: they stay bound after the loop finishes.
X_matrices = {}
for (input_name, analyzer), vectorizer in vectorizers.items():
    print(f"Vectorizing {input_name} with {analyzer} analyzer")
    X_matrices[(input_name, analyzer)] = vectorizer.fit_transform(df[input_name])

Vectorizing x_string with word analyzer
Vectorizing x_string with char analyzer
Vectorizing ast_str with word analyzer


In [None]:
# Refit the word-level TF-IDF for `ast_str`, looking the vectorizer up by key.
# The bare `vectorizer` name is only whatever the previous cell's loop bound
# last, so relying on it breaks as soon as cells run in a different order or
# the `vectorizers` dict is reordered.
ast_word_key = ('ast_str', 'word')
X_matrices[ast_word_key] = vectorizers[ast_word_key].fit_transform(df['ast_str'])

In [17]:
# Prepare model variants
models = {
    'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=500, class_weight='balanced'),
    'LinearSVC': LinearSVC(max_iter=500, class_weight='balanced'),
}

# Maps (input_name, analyzer, model_name) -> classification_report dict.
results = {}

for (input_name, analyzer), X_mat in X_matrices.items():
    # Same seed and stratification for every feature matrix, so each
    # configuration is evaluated with an identical split recipe.
    X_train, X_test, y_train, y_test = train_test_split(
        X_mat, y, test_size=0.2, random_state=42, stratify=y
    )
    for model_name, model in models.items():
        print(f"Training {model_name} on {input_name} with {analyzer} analyzer")
        # Both estimators in `models` are fitted on the sparse TF-IDF matrix
        # as-is. The former dense-conversion branch for
        # "HistGradientBoostingClassifier" could never run because no such
        # key exists in `models`, so it has been removed.
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
        except Exception as e:
            # Best effort: report the failing combination and keep comparing the rest.
            print(f"Error for {input_name}, {analyzer}, {model_name}: {e}")
            continue
        # Keep the machine-readable report; print the human-readable one.
        results[(input_name, analyzer, model_name)] = classification_report(
            y_test, y_pred, output_dict=True
        )
        print(f"Input: {input_name}, Analyzer: {analyzer}, Model: {model_name}")
        print(classification_report(y_test, y_pred))

Training LogisticRegression on x_string with word analyzer
Input: x_string, Analyzer: word, Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.90      0.84      0.87      3098
           1       0.48      0.63      0.54       755

    accuracy                           0.79      3853
   macro avg       0.69      0.73      0.71      3853
weighted avg       0.82      0.79      0.80      3853

Training LinearSVC on x_string with word analyzer
Input: x_string, Analyzer: word, Model: LinearSVC
              precision    recall  f1-score   support

           0       0.94      0.73      0.82      3098
           1       0.43      0.81      0.56       755

    accuracy                           0.75      3853
   macro avg       0.68      0.77      0.69      3853
weighted avg       0.84      0.75      0.77      3853

Training LogisticRegression on x_string with char analyzer
Input: x_string, Analyzer: char, Model: LogisticRegression
          

In [8]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack, csr_matrix
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score, roc_curve, auc
from sklearn.model_selection import cross_val_predict
import pandas as pd
import pickle

# Reload the dataset checkpoint for the combined-feature experiments.
df = load_checkpoint("src/extract/dataframe_checkpoint_20.pickle")

# Load the externally built sparse feature matrix.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
# NOTE(review): presumably one row per row of `df`, in the same order; confirm.
with open("src/features/matrices/matrix_man-str.pickle", "rb") as matrix_file:
    man_str_matrix: csr_matrix = pickle.load(matrix_file)
    
# NOTE(review): an identical `ast_to_str` is defined in an earlier cell of this
# notebook; consider keeping a single definition.
def ast_to_str(ast_obj):
    """Capture the output of ``ast_obj.show(buf=...)`` and return it as a string."""
    with StringIO() as text_buffer:
        ast_obj.show(buf=text_buffer)
        # Must read the value while the buffer is still open.
        return text_buffer.getvalue()

# Render each `x_ast` object to text on the freshly reloaded frame.
df['ast_str'] = df['x_ast'].map(ast_to_str)

# Text features and labels.
X, y = df['ast_str'], df['target']
    

Loading checkpoint from src/extract/dataframe_checkpoint_20.pickle
Loaded 19262 rows and 3 columns.
Loaded 19262 rows and 3 columns.


In [9]:
class CombinedFeatures(BaseEstimator, TransformerMixin):
    """Combine TF-IDF of x_string with manual features by weight.

    The TF-IDF block is scaled by ``1 - weight`` and the manual-feature block
    by ``weight``; the two are stacked side by side into one CSR matrix.

    Parameters
    ----------
    analyzer, ngram_range, min_df
        Passed unchanged to ``TfidfVectorizer``.
    weight
        Scale applied to the manual features (TF-IDF gets ``1 - weight``).
    manual_matrix
        Optional sparse matrix holding the manual features. When ``None``
        (the default) the module-level ``man_str_matrix`` is used, which is
        the previous behaviour.

    Notes
    -----
    Rows of the manual matrix are picked with the *index labels* of ``X``, so
    those labels must be integer row positions inside that matrix.
    """
    def __init__(self, analyzer: str = 'char', ngram_range: tuple = (4, 5), min_df: int = 1,
                 weight: float = 0.1, manual_matrix=None) -> None:
        # Parameters are stored untouched so get_params()/clone() keep working.
        self.analyzer = analyzer
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.weight = weight
        self.manual_matrix = manual_matrix

    def fit(self, X: pd.DataFrame, y=None):
        """Fit the TF-IDF vocabulary on ``X['x_string']`` and return ``self``."""
        text = X['x_string']
        self.vectorizer = TfidfVectorizer(
            analyzer=self.analyzer,
            ngram_range=self.ngram_range,
            min_df=self.min_df
        )
        self.vectorizer.fit(text)
        return self

    def _manual_rows(self, X: pd.DataFrame):
        """Return the manual-feature rows addressed by the index labels of ``X``."""
        source = man_str_matrix if self.manual_matrix is None else self.manual_matrix
        row_ids = X.index.to_numpy()
        if row_ids.size:
            labels_ok = (
                np.issubdtype(row_ids.dtype, np.integer)
                and row_ids.min() >= 0
                and row_ids.max() < source.shape[0]
            )
            if not labels_ok:
                # Negative labels would otherwise wrap around silently and pair
                # samples with the wrong manual features.
                raise IndexError(
                    "CombinedFeatures needs X.index to hold integer row positions "
                    f"in [0, {source.shape[0]}) of the manual feature matrix"
                )
        return source[row_ids, :]

    def transform(self, X: pd.DataFrame):
        """Return ``[(1 - weight) * tfidf | weight * manual]`` as a CSR matrix."""
        text = X['x_string']
        tfidf_part = self.vectorizer.transform(text).multiply(1.0 - self.weight)
        man_part = self._manual_rows(X).multiply(self.weight)
        return hstack([tfidf_part, man_part], format="csr")

# Class counts; their ratio up-weights the positive class for XGBoost.
neg, pos = ((y == label).sum() for label in (0, 1))
scale_pos_weight = neg / pos

# The scikit-learn estimators all use the built-in class re-weighting.
balanced = {'class_weight': 'balanced'}

# Baseline estimators, all with default hyperparameters.
initial_models = {
    'svm': LinearSVC(**balanced),
    'logistic': LogisticRegression(solver='liblinear', **balanced),
    'decision_tree': DecisionTreeClassifier(**balanced),
    'random_forest': RandomForestClassifier(**balanced),
    'xgboost': XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=scale_pos_weight,
    ),
}

results_initial = {}
# Single-column frame: CombinedFeatures reads the text via X['x_string'].
X_df = df.loc[:, ['x_string']]

# for name, clf in initial_models.items():
#     pipe = Pipeline([('combined', CombinedFeatures()), ('clf', clf)])
#     X_tr, X_ts, y_tr, y_ts = train_test_split(
#         X_df, y, test_size=0.2, stratify=y, random_state=42
#     )
#     try:
#         pipe.fit(X_tr, y_tr)
#         preds = pipe.predict(X_ts)
#     except Exception as err:
#         print(f"Error training {name}: {err}")
#         continue
#     rpt = classification_report(y_ts, preds, output_dict=True)
#     results_initial[name] = rpt
#     print(f"\n{name} report:\n", classification_report(y_ts, preds))

In [13]:
def _feature_space():
    """Return fresh search dimensions for the `combined` step, shared by every model."""
    return {
        'combined__min_df': Integer(1, 10),
        'combined__weight': Real(0.0, 1.0, prior='uniform'),
    }


# Per-model search space: the shared feature dimensions plus the classifier's own.
# Keys follow the Pipeline step names (`combined__*`, `classifier__*`).
search_spaces = {
    'svm': {
        **_feature_space(),
        'classifier__C': Real(1e-6, 1e+6, prior='log-uniform'),
        'classifier__loss': Categorical(['hinge', 'squared_hinge']),
    },
    'logistic': {
        **_feature_space(),
        'classifier__C': Real(1e-6, 1e+6, prior='log-uniform'),
    },
    'decision_tree': {
        **_feature_space(),
        'classifier__max_depth': Integer(1, 50),
        'classifier__min_samples_split': Integer(2, 20),
        'classifier__min_samples_leaf': Integer(1, 20),
        'classifier__criterion': Categorical(['gini', 'entropy']),
    },
    'random_forest': {
        **_feature_space(),
        'classifier__max_depth': Integer(1, 50),
        'classifier__min_samples_split': Integer(2, 20),
        'classifier__min_samples_leaf': Integer(1, 20),
        'classifier__max_features': Categorical(['sqrt', 'log2', None]),
    },
    'xgboost': {
        **_feature_space(),
        'classifier__n_estimators': Integer(50, 500),
        'classifier__max_depth': Integer(1, 10),
        'classifier__min_child_weight': Integer(1, 10),
        'classifier__gamma': Real(0.0, 5.0),
        'classifier__subsample': Real(0.5, 1.0),
        'classifier__scale_pos_weight': Real(1.0, 10.0, prior='uniform'),
    },
}

# One seed for the estimators and the Bayesian search, so a re-run reproduces
# the same candidates and scores (previously nothing in this cell was seeded).
RANDOM_STATE = 42
CV_FOLDS = 3

models = {
    'svm': LinearSVC(class_weight='balanced', random_state=RANDOM_STATE),
    'logistic': LogisticRegression(solver='liblinear', class_weight='balanced', random_state=RANDOM_STATE),
    'decision_tree': DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE),
    'random_forest': RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE),
    'xgboost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE),
}

# Maps model name -> dict with the CV table, best score/params/pipeline and,
# when available, the cross-validated ROC data.
results_skopt = {}

# Number of leading entries of `models` to skip; set to 0 to search every model.
skip = 3

# Hoisted loop invariants: the feature frame and the positive-class F1 scorer.
X_text = df[['x_string']]
f1_positive = make_scorer(f1_score, pos_label=1)

for name, estimator in list(models.items())[skip:]:
    pipe = Pipeline([
        ('combined', CombinedFeatures()),
        ('classifier', estimator),
    ])
    search = BayesSearchCV(
        pipe,
        search_spaces[name],
        scoring=f1_positive,
        cv=CV_FOLDS,
        n_jobs=-1,
        return_train_score=False,
        random_state=RANDOM_STATE,
    )
    print(f"Running BayesSearchCV for {name}")
    try:
        search.fit(X_text, y)

        best_pipe = search.best_estimator_
        entry = {
            'cv_results': pd.DataFrame(search.cv_results_),
            'best_f1_score': search.best_score_,
            'best_params': search.best_params_,
            'best_estimator': best_pipe,
        }

        # Pick whichever continuous score the winning classifier exposes.
        clf = best_pipe.named_steps['classifier']
        if hasattr(clf, 'predict_proba'):
            score_method = 'predict_proba'
        elif hasattr(clf, 'decision_function'):
            score_method = 'decision_function'
        else:
            score_method = None
            print(f"Warning: Classifier for {name} has no predict_proba or decision_function; skipping ROC AUC.")

        if score_method is not None:
            # NOTE(review): the hyperparameters were tuned on this same data,
            # so this cross-validated AUC is an optimistic estimate.
            raw_scores = cross_val_predict(best_pipe, X_text, y, cv=CV_FOLDS, method=score_method)
            # predict_proba returns one column per class; keep the positive class.
            y_scores = raw_scores[:, 1] if score_method == 'predict_proba' else raw_scores
            fpr, tpr, _ = roc_curve(y, y_scores, pos_label=1)
            auc_score = auc(fpr, tpr)
            entry.update({
                'roc_auc_cv': auc_score,
                'fpr_cv': fpr,
                'tpr_cv': tpr,
            })
            print(f"{name} best estimator ROC AUC (CV): {auc_score:.4f}")

        results_skopt[name] = entry
        # Cumulative checkpoint: each per-model file holds every result so far.
        with open(f"skopt_results_{name}.pickle", "wb") as f:
            pickle.dump(results_skopt, f)

    except Exception as err:
        # Best effort: report the failure and move on to the next model.
        print(f"Error in {name}: {err}")

Running BayesSearchCV for random_forest


KeyboardInterrupt: 

In [11]:
# Persist every collected search result in a single pickle file.
# NOTE(review): each entry stores a fitted pipeline under 'best_estimator', so
# the file is only loadable where the same class definitions are available.
with open("skopt_results.pickle", "wb") as results_file:
    pickle.dump(results_skopt, results_file)