In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

In [8]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Load data
df = pd.read_pickle("untracked_data/data_prepro_train_01.pkl")

# --- Manual n-gram toggle (unchanged policy) ---
ngram_range = (1, 2)   # set to (1, 2) when you want uni+bigrams

# 10-fold stratified CV (reproducible)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Baseline model: CountVectorizer + single Decision Tree
baseline_tree = make_pipeline(
    CountVectorizer(ngram_range=ngram_range),
    DecisionTreeClassifier(random_state=42)   # default tree; non-linear baseline
)

# Evaluate
scores = cross_val_score(
    baseline_tree,
    df['text'],
    df['label'],
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)

print(f"Baseline Decision Tree accuracy (10-fold CV): {scores.mean():.4f} ")


Baseline Decision Tree accuracy (10-fold CV): 0.6734 


In [9]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Load data
df = pd.read_pickle("untracked_data/data_prepro_train_01.pkl")

# --- Manual n-gram toggle (you control this) ---
ngram_range = (1, 1)   # set to (1, 2) when you want uni+bigrams

pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=ngram_range)),
    ('clf', DecisionTreeClassifier(random_state=42))
])

# What to fluctuate:
# - vectorizer__min_df → sparse-term removal (as requested)
# - key tree hyperparams: depth, leaves, feature subsampling, pruning
param_grid = {
    'vectorizer__min_df': [1, 2, 5, 10],        # sparse-term removal
    'clf__criterion': ['gini', 'entropy'],      # split quality
    'clf__max_depth': [None, 20, 40],           # control depth/complexity
    'clf__min_samples_leaf': [1, 2, 5],         # avoid tiny leaves (overfit)
    'clf__max_features': [None, 'sqrt', 'log2'],# feature subsampling at split
    'clf__min_samples_split': [2, 10],          # split robustness
    'clf__ccp_alpha': [0.0, 1e-4, 1e-3]         # post-pruning via cost-complexity
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid.fit(df['text'], df['label'])

print("Best params:", grid.best_params_)
print(f"Best mean CV accuracy: {grid.best_score_:.4f}")


Fitting 10 folds for each of 1296 candidates, totalling 12960 fits
[CV] END clf__ccp_alpha=0.0, clf__criterion=gini, clf__max_depth=None, clf__max_features=None, clf__min_samples_leaf=1, clf__min_samples_split=2, vectorizer__min_df=1; total time=   0.4s
[CV] END clf__ccp_alpha=0.0, clf__criterion=gini, clf__max_depth=None, clf__max_features=None, clf__min_samples_leaf=1, clf__min_samples_split=2, vectorizer__min_df=1; total time=   0.5s
[CV] END clf__ccp_alpha=0.0, clf__criterion=gini, clf__max_depth=None, clf__max_features=None, clf__min_samples_leaf=1, clf__min_samples_split=2, vectorizer__min_df=1; total time=   0.5s
[CV] END clf__ccp_alpha=0.0, clf__criterion=gini, clf__max_depth=None, clf__max_features=None, clf__min_samples_leaf=1, clf__min_samples_split=2, vectorizer__min_df=1; total time=   0.5s
[CV] END clf__ccp_alpha=0.0, clf__criterion=gini, clf__max_depth=None, clf__max_features=None, clf__min_samples_leaf=1, clf__min_samples_split=2, vectorizer__min_df=1; total time=   0.5

KeyboardInterrupt: 