In [8]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

In [15]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Load data
df = pd.read_pickle("untracked_data/data_prepro_train_01.pkl")

# --- Manual n-gram toggle (unchanged policy) ---
ngram_range = (1, 2)   # set to (1, 2) when you want uni+bigrams

# 10-fold stratified CV (reproducible)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Baseline model: CountVectorizer + single Decision Tree
baseline_tree = make_pipeline(
    CountVectorizer(ngram_range=ngram_range),
    DecisionTreeClassifier(random_state=42)   # default tree; non-linear baseline
)

# Evaluate
scores = cross_val_score(
    baseline_tree,
    df['text'],
    df['label'],
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)

print(f"Baseline Single Decision Tree accuracy (10-fold CV): {scores.mean():.4f} ")


Baseline Single Decision Tree accuracy (10-fold CV): 0.6734 


In [16]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Load data
df = pd.read_pickle("untracked_data/data_prepro_train_01.pkl")

# --- Manual n-gram toggle (you control this) ---
ngram_range = (1, 1)   # set to (1, 2) when you want uni+bigrams

pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=ngram_range)),
    ('clf', DecisionTreeClassifier(random_state=42))
])

param_grid = {
    'vectorizer__min_df': [1, 2, 5, 10],      # sparse-term removal
    'clf__max_depth': [None, 10, 20, 40],     # control tree depth
    'clf__min_samples_leaf': [1, 2, 5],       # prevent tiny leaves
    'clf__ccp_alpha': [0.0, 1e-5, 1e-4, 1e-3] # post-pruning strength
}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid.fit(df['text'], df['label'])

print("Best params:", grid.best_params_)
print(f"Best mean CV accuracy: {grid.best_score_:.4f}")


Fitting 10 folds for each of 192 candidates, totalling 1920 fits
[CV] END clf__ccp_alpha=0.0, clf__max_depth=None, clf__min_samples_leaf=1, vectorizer__min_df=1; total time=   0.1s
[CV] END clf__ccp_alpha=0.0, clf__max_depth=None, clf__min_samples_leaf=1, vectorizer__min_df=1; total time=   0.1s
[CV] END clf__ccp_alpha=0.0, clf__max_depth=None, clf__min_samples_leaf=1, vectorizer__min_df=1; total time=   0.1s
[CV] END clf__ccp_alpha=0.0, clf__max_depth=None, clf__min_samples_leaf=1, vectorizer__min_df=1; total time=   0.1s
[CV] END clf__ccp_alpha=0.0, clf__max_depth=None, clf__min_samples_leaf=1, vectorizer__min_df=1; total time=   0.1s
[CV] END clf__ccp_alpha=0.0, clf__max_depth=None, clf__min_samples_leaf=1, vectorizer__min_df=1; total time=   0.1s
[CV] END clf__ccp_alpha=0.0, clf__max_depth=None, clf__min_samples_leaf=1, vectorizer__min_df=1; total time=   0.1s
[CV] END clf__ccp_alpha=0.0, clf__max_depth=None, clf__min_samples_leaf=1, vectorizer__min_df=1; total time=   0.1s
[CV] EN