In [None]:
# Q7

import warnings
warnings.filterwarnings("ignore")

import re, pickle, numpy as np, pandas as pd
from collections import Counter, defaultdict

# 0) Data (NLTK twitter_samples)
import nltk
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer

RANDOM_STATE = 42
pos_all = twitter_samples.strings('positive_tweets.json')
neg_all = twitter_samples.strings('negative_tweets.json')
X_all = np.array(pos_all + neg_all, dtype=object)
y_all = np.array([1]*len(pos_all) + [0]*len(neg_all), dtype=int)
df = pd.DataFrame({"text": X_all, "y": y_all}).drop_duplicates(subset="text").reset_index(drop=True)
X_all, y_all = df["text"].values, df["y"].values

# 1) Leakage-safe cleaning/tokenization (drop emoticons & obvious sentiment hashtags)
emo_pat = re.compile(r'[:;=8xX][\-^oO\']?[\)\](DdpP/\(\|\\]|<3|:-\(|:-\)|:\)|:\(|;\)|:D|XD', re.UNICODE)
hash_sent_pat = re.compile(r'#(happy|sad|love|hate|blessed|fail|awesome|terrible|good|bad)\b', re.IGNORECASE)
url_pat = re.compile(r'https?://\S+|www\.\S+'); num_pat = re.compile(r'\b\d+\b')
ttok_clean = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

def clean_tokenize(text: str):
    text = url_pat.sub(" URL ", text)
    text = num_pat.sub(" NUM ", text)
    text = re.sub(r'\brt\b', ' ', text)
    text = emo_pat.sub(" ", text)        # remove emoticons
    text = hash_sent_pat.sub(" ", text)  # remove obvious sentiment hashtags
    toks = ttok_clean.tokenize(text)
    return [t for t in toks if any(ch.isalpha() for ch in t)]

# 2) Feature A: 2 class-conditional frequency features with normalization (Q3)
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin

class PNFreqFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, normalize=True):
        self.normalize = normalize
    def fit(self, X, y=None):
        pos_c, neg_c = Counter(), Counter()
        for text, lab in zip(X, y):
            toks = clean_tokenize(text if isinstance(text, str) else str(text))
            (pos_c if lab==1 else neg_c).update(toks)
        self.pos_freq_ = defaultdict(float, {k: float(v) for k,v in pos_c.items()})
        self.neg_freq_ = defaultdict(float, {k: float(v) for k,v in neg_c.items()})
        self.n_train_  = float(len(X))
        return self
    def transform(self, X):
        rows = []
        for text in X:
            toks = clean_tokenize(text if isinstance(text, str) else str(text))
            pos_sum = sum(self.pos_freq_[t] for t in toks)
            neg_sum = sum(self.neg_freq_[t] for t in toks)
            if self.normalize:
                L = max(len(toks), 1)
                N = self.n_train_ * L   # as specified in Q3
                pos_sum /= N; neg_sum /= N
            rows.append([pos_sum, neg_sum])
        return csr_matrix(np.asarray(rows, dtype=float))

# 3) Feature B: 4 stylistic/emphasis features (non-leaky)
class TextExtraFeats(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.neg_words = set([
            "not","no","never","none","cannot","can't","dont","don't","won't","wont",
            "isn't","aint","ain't","hasn't","havent","haven't","wasn't","weren't","nor",
            "n't","shouldn't","couldn't","wouldn't","doesn't","didn't","hadn't"
        ])
        self.ttok_raw = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)
        self.elong_pat = re.compile(r'(.)\1{2,}', re.UNICODE)
    def _one(self, text: str):
        text = text if isinstance(text, str) else str(text)
        exclam = min(text.count('!'), 3) / 3.0
        toks = self.ttok_raw.tokenize(text)
        word_toks = [t for t in toks if any(ch.isalpha() for ch in t)]
        n = max(len(word_toks), 1)
        neg_ratio   = sum(1 for t in word_toks if t.lower() in self.neg_words) / n
        elong_ratio = sum(1 for t in word_toks if self.elong_pat.search(t)) / n
        caps_ratio  = sum(1 for t in word_toks if len(t)>=2 and t.isupper()) / n
        return [neg_ratio, exclam, elong_ratio, caps_ratio]
    def fit(self, X, y=None): return self
    def transform(self, X):
        feats = [self._one(x) for x in X]
        return csr_matrix(np.asarray(feats, dtype=float))

# 4) Combine to 6 features
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
feat_union = FeatureUnion([("pn2", PNFreqFeatures(normalize=True)),
                           ("sty4", TextExtraFeats())])

# 5) Models + CV on precision
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

lin_scaler = StandardScaler(with_mean=False)
densify    = FunctionTransformer(lambda X: X.toarray(), accept_sparse=True)

pipelines = {
    "LR":  Pipeline([("feat", feat_union), ("scaler", lin_scaler),
                     ("clf", LogisticRegression(max_iter=5000, solver="liblinear", random_state=RANDOM_STATE))]),
    "LinearSVC": Pipeline([("feat", feat_union), ("scaler", lin_scaler),
                           ("clf", LinearSVC(random_state=RANDOM_STATE))]),
    "KNN": Pipeline([("feat", feat_union), ("to_dense", densify),
                     ("scaler", StandardScaler(with_mean=True)),
                     ("clf", KNeighborsClassifier())]),
    "GaussianNB": Pipeline([("feat", feat_union), ("to_dense", densify),
                            ("scaler", StandardScaler(with_mean=True)),
                            ("clf", GaussianNB())]),
    "DecisionTree": Pipeline([("feat", feat_union), ("to_dense", densify),
                              ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE))]),
    "RandomForest": Pipeline([("feat", feat_union), ("to_dense", densify),
                              ("clf", RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))]),
}

param_grids = {
    "LR": {"clf__C": [0.1, 0.5, 1, 2, 3, 5]},
    "LinearSVC": {"clf__C": [0.1, 0.5, 1, 2, 3, 5]},
    "KNN": {"clf__n_neighbors": [3,5,7,11], "clf__weights": ["uniform","distance"]},
    "GaussianNB": {"clf__var_smoothing": [1e-9, 1e-8, 1e-7]},
    "DecisionTree": {"clf__max_depth": [3,5,10,None], "clf__min_samples_leaf": [1,5,10]},
    "RandomForest": {"clf__n_estimators": [200,400], "clf__max_depth": [None,5,10]},
}

def table_from_grid(name, grid):
    res = grid.cv_results_
    return pd.DataFrame({
        "Model":   [name]*len(res["params"]),
        "Params":  res["params"],
        "Precision(mean)": res["mean_test_score"],
        "Precision(std)":  res["std_test_score"],
    })

rows, best_name, best_model, best_prec = [], None, None, -1.0
for name, pipe in pipelines.items():
    grid = GridSearchCV(pipe, param_grids[name], scoring="precision", cv=skf, n_jobs=-1)
    grid.fit(X_all, y_all)
    rows.append(table_from_grid(name, grid))
    print(f"{name}  best_params={grid.best_params_}  best_precision={grid.best_score_:.6f}")
    if grid.best_score_ > best_prec:
        best_name, best_model, best_prec = name, grid.best_estimator_, grid.best_score_

cv6 = pd.concat(rows, ignore_index=True).sort_values("Precision(mean)", ascending=False).reset_index(drop=True)
print("\n=== 5-fold CV (6 features) — sorted by Precision(mean) ===")
print(cv6.to_string(index=False))
print(f"\nSelected winner: {best_name}  with CV precision = {best_prec:.6f}")



[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


LR  best_params={'clf__C': 1}  best_precision=0.704917
LinearSVC  best_params={'clf__C': 0.5}  best_precision=0.704731
KNN  best_params={'clf__n_neighbors': 11, 'clf__weights': 'distance'}  best_precision=0.702371
GaussianNB  best_params={'clf__var_smoothing': 1e-09}  best_precision=0.553584
DecisionTree  best_params={'clf__max_depth': 5, 'clf__min_samples_leaf': 5}  best_precision=0.735469
RandomForest  best_params={'clf__max_depth': 10, 'clf__n_estimators': 400}  best_precision=0.722124

=== 5-fold CV (6 features) — sorted by Precision(mean) ===
       Model                                                Params  Precision(mean)  Precision(std)
DecisionTree     {'clf__max_depth': 5, 'clf__min_samples_leaf': 5}         0.735469        0.011357
DecisionTree     {'clf__max_depth': 5, 'clf__min_samples_leaf': 1}         0.735467        0.010835
DecisionTree    {'clf__max_depth': 5, 'clf__min_samples_leaf': 10}         0.734881        0.011737
RandomForest      {'clf__max_depth': 10, 'clf_

In [None]:
# ============================================================
# Q7 — Six-feature pipeline WITHOUT leakage-safe cleaning
# (emoticons & sentiment hashtags are kept as-is)
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import re, pickle, numpy as np, pandas as pd
from collections import Counter, defaultdict

# 0) Data: NLTK twitter_samples (balanced)
import nltk
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer

RANDOM_STATE = 42
pos_all = twitter_samples.strings('positive_tweets.json')
neg_all = twitter_samples.strings('negative_tweets.json')

# Use the full dataset; DO NOT deduplicate (to stay "non leakage-safe")
X_all = np.array(pos_all + neg_all, dtype=object)
y_all = np.array([1]*len(pos_all) + [0]*len(neg_all), dtype=int)
print(f"Total samples (with possible duplicates): {len(X_all)}")

# 1) Raw tokenization (NO removal of emoticons or hashtags)
#    We only normalize URLs and numbers to stable tokens; keep case lowered.
url_pat = re.compile(r'https?://\S+|www\.\S+')
num_pat = re.compile(r'\b\d+\b')
ttok_raw = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

def raw_tokenize(text: str):
    text = text if isinstance(text, str) else str(text)
    text = url_pat.sub(" URL ", text)
    text = num_pat.sub(" NUM ", text)
    # keep 'rt', keep emoticons, keep hashtags as-is
    toks = ttok_raw.tokenize(text)
    # keep alphabetic or mixed tokens (allow hashtags/emoticons)
    return [t for t in toks if any(ch.isalnum() for ch in t)]

# 2) Feature A: 2 class-conditional frequency features (POS/NEG)
#    IMPORTANT: Estimated per CV training fold (fit) to avoid target leakage across folds.
#    NO removal of emoticons/hashtags here; they are allowed to contribute.
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin

class PNFreqFeatures(BaseEstimator, TransformerMixin):
    """
    Produce 2 features per document:
      - pos_freq_sum: sum of token counts from the positive class
      - neg_freq_sum: sum of token counts from the negative class
    Normalized by N = n_train * len(s) (as in your course Q3 variant).
    """
    def __init__(self, normalize=True):
        self.normalize = normalize

    def fit(self, X, y=None):
        pos_c, neg_c = Counter(), Counter()
        for text, lab in zip(X, y):
            toks = raw_tokenize(text)
            (pos_c if lab==1 else neg_c).update(toks)
        self.pos_freq_ = defaultdict(float, {k: float(v) for k,v in pos_c.items()})
        self.neg_freq_ = defaultdict(float, {k: float(v) for k,v in neg_c.items()})
        self.n_train_  = float(len(X))
        return self

    def transform(self, X):
        rows = []
        for text in X:
            toks = raw_tokenize(text)
            pos_sum = sum(self.pos_freq_[t] for t in toks)
            neg_sum = sum(self.neg_freq_[t] for t in toks)
            if self.normalize:
                L = max(len(toks), 1)
                N = self.n_train_ * L
                pos_sum /= N; neg_sum /= N
            rows.append([pos_sum, neg_sum])
        return csr_matrix(np.asarray(rows, dtype=float))

# 3) Feature B: 4 stylistic/emphasis features (computed on raw text)
#    - negation_ratio
#    - exclamation_intensity (cap 3, scale to [0,1])
#    - elongation_ratio (repeated chars >=3, e.g., 'soooo')
#    - allcaps_ratio (tokens ALL CAPS, len>=2) -- case-insensitive tokenizer above,
#      so use a case-preserving one here.
class TextExtraFeats(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.neg_words = set([
            "not","no","never","none","cannot","can't","dont","don't","won't","wont",
            "isn't","aint","ain't","hasn't","havent","haven't","wasn't","weren't","nor",
            "n't","shouldn't","couldn't","wouldn't","doesn't","didn't","hadn't"
        ])
        self.ttok_cap = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)
        self.elong_pat = re.compile(r'(.)\1{2,}', re.UNICODE)

    def _one(self, text: str):
        text = text if isinstance(text, str) else str(text)
        exclam = min(text.count('!'), 3) / 3.0
        toks = self.ttok_cap.tokenize(text)
        word_toks = [t for t in toks if any(ch.isalpha() for ch in t)]
        n = max(len(word_toks), 1)
        neg_ratio   = sum(1 for t in word_toks if t.lower() in self.neg_words) / n
        elong_ratio = sum(1 for t in word_toks if self.elong_pat.search(t)) / n
        caps_ratio  = sum(1 for t in word_toks if len(t)>=2 and t.isupper()) / n
        return [neg_ratio, exclam, elong_ratio, caps_ratio]

    def fit(self, X, y=None): return self
    def transform(self, X):
        feats = [self._one(x) for x in X]
        return csr_matrix(np.asarray(feats, dtype=float))

# 4) Combine to 6 features: (2 freq + 4 stylistic)
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
feat_union_6 = FeatureUnion([
    ("pn2", PNFreqFeatures(normalize=True)),
    ("sty4", TextExtraFeats())
])

# 5) Models + 5-fold CV (precision)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# For linear models, we can scale sparse safely (with_mean=False).
# For dense models (KNN/NB/Tree/RF), densify and optionally scale.
lin_scaler = StandardScaler(with_mean=False)
densify    = FunctionTransformer(lambda X: X.toarray(), accept_sparse=True)

pipelines = {
    "LR":  Pipeline([("feat", feat_union_6), ("scaler", lin_scaler),
                     ("clf", LogisticRegression(max_iter=5000, solver="liblinear", random_state=RANDOM_STATE))]),
    "LinearSVC": Pipeline([("feat", feat_union_6), ("scaler", lin_scaler),
                           ("clf", LinearSVC(random_state=RANDOM_STATE))]),
    "KNN": Pipeline([("feat", feat_union_6), ("to_dense", densify),
                     ("scaler", StandardScaler(with_mean=True)),
                     ("clf", KNeighborsClassifier())]),
    "GaussianNB": Pipeline([("feat", feat_union_6), ("to_dense", densify),
                            ("scaler", StandardScaler(with_mean=True)),
                            ("clf", GaussianNB())]),
    "DecisionTree": Pipeline([("feat", feat_union_6), ("to_dense", densify),
                              ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE))]),
    "RandomForest": Pipeline([("feat", feat_union_6), ("to_dense", densify),
                              ("clf", RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))]),
}

param_grids = {
    "LR": {"clf__C": [0.1, 0.5, 1, 2, 3, 5]},
    "LinearSVC": {"clf__C": [0.1, 0.5, 1, 2, 3, 5]},
    "KNN": {"clf__n_neighbors": [3,5,7,11], "clf__weights": ["uniform","distance"]},
    "GaussianNB": {"clf__var_smoothing": [1e-9, 1e-8, 1e-7]},
    "DecisionTree": {"clf__max_depth": [3,5,10,None], "clf__min_samples_leaf": [1,5,10]},
    "RandomForest": {"clf__n_estimators": [200,400], "clf__max_depth": [None,5,10]},
}

def cv_table(name, grid):
    res = grid.cv_results_
    return pd.DataFrame({
        "Model":   [name]*len(res["params"]),
        "Params":  res["params"],
        "Precision(mean)": res["mean_test_score"],
        "Precision(std)":  res["std_test_score"],
    })

rows, best_name, best_model, best_prec = [], None, None, -1.0
for name, pipe in pipelines.items():
    grid = GridSearchCV(pipe, param_grids[name], scoring="precision", cv=skf, n_jobs=-1)
    grid.fit(X_all, y_all)
    rows.append(cv_table(name, grid))
    print(f"{name:>11s}  best_params={grid.best_params_}  best_precision={grid.best_score_:.6f}")
    if grid.best_score_ > best_prec:
        best_name, best_model, best_prec = name, grid.best_estimator_, grid.best_score_

cv6_raw = pd.concat(rows, ignore_index=True).sort_values("Precision(mean)", ascending=False).reset_index(drop=True)
print("\n=== 5-fold CV (6 features, NO leakage-safe cleaning) — sorted by Precision(mean) ===")
print(cv6_raw.to_string(index=False))
print(f"\nSelected winner: {best_name}  with CV precision = {best_prec:.6f}")



[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


Total samples (with possible duplicates): 10000
         LR  best_params={'clf__C': 1}  best_precision=0.722041
  LinearSVC  best_params={'clf__C': 0.1}  best_precision=0.722127
        KNN  best_params={'clf__n_neighbors': 11, 'clf__weights': 'uniform'}  best_precision=0.712981
 GaussianNB  best_params={'clf__var_smoothing': 1e-09}  best_precision=0.555486
DecisionTree  best_params={'clf__max_depth': 5, 'clf__min_samples_leaf': 1}  best_precision=0.733317
RandomForest  best_params={'clf__max_depth': 10, 'clf__n_estimators': 200}  best_precision=0.729796

=== 5-fold CV (6 features, NO leakage-safe cleaning) — sorted by Precision(mean) ===
       Model                                                Params  Precision(mean)  Precision(std)
DecisionTree     {'clf__max_depth': 5, 'clf__min_samples_leaf': 1}         0.733317        0.023321
DecisionTree     {'clf__max_depth': 5, 'clf__min_samples_leaf': 5}         0.732797        0.023536
DecisionTree    {'clf__max_depth': 5, 'clf__min_sampl

PicklingError: Can't pickle <function <lambda> at 0x000002619A910E00>: attribute lookup <lambda> on __main__ failed