In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
nelgiriyewithana_mcdonalds_store_reviews_path = kagglehub.dataset_download('nelgiriyewithana/mcdonalds-store-reviews')

print('Data source import complete.')


# 1. Setup

In [None]:
# --- Main imports, run always ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

### Prepare the data with real reviews

In [None]:
# --- Prepare the data ---
# Read the raw reviews, add a whitespace-token word count, keep only the three
# columns used downstream and drop rows with missing values. Built as a single
# chain so the column-subset frame is never mutated in place (an in-place
# dropna on a slice can trigger SettingWithCopyWarning).
df_full = (
    pd.read_csv('/kaggle/input/mcdonalds-store-reviews/McDonald_s_Reviews.csv', encoding='latin-1')
    .assign(word_count=lambda raw: raw['review'].fillna('').apply(lambda text: len(text.split())))
    .loc[:, ['review', 'rating', 'word_count']]
    .dropna()
)

# --- Extract city and state (disabled) ---
# NOTE: this needs the `store_address` column, which is not kept above;
# re-add it to the column selection before enabling.
# tmp = df_full['store_address'].str.extract(
#     r',\s*([A-Za-z\s]+?),\s*([A-Z]{2})(?:\s+\d{5}(?:-\d{4})?)?(?:\s*,.*)?$'
# )
# df_full['city'] = tmp[0].str.strip()
# df_full['state'] = tmp[1]

df_full.info()

In [None]:
df_full['word_count'].describe()

In [None]:
# --- Keep only reviews that are 15 to 54 words long ---
df_interim = df_full.query("15 <= word_count <= 54").reset_index(drop=True)

# Left-closed 10-word buckets: [15, 25), [25, 35), [35, 45), [45, 55)
df_interim['word_count_bin'] = pd.cut(
    df_interim['word_count'], bins=[15, 25, 35, 45, 55], right=False
)
bin_intervals = df_interim['word_count_bin']

# --- Derive inclusive integer bounds and a string label from each interval ---
df_interim['word_count_min'] = bin_intervals.apply(lambda interval: interval.left).astype(int)
df_interim['word_count_max'] = bin_intervals.apply(lambda interval: interval.right - 1).astype(int)
# Gretel does not accept interval objects, so keep only the string form
df_interim['word_count_bin_str'] = bin_intervals.astype(str)
df_interim = df_interim.drop(columns=['word_count_bin'])

# --- Stratified split: 25 train + 10 test rows per (bucket, rating) group ---
train_frames, test_frames = [], []

for (bin_label, rating_label), group in df_interim.groupby(['word_count_bin_str', 'rating']):
    n_rows = len(group)
    if n_rows < 35:
        raise ValueError(f"({bin_label}, {rating_label}) has only {n_rows} rows")

    # One reproducible shuffle, then two disjoint slices
    shuffled = group.sample(frac=1, random_state=42)
    train_frames.append(shuffled.iloc[:25])
    test_frames.append(shuffled.iloc[25:35])

df_real_train = pd.concat(train_frames).reset_index(drop=True)
df_real_test = pd.concat(test_frames).reset_index(drop=True)

df_real_train.to_csv("df_train.csv", index=False)
df_real_test.to_csv("df_test.csv", index=False)

print("Train shape:", df_real_train.shape)
print("Test shape:", df_real_test.shape)

In [None]:
df_real_train.info()

### Generate synthetic responses with Gretel.AI (API call, key required)

In [None]:
!pip install -q gretel-client

# Ignore error messages

In [None]:
from gretel_client.navigator_client import Gretel
from gretel_client.data_designer import columns as C
from gretel_client.data_designer import params as P
from gretel_client.data_designer.params import GenerationParameters, ModelConfig

gretel_api_key = user_secrets.get_secret("gretel_api_key")
gretel = Gretel(api_key=gretel_api_key)

In [None]:
# --- Sampler columns for the review topic and style ---
def add_common_columns(designer):
    """Add the categorical sampler columns ``topic`` and ``style`` to ``designer``.

    Each column is a ``C.SamplerColumn`` of type ``P.SamplerType.CATEGORY``
    whose values are the fixed lists below. The columns are added in the
    order topic, style; nothing is returned.
    """
    category_columns = (
        ("topic", ["service speed","food quality","order accuracy", "cleanliness","staff attitude","convenience"]),
        ("style", ["direct","storytelling","descriptive", "comparative","evaluative"]),
    )
    for column_name, choices in category_columns:
        sampler = C.SamplerColumn(
            name=column_name,
            type=P.SamplerType.CATEGORY,
            params=P.CategorySamplerParams(values=choices),
        )
        designer.add_column(sampler)

# --- Wrapper for synthetic data designer ---
def generate_bucketed(df_seed, prefix: str):
    """
    Generate synthetic reviews for every word-count bucket present in ``df_seed``.

    For each entry of the module-level ``bucket_map``, the rows of ``df_seed``
    whose ``word_count_bin_str`` equals the bucket label are used (without the
    real ``review`` text) as the seed dataset of a Gretel data-designer run,
    and one synthetic review is requested per seed row.

    Parameters
    ----------
    df_seed : pandas.DataFrame
        Must contain ``review`` and ``word_count_bin_str``. The prompt template
        additionally references ``rating``, ``word_count_min`` and
        ``word_count_max``.
    prefix : str
        Prepended to every model alias so train/test runs do not clash.

    Returns
    -------
    pandas.DataFrame
        The per-bucket result frames concatenated with a fresh index.

    Raises
    ------
    ValueError
        If none of the buckets in ``bucket_map`` matches a row of ``df_seed``.
    """
    df_synth = df_seed.drop(columns='review')   # drop real review

    # The template does not depend on the bucket, so build it once.
    prompt_tpl = (
        "Write a {{ rating }} review of a fast-food restaurant.\n"
        "- Length: {{ word_count_min }}–{{ word_count_max }} words.\n"
        "- Focus on: {{ topic }}\n"
        "- Style: {{ style }}\n"
        "- Do NOT mention the city, state, or restaurant name.\n"
        "Review:"
    )

    synthetic_frames = []

    for bin_str, (_tok_min, tok_max) in bucket_map.items():
        df_bin = df_synth[df_synth['word_count_bin_str'] == bin_str]
        if df_bin.empty:
            continue

        # Bucket labels look like "[15, 25)". Collapse every run of
        # non-alphanumeric characters into "_" so the alias is a plain
        # identifier such as "train_15_25" (replacing only "-" left the
        # brackets, comma and space in place).
        alias = f"{prefix}_" + re.sub(r'[^0-9A-Za-z]+', '_', bin_str).strip('_')

        cfg = ModelConfig(
            alias=alias,
            model_name="bedrock/meta-llama/Llama-3.1-70B-Instruct",
            generation_parameters=GenerationParameters(temperature=0.9, max_tokens=tok_max, top_p=0.90)
        )

        designer = (
            gretel.data_designer
                   .new(model_suite="llama-3.x", model_configs=[cfg])
                   .with_seed_dataset(df_bin)
        )

        add_common_columns(designer)

        designer.add_column(
            C.LLMTextColumn(
                name="review",
                output_type="text",
                model_alias=alias,
                prompt=prompt_tpl,
                system_prompt="You recently visited a McDonald's restaurant."
            )
        )

        # Blocks until the run has finished, then collects its result frame.
        run = designer.create(num_records=len(df_bin), wait_until_done=True)
        synthetic_frames.append(run.dataset.df)

    if not synthetic_frames:
        # pd.concat([]) would raise the same exception type with a less helpful message.
        raise ValueError(
            "No rows of df_seed match any bucket in bucket_map; nothing to generate."
        )

    return pd.concat(synthetic_frames, ignore_index=True)

# Maps each word-count bin label (as stored in `word_count_bin_str`) to a
# (tok_min, tok_max) pair. generate_bucketed() passes tok_max to the model as
# `max_tokens`; tok_min is unpacked there but not otherwise used.
bucket_map = {
    "[15, 25)": (20, 35),
    "[25, 35)": (33, 48),
    "[35, 45)": (46, 62),
    "[45, 55)": (59, 75)
}

In [None]:
# --- Generate the synthetic counterpart of each split ---
df_synth_train = generate_bucketed(df_real_train, prefix="train")
df_synth_test  = generate_bucketed(df_real_test,  prefix="test")

# --- Tag every frame with its origin ---
labelled_frames = (
    (df_real_train,  'real'),
    (df_real_test,   'real'),
    (df_synth_train, 'synthetic'),
    (df_synth_test,  'synthetic'),
)
for frame, origin in labelled_frames:
    frame['source'] = origin

# --- Stack real and synthetic rows per split ---
df_train = pd.concat([df_real_train, df_synth_train], ignore_index=True)
df_test  = pd.concat([df_real_test , df_synth_test ], ignore_index=True)

# --- Recount words so synthetic rows carry their actual length ---
for merged in (df_train, df_test):
    merged['word_count'] = merged['review'].fillna('').str.split().str.len()

In [None]:
# Save for later
df_train.to_csv("df_train.csv", index=False)
df_test.to_csv("df_test.csv", index=False)

### Reviews length distributions

In [None]:
# Optional: re-load the data to reproduce the plots
df_train = pd.read_csv("df_train.csv")
df_test = pd.read_csv("df_test.csv")

In [None]:
plt.style.use('default')
fig, axes = plt.subplots(1, 2, figsize=(18, 4))

sns.histplot(df_train['word_count'][df_train['source'] == 'real'], kde=True, bins=39, ax=axes[0], linewidth=1)
axes[0].set_title('Distribution of Word Count | human reviews | train', loc='left', fontfamily='monospace')

sns.histplot(df_train['word_count'][df_train['source'] == 'synthetic'], kde=True, bins=39, ax=axes[1], linewidth=1)
axes[1].set_title('Distribution of Word Count | synthetic reviews | train', loc='left', fontfamily='monospace')

plt.tight_layout()

fig.axes[0].figure.savefig('human_reviews_wordcount_train_data.png')
fig.axes[1].figure.savefig('synthetic_reviews_wordcount_train_data.png')

plt.show()

In [None]:
plt.style.use('default')
fig, axes = plt.subplots(1, 2, figsize=(18, 4))

sns.histplot(df_test['word_count'][df_test['source'] == 'real'], kde=True, bins=39, ax=axes[0], linewidth=1)
axes[0].set_title('Distribution of Word Count | human reviews | test', loc='left', fontfamily='monospace')

sns.histplot(df_test['word_count'][df_test['source'] == 'synthetic'], kde=True, bins=39, ax=axes[1], linewidth=1)
axes[1].set_title('Distribution of Word Count | synthetic reviews | test', loc='left', fontfamily='monospace')

plt.tight_layout()
plt.show()

# 2. Analysis

### Sliding-window coherence

In [None]:
# --- Optional: re-load the data ---
df_train = pd.read_csv("df_train.csv")

In [None]:
# --- Make separate dataframe for analysis ---
# Take an explicit copy: later cells add columns to df_analysis, and assigning
# into a column slice of df_train triggers SettingWithCopyWarning.
df_analysis = df_train[['review','rating','source','word_count','word_count_bin_str']].copy()
df_analysis.info()

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize relies on the Punkt tokenizer data. Recent NLTK releases look
# it up under 'punkt_tab', older ones under 'punkt'; request both so the
# tokenizer works on either. An unknown resource name is only reported, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sentence-embedding model used by analyze_review_coherence()
model = SentenceTransformer('all-MiniLM-L6-v2')

# Ignore errors

In [None]:
# --- Sliding window functions ---
def create_sliding_windows(tokens, window_size=3, step=1):
    """Return every full-length window of ``tokens``, advancing ``step`` items at a time.

    Only windows of exactly ``window_size`` items are produced, so trailing
    tokens that do not fill a complete window are left out and an input
    shorter than ``window_size`` yields an empty list.
    """
    last_start = len(tokens) - window_size
    return [
        tokens[start:start + window_size]
        for start in range(0, last_start + 1, step)
    ]

def compute_coherence_scores(embeddings):
    """Cosine similarity between each embedding and the one that follows it.

    Returns a list with ``len(embeddings) - 1`` similarities. With fewer than
    two embeddings there is no adjacent pair, and ``[1.0]`` is returned.
    """
    if len(embeddings) < 2:
        return [1.0]  # no adjacent pair to compare

    return [
        cosine_similarity([current], [following])[0][0]
        for current, following in zip(embeddings[:-1], embeddings[1:])
    ]

def analyze_review_coherence(review_text):
    """Sliding-window coherence metrics for a single review.

    The review is lower-cased and tokenized with ``word_tokenize``, cut into
    windows by ``create_sliding_windows`` using the module-level
    ``window_size`` and ``step``, and each window is embedded with the
    module-level ``model``. Coherence scores are the cosine similarities of
    adjacent windows (see ``compute_coherence_scores``).

    Parameters
    ----------
    review_text : str or missing value
        The review text. NaN/None or a blank string counts as "no review".

    Returns
    -------
    dict
        Always has the same keys:
        ``num_windows`` (int), ``coherence_scores`` (list),
        ``median_coherence`` (float), ``coherence_IQR`` (float, 75th minus
        25th percentile of the scores) and ``word_count`` (number of tokens).
        For a missing or blank review both metrics are NaN. For a review too
        short to form two windows the scores are ``[1.0]``, giving a median
        of 1.0 and an IQR of 0.0.
    """
    # Missing or blank review: return the SAME keys as the normal path so that
    # callers reading 'median_coherence' / 'coherence_IQR' do not hit a KeyError.
    if pd.isna(review_text) or len(str(review_text).strip()) == 0:
        return {
            'num_windows': 0,
            'coherence_scores': [],
            'median_coherence': np.nan,
            'coherence_IQR': np.nan,
            'word_count': 0
        }

    # -- Tokenize the review --
    tokens = word_tokenize(str(review_text).lower())
    word_count = len(tokens)

    # -- Create sliding windows --
    windows = create_sliding_windows(tokens, window_size, step)
    num_windows = len(windows)

    # -- Convert windows to text and get embeddings --
    if windows:
        window_texts = [' '.join(window) for window in windows]
        embeddings = model.encode(window_texts, convert_to_numpy=True)
    else:
        # Review shorter than one window: nothing to embed, skip the model call.
        embeddings = []

    # -- Coherence scores --
    coherence_scores = compute_coherence_scores(embeddings)

    # -- Metrics --
    if coherence_scores:
        median_coherence = float(np.median(coherence_scores))
        q75, q25 = np.percentile(coherence_scores, [75, 25])
        coherence_IQR = float(q75 - q25)
    else:
        # Defensive default; these names must match the ones returned below.
        median_coherence = 1.0
        coherence_IQR = 0.0

    return {
        'num_windows': num_windows,
        'coherence_scores': coherence_scores,
        'median_coherence': median_coherence,
        'coherence_IQR': coherence_IQR,
        'word_count': word_count
    }

In [None]:
# --- Sliding window parameters (read as globals by analyze_review_coherence) ---
window_size = 7
step = 4

# --- Score every review ---
print("Processing short reviews for coherence analysis...")
coherence_results = df_analysis['review'].apply(analyze_review_coherence)

# --- Unpack the per-review result dicts into one column per metric ---
for metric in ('num_windows', 'median_coherence', 'coherence_IQR'):
    df_analysis[metric] = coherence_results.apply(lambda result, key=metric: result[key])

In [None]:
# --- Analysis summary ---
print("\nCoherence summary (human vs synthetic)")
print("--------------------------------------")
summary = df_analysis.groupby('source')[['median_coherence', 'coherence_IQR', 'num_windows']].describe()
print(summary.round(3))

# --- Mann-Whitney test ---
from scipy.stats import mannwhitneyu
# Drop missing scores first: the plots below exclude them as well, and a
# single NaN would otherwise turn the test result into NaN.
h = df_analysis.loc[df_analysis['source']=='real',   'median_coherence'].dropna()
s = df_analysis.loc[df_analysis['source']=='synthetic','median_coherence'].dropna()
stat, p = mannwhitneyu(h, s, alternative='two-sided')
# .3g (as in the entropy tests) keeps very small p-values readable instead of "0.000".
print(f"\nMedian coherence difference (human vs synthetic)  U={stat:.1f}, p={p:.3g}")

In [None]:
# --- Key plots ---
# Rows without a coherence score cannot be plotted
df_clean_for_plotting = df_analysis[df_analysis['median_coherence'].notna()].copy()

plt.style.use('default')
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
colors = {'real': 'lightblue', 'synthetic': 'moccasin'}

# -- Box plot --
sns.boxplot(x='source', y='median_coherence', data=df_clean_for_plotting, ax=axes[0], palette=colors,linewidth=1)
axes[0].set_title('Box Plots', loc='left', fontfamily='monospace')
axes[0].grid(True, alpha=0.3)

# -- Violin plot --
sns.violinplot(x='source', y='median_coherence', data=df_clean_for_plotting, ax=axes[1], palette=colors,linewidth=1)
axes[1].set_title('Distribution Shapes', loc='left', fontfamily='monospace')
axes[1].grid(True, alpha=0.3)

# -- KDE plot --
sns.kdeplot(data=df_clean_for_plotting, x='median_coherence', hue='source', common_norm=False, fill=True, ax=axes[2],linewidth=1)
axes[2].set_title('Kernel Density Estimations', loc='left', fontfamily='monospace')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()

# Save each panel to its own file. `ax.figure.savefig(...)` always writes the
# whole figure, which made all three PNGs identical; cropping to the tight
# bounding box of one axes (pixels -> inches) exports just that panel.
panel_files = [
    'sliding-window-coherence_boxplot.png',
    'sliding-window-coherence_violinplot.png',
    'sliding-window-coherence_kde.png',
]
fig.canvas.draw()
renderer = fig.canvas.get_renderer()
for ax, filename in zip(axes, panel_files):
    panel_bbox = ax.get_tightbbox(renderer).transformed(fig.dpi_scale_trans.inverted())
    fig.savefig(filename, bbox_inches=panel_bbox.expanded(1.02, 1.05))

plt.show()

Takeaways:
Synthetic reviews exhibit tighter local coherence scores. Neural system outputs score higher on narrow, window-based similarity metrics because they stay on one topic.



In [None]:
# --- Save for later ---
df_analysis.to_csv("df_analysis.csv", index=False)

### POS tag entropy

*Unigram* entropy used to measure grammatical *diversity*  
*Bigram* entropy used to measure grammatical *predictability*

Intuition:
- no difference in unigram entropy between human and synthetic text
- some difference in bi-gram entropy between synthetic text vs human text

In [None]:
# --- Optional: re-load the data ---
df_analysis = pd.read_csv("df_analysis.csv")

In [None]:
!pip install -q spacy
!python -m spacy download en_core_web_sm

import spacy, collections, math
nlp = spacy.load("en_core_web_sm")

In [None]:
# --- Shannon entropy ---
def pos_entropy(text: str) -> float:
    """Unigram POS entropy (bits/token).

    Parses ``text`` with the module-level spaCy pipeline ``nlp``, counts each
    token's ``pos_`` tag and returns the Shannon entropy (log base 2) of the
    tag frequencies. Returns 0.0 when the parse yields no tokens.
    """
    tag_counts = collections.Counter(tok.pos_ for tok in nlp(text))
    n_tokens = sum(tag_counts.values())
    if not n_tokens:
        return 0.0

    shares = [count / n_tokens for count in tag_counts.values()]
    return -sum(share * math.log2(share) for share in shares)

# --- Conditional entropy ---
def pos_bigram_entropy(text: str) -> float:
    """Bigram POS entropy H(Tₙ | Tₙ₋₁).  Returns bits.

    Conditional entropy of a POS tag given the preceding tag, estimated from
    the adjacent tag pairs of ``text`` as parsed by the module-level ``nlp``.
    Returns 0.0 when there are fewer than two tokens (no pair to count).
    """
    tags = [tok.pos_ for tok in nlp(text)]
    n_bigrams = len(tags) - 1
    if n_bigrams < 1:
        return 0.0

    pair_counts = collections.Counter(zip(tags, tags[1:]))
    first_counts = collections.Counter(tags[:-1])   # how often each tag starts a pair

    weighted_log_sum = 0.0
    for (first, _second), n_pair in pair_counts.items():
        # p(first, second) * log2 p(second | first)
        weighted_log_sum += (n_pair / n_bigrams) * math.log2(n_pair / first_counts[first])
    return -weighted_log_sum

In [None]:
# --- Run analysis ---
df_analysis['pos_entropy'] = df_analysis['review'].astype(str).apply(pos_entropy)
df_analysis['pos_bigram_entropy'] = df_analysis['review'].astype(str).apply(pos_bigram_entropy)

In [None]:
from scipy.stats import mannwhitneyu

is_real  = df_analysis['source'] == 'real'
is_synth = df_analysis['source'] == 'synthetic'

real_uni, synth_uni = df_analysis.loc[is_real, 'pos_entropy'], df_analysis.loc[is_synth, 'pos_entropy']
real_bi,  synth_bi  = df_analysis.loc[is_real, 'pos_bigram_entropy'], df_analysis.loc[is_synth, 'pos_bigram_entropy']

# --- Descriptive stats: n, mean and standard deviation per source ---
for heading, real_vals, synth_vals in (
    ("Unigram entropy", real_uni, synth_uni),
    ("Bigram entropy",  real_bi,  synth_bi),
):
    print(heading)
    for label, vals in (("Real      ", real_vals), ("Synthetic ", synth_vals)):
        print(f"  {label}: n={len(vals):3d}, mean={vals.mean():.3f}, sd={vals.std():.3f}")

# --- Two-sided Mann–Whitney, real vs synthetic ---
u_uni, p_uni = mannwhitneyu(real_uni, synth_uni, alternative='two-sided')
u_bi,  p_bi  = mannwhitneyu(real_bi,  synth_bi,  alternative='two-sided')

print("\nTests")
print(f"Unigram entropy: p={p_uni:.3g}")
print(f"Bigram entropy : p={p_bi:.3g}")

def rank_biserial(u_stat, n1, n2):
    """Rank-biserial effect size computed as ``1 - 2*U / (n1 * n2)``.

    ``u_stat`` is a Mann–Whitney U statistic and ``n1``/``n2`` the two sample
    sizes. With this formula the result is 1 for U = 0, 0 for U = n1*n2/2 and
    -1 for U = n1*n2.
    """
    return 1 - (2 * u_stat) / (n1 * n2)

# --- Rank bi-serial correlation (effect sizes) ---
r_uni = rank_biserial(u_uni, len(real_uni), len(synth_uni))
r_bi  = rank_biserial(u_bi,  len(real_bi),  len(synth_bi))
print("\nEffect sizes")
print(f"Rank Biserial (Unigram): {r_uni:.3f}")
print(f"Rank Biserial (Bigram) : {r_bi:.3f}")

In [None]:
# --- Key charts ---
plt.style.use('default')
fig, axes = plt.subplots(1, 2, figsize=(18, 4))
colors = {'real': 'steelblue', 'synthetic': 'orange'}

sns.histplot(data=df_analysis, x='pos_entropy', hue='source', kde=True, bins=20, ax=axes[0], palette=colors,linewidth=0.5)
axes[0].set_title('Unigram POS Entropy Distribution', loc='left', fontfamily='monospace')

sns.histplot(data=df_analysis, x='pos_bigram_entropy', hue='source', kde=True, bins=20, ax=axes[1], palette=colors,linewidth=0.5)
axes[1].set_title('Bigram POS Entropy Distribution', loc='left', fontfamily='monospace')

plt.tight_layout()

# Save each panel to its own file. `ax.figure.savefig(...)` always writes the
# whole figure, which made both PNGs identical; cropping to the tight bounding
# box of one axes (pixels -> inches) exports just that panel.
fig.canvas.draw()
renderer = fig.canvas.get_renderer()
for ax, filename in zip(axes, ['unigram_entropy.png', 'bigram_entropy.png']):
    panel_bbox = ax.get_tightbbox(renderer).transformed(fig.dpi_scale_trans.inverted())
    fig.savefig(filename, bbox_inches=panel_bbox.expanded(1.02, 1.05))

plt.show()

In [None]:
# Save for later
df_analysis.to_csv("df_analysis.csv", index=False)

# 3. Predictive model

In [None]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

In [None]:
# --- Optional: re-load train data ---
df_analysis = pd.read_csv("df_analysis.csv")
df_analysis.info()

In [None]:
# --- Data preparation ---
X = df_analysis[['median_coherence',
                 'coherence_IQR',
                 'pos_entropy',
                 'pos_bigram_entropy']]
y = (df_analysis['source'] == 'synthetic').astype(int)

### Logistic regression

In [None]:
# --- Logistic regression setup ---
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

clf_log = Pipeline([
    ('scale', StandardScaler()),
    ('logit', LogisticRegression(max_iter=5000))
])

y_pred_log = cross_val_predict(clf_log, X, y, cv=cv, method='predict')
print(classification_report(y, y_pred_log, target_names=['real', 'synthetic']))

In [None]:
# --- Log misclassification examples ---
df_analysis['pred_source_log'] = np.where(y_pred_log == 1, 'synthetic', 'real')
false_synthetic = df_analysis[(df_analysis['source'] == 'real') & (df_analysis['pred_source_log'] == 'synthetic')].head(5)
false_real = df_analysis[(df_analysis['source'] == 'synthetic') & (df_analysis['pred_source_log'] == 'real')].head(5)

print("Real → predicted synthetic")
for idx, row in false_synthetic.iterrows():
    print(f"Idx {idx}: {row['review'][:200]}…")

print("\nSynthetic → predicted real")
for idx, row in false_real.iterrows():
    print(f"Idx {idx}: {row['review'][:200]}…")

In [None]:
err_bin = df_analysis[df_analysis['source'] != df_analysis['pred_source_log']]
avg = err_bin.groupby(['word_count_bin_str', 'pred_source_log']).size().unstack(fill_value=0)
print(avg)

# real → the review was synthetic but the model called it real (false-real errors).
# synthetic → the review was real but the model called it synthetic (false-synthetic errors).

In [None]:
err_rating = df_analysis[df_analysis['source'] != df_analysis['pred_source_log']]
avg = err_rating.groupby(['rating', 'pred_source_log']).size().unstack(fill_value=0)
print(avg)

# real → the review was synthetic but the model called it real (false-real errors).
# synthetic → the review was real but the model called it synthetic (false-synthetic errors).

In [None]:
# --- Confusion matrix (cross-validated logistic regression) ---
# Rows are true labels, columns predicted labels; 0 = real, 1 = synthetic.
cm_log = confusion_matrix(y, y_pred_log, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm_log, display_labels=['real', 'synthetic'])

fig, ax = plt.subplots(figsize=(4, 4))
disp.plot(ax=ax, cmap='Blues', colorbar=False)

ax.set_title("Confusion Matrix", fontfamily='monospace', fontsize=10)
ax.set_xlabel("Predicted Label", fontfamily='monospace', fontsize=10)
ax.set_ylabel("True Label", fontfamily='monospace', fontsize=10)

ax.tick_params(labelsize=8)
for label in ax.get_xticklabels() + ax.get_yticklabels():
    label.set_fontfamily('monospace')

# The title is set once, above. Re-titling after saving replaced the styled
# title in the displayed figure only, so notebook output and PNG differed.
fig.savefig('confusion_matrix_log.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# --- ROC curve / AUC ---
y_prob_log = cross_val_predict(clf_log, X, y, cv=cv, method='predict_proba')[:, 1]
fpr, tpr, _ = roc_curve(y, y_prob_log)
auc = roc_auc_score(y, y_prob_log)

print(f"ROC AUC = {auc:.3f}")

plt.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate", fontfamily='monospace', fontsize=10)
plt.ylabel("True Positive Rate", fontfamily='monospace', fontsize=10)
plt.title("ROC curve", fontfamily='monospace', fontsize=12, loc='left')
plt.legend()
plt.show()

### Logistic Regression - Recursive Feature Elimination with Cross-Validation

In [None]:
# --- Logistic regression with RFECV setup ---
clf_log_cv = Pipeline([
    ('scale', StandardScaler()),
    ('clf', LogisticRegressionCV(cv=10, penalty='l2', solver='liblinear'))
])
selector = RFECV(clf_log_cv, cv=5, scoring='accuracy',importance_getter='named_steps.clf.coef_')
selector.fit(X, y)
print("Optimal number of features:", selector.n_features_)
print("Ranking:", selector.ranking_)
print("Selected indices:", selector.support_)

X_reduced = X.iloc[:, selector.support_]
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
clf_log_optimal = Pipeline([
    ('scale', StandardScaler()),
    ('logit', LogisticRegression(max_iter=5000))
])

# --- Get predictions and F1 scores for each fold ---
y_pred_log_optimal = cross_val_predict(clf_log_optimal, X_reduced, y, cv=cv, method='predict')
cv_f1_scores = cross_val_score(clf_log_optimal, X_reduced, y, cv=cv,
                               scoring='f1', # automatically uses pos_label='1' which maps to 'synthetic'
                               )

print(classification_report(y, y_pred_log_optimal, target_names=['real', 'synthetic']))

# --- Feature coefficients ---
clf_log_optimal.fit(X_reduced, y)
coefficients = clf_log_optimal.named_steps['logit'].coef_[0]
feature_names = X_reduced.columns

print("\n=== Feature Coefficients ===")
for name, coef in zip(feature_names, coefficients):
    direction = "increases" if coef > 0 else "decreases"
    print(f"{name}: {coef:.3f} ({direction} synthetic probability)")

# --- Cross-Validation Stability ---
print(f"\n=== Cross-Validation Stability ===")
print(f"F1-scores across folds: {[f'{score:.3f}' for score in cv_f1_scores]}")
print(f"Mean F1: {cv_f1_scores.mean():.3f} ± {cv_f1_scores.std():.3f}")
print(f"Range: {cv_f1_scores.min():.3f} - {cv_f1_scores.max():.3f}")

In [None]:
# --- Confusion matrix (logistic regression on the RFECV-selected features) ---
# Uses the predictions of the reduced-feature model (y_pred_log_optimal);
# y_pred_log belongs to the full-feature model plotted earlier.
cm_log_optimal = confusion_matrix(y, y_pred_log_optimal, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm_log_optimal, display_labels=['real', 'synthetic'])

fig, ax = plt.subplots(figsize=(4, 4))
disp.plot(ax=ax, cmap='Blues', colorbar=False)

ax.set_title("Confusion Matrix", fontfamily='monospace', fontsize=10)
ax.set_xlabel("Predicted Label", fontfamily='monospace', fontsize=10)
ax.set_ylabel("True Label", fontfamily='monospace', fontsize=10)

ax.tick_params(labelsize=8)
for label in ax.get_xticklabels() + ax.get_yticklabels():
    label.set_fontfamily('monospace')

# The title is set once, above. Re-titling after saving replaced the styled
# title in the displayed figure only, so notebook output and PNG differed.
fig.savefig('confusion_matrix_log_optimal.png', dpi=300, bbox_inches='tight')
plt.show()

### Support Vector Classifier

In [None]:
# --- Support Vector Classifier setup ---
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

clf_svc = Pipeline([
    ('scale', StandardScaler()),
    ('svm',  SVC(kernel='rbf', C=1.0, probability=True, random_state=42))
])

y_pred_svc = cross_val_predict(clf_svc, X, y, cv=cv, method='predict')
print(classification_report(y, y_pred_svc, target_names=['real', 'synthetic']))

In [None]:
# --- SVC misclassification examples ---
df_analysis['pred_source_svc'] = np.where(y_pred_svc == 1, 'synthetic', 'real')
false_synthetic = df_analysis[(df_analysis['source'] == 'real') & (df_analysis['pred_source_svc'] == 'synthetic')].head(5)
false_real = df_analysis[(df_analysis['source'] == 'synthetic') & (df_analysis['pred_source_svc'] == 'real')].head(5)

print("Real → predicted synthetic")
for idx, row in false_synthetic.iterrows():
    print(f"Idx {idx}: {row['review'][:200]}…")

print("\nSynthetic → predicted real")
for idx, row in false_real.iterrows():
    print(f"Idx {idx}: {row['review'][:200]}…")

In [None]:
# --- Misclassifications by bucket ---
err = df_analysis[df_analysis['source'] != df_analysis['pred_source_svc']]
avg = err.groupby(['word_count_bin_str', 'pred_source_svc']).size().unstack(fill_value=0)
print(avg)

# real → the review was synthetic but the model called it real (false-real errors).
# synthetic → the review was real but the model called it synthetic (false-synthetic errors).

In [None]:
# --- Confusion matrix (cross-validated SVC) ---
# Rows are true labels, columns predicted labels; 0 = real, 1 = synthetic.
cm_svc = confusion_matrix(y, y_pred_svc, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm_svc, display_labels=['real', 'synthetic'])

fig, ax = plt.subplots(figsize=(4, 4))
disp.plot(ax=ax, cmap='Blues', colorbar=False)

ax.set_title("Confusion Matrix", fontfamily='monospace', fontsize=10)
ax.set_xlabel("Predicted Label", fontfamily='monospace', fontsize=10)
ax.set_ylabel("True Label", fontfamily='monospace', fontsize=10)

ax.tick_params(labelsize=8)
for label in ax.get_xticklabels() + ax.get_yticklabels():
    label.set_fontfamily('monospace')

# The title is set once, above. Re-titling after saving replaced the styled
# title in the displayed figure only, so notebook output and PNG differed.
fig.savefig('confusion_matrix_svc.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# --- Train final model ---
clf_svc.fit(X, y)

### Out of sample prediction

In [None]:
# --- Optional: re-load test data ---
# The file is written by the feature-extraction cell further down, so on a
# first top-to-bottom run it does not exist yet; skip instead of failing.
if os.path.exists("df_analysis_test.csv"):
    df_analysis_test = pd.read_csv("df_analysis_test.csv")
    df_analysis_test.info()
else:
    print("df_analysis_test.csv not found - run the cells below to create it.")

In [None]:
# --- Create a working copy of test dataset ---
df_analysis_test = df_test.copy()

In [None]:
# --- Running this cell requires running cells from Analysis section to activate helper functions

# --- Run sliding-window coherence analysis ---
coherence_results = df_analysis_test['review'].apply(analyze_review_coherence)
df_analysis_test['num_windows'] = coherence_results.apply(lambda x: x['num_windows'])
df_analysis_test['median_coherence'] = coherence_results.apply(lambda x: x['median_coherence'])
df_analysis_test['coherence_IQR'] = coherence_results.apply(lambda x: x['coherence_IQR'])

# --- Run unigram and bigram entropy analysis ---
df_analysis_test['pos_entropy'] = df_analysis_test['review'].astype(str).apply(pos_entropy)
df_analysis_test['pos_bigram_entropy'] = df_analysis_test['review'].astype(str).apply(pos_bigram_entropy)

# --- Save for later ---
df_analysis_test.to_csv("df_analysis_test.csv", index=False)

In [None]:
# --- OPTION 1: Full test set, no downsampling of synthetic records ---
X_test_reduced = df_adjusted_test[['pos_entropy', 'pos_bigram_entropy']] # X_test_reduced refers to reduced set of features
y_test = (df_analysis_test['source'] == 'synthetic').astype(int)
print("X shape", X_test.shape)

In [None]:
# --- OPTION 2: Realistic test set, with downsampling of synthetic records ---
real_rows  = df_analysis_test[df_analysis_test['source'] == 'real']
synth_rows = df_analysis_test[df_analysis_test['source'] == 'synthetic']
# Keep at most 100 synthetic rows (never ask for more than are available)
synth_sub = synth_rows.sample(n=min(100, len(synth_rows)), random_state=42)
df_adjusted_test = pd.concat([real_rows, synth_sub], ignore_index=True)

# Column lists are taken from the training matrices so the test features
# always match what the models were fitted on.
X_test = df_adjusted_test[list(X.columns)]                  # all training features
X_test_reduced = df_adjusted_test[list(X_reduced.columns)]  # RFECV-selected features only
y_test = (df_adjusted_test['source'] == 'synthetic').astype(int)

print(f"X shape ({X_test.shape[1]} features)", X_test.shape)
print(f"X shape ({X_test_reduced.shape[1]} features)", X_test_reduced.shape)

In [None]:
# --- Score the held-out set with the reduced-feature logistic model ---
log_pred = clf_log_optimal.predict(X_test_reduced)
log_report = classification_report(y_test, log_pred, target_names=['real','synthetic'])
log_accuracy = accuracy_score(y_test, log_pred)

# --- Show the evaluation ---
print("Logistic Regression")
print(log_report)
print("Accuracy:", log_accuracy)

# --- Persist the same evaluation as plain text ---
with open('model_evaluation_log_optimal.txt', 'w') as report_file:
    report_file.write(
        "Logistic Regression\n"
        + log_report + "\n"
        + f"Accuracy: {log_accuracy}\n\n"
    )

In [None]:
# --- Threshold analysis on out-of-sample data ---
y_proba_test = clf_log_optimal.predict_proba(X_test_reduced)[:, 1]  # Probability of synthetic (class 1)

print("\n=== Threshold Analysis (Out-of-Sample) ===")
for threshold in [0.3, 0.4, 0.5, 0.6, 0.7]:
    y_pred_thresh = (y_proba_test >= threshold).astype(int)

    cm = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    detection_rate = tp / (tp + fn)  # Recall for synthetic
    false_positive_rate = fp / (fp + tn)  # FPR for real

    print(f"Threshold {threshold}: Detection={detection_rate:.1%}, FPR={false_positive_rate:.1%}")

# --- Confusion matrix for default 0.5 threshold ---
print("\n=== Confusion Matrix (Out-of-Sample) ===")
cm = confusion_matrix(y_test, log_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm,
                     index=['Actual Real (0)', 'Actual Synthetic (1)'],
                     columns=['Predicted Real (0)', 'Predicted Synthetic (1)'])
print(cm_df)

# --- Interpretation ---
print(f"\n=== Concrete Results ===")
print(f"Of {sum(y_test == 0)} human reviews: {cm[0,0]} correctly identified, {cm[0,1]} flagged as bots")
print(f"Of {sum(y_test == 1)} synthetic reviews: {cm[1,1]} caught, {cm[1,0]} slipped through")

In [None]:
# --- Confusion matrix ---
cm_test = confusion_matrix(y_test, log_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm_test, display_labels=['real', 'synthetic'])
fig, ax = plt.subplots(figsize=(4, 4))
disp.plot(ax=ax, cmap='Blues', colorbar=False)
ax.set_title("Out-of-Sample Confusion Matrix", fontfamily='monospace', fontsize=10)
ax.set_xlabel("Predicted Label", fontfamily='monospace', fontsize=10)
ax.set_ylabel("True Label", fontfamily='monospace', fontsize=10)
ax.tick_params(labelsize=8)
for label in ax.get_xticklabels() + ax.get_yticklabels():
    label.set_fontfamily('monospace')

# Clear the default text annotations first
for text in ax.texts:
    text.set_visible(False)

# Add custom annotations with appropriate colors for visibility
for i in range(2):
    for j in range(2):
        count = cm_test[i, j]
        total = cm_test[i].sum()
        percentage = count / total * 100

        # Use white text on dark cells, black text on light cells
        text_color = 'white' if cm_test[i, j] > cm_test.max() / 2 else 'black'

        ax.text(j, i, f'{count}\n({percentage:.1f}%)',
                ha='center', va='center', fontsize=9, fontfamily='monospace',
                color=text_color, weight='bold')

plt.savefig('confusion_matrix_out_of_sample.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# My result: Out of sample prediction finished successfully