In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
def get_features(spectrum_data_file: str, interp_len: int = 500):
    """Load spectra from a CSV file and resample each one onto a common grid.

    The file is read with ``pd.read_csv`` and must provide ``freq`` and
    ``power`` columns. Rows are grouped into individual spectra by the ``sid``
    column; when that column is absent, a new spectrum is started at every
    row whose ``freq`` is lower than the ``freq`` of the row before it.

    Args:
        spectrum_data_file: Path of the CSV file to read.
        interp_len: Number of evenly spaced points between 0 and 0.5
            (inclusive) at which every spectrum is linearly interpolated.

    Returns:
        A 2-D ``np.ndarray`` of shape ``(n_spectra, interp_len)`` with one row
        per ``sid`` group, in ascending ``sid`` order. A file without any data
        rows yields an array of shape ``(0, interp_len)``.
    """
    df = pd.read_csv(spectrum_data_file)
    # If `sid` column does not exist, create it: a drop in `freq` relative to
    # the previous row marks the first row of the next spectrum.
    if 'sid' not in df.columns:
        starts_new_spectrum = df['freq'] < df['freq'].shift(1, fill_value=0)
        df['sid'] = starts_new_spectrum.astype(int).cumsum()

    # The target grid is the same for every spectrum, so build it once.
    new_freq = np.linspace(0, 0.5, interp_len)

    features_interp = []
    for _, group in df.groupby('sid'):
        freqs = group['freq'].to_numpy()
        powers = group['power'].to_numpy()
        # np.interp silently returns meaningless values unless the sample
        # points are increasing; a pre-existing `sid` column gives no such
        # guarantee. The stable sort leaves already ordered input untouched.
        order = np.argsort(freqs, kind='stable')
        features_interp.append(np.interp(new_freq, freqs[order], powers[order]))

    if not features_interp:
        # Keep the result 2-D so callers can still concatenate along axis 0.
        return np.empty((0, interp_len))
    return np.array(features_interp)

### PubMed

In [48]:
# GPT2-xl
# Features from the `pubmed_gpt-4.*.gpt2xl` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/pubmed_gpt-4.original.gpt2xl.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/pubmed_gpt-4.sampled.gpt2xl.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.8        0.78333333 0.8        0.83333333 0.8       ]
0.8033333333333335


In [47]:
# GPT2-lg
# Features from the `pubmed_gpt-4.*.gpt2lg` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/pubmed_gpt-4.original.gpt2lg.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/pubmed_gpt-4.sampled.gpt2lg.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.55       0.53333333 0.45       0.46666667 0.51666667]
0.5033333333333333


In [46]:
# GPT2-md
# Features from the `pubmed_gpt-4.*.gpt2md` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/pubmed_gpt-4.original.gpt2md.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/pubmed_gpt-4.sampled.gpt2md.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.46666667 0.51666667 0.51666667 0.55       0.55      ]
0.5199999999999999


In [31]:
# GPT2
# Features from the `pubmed_gpt-4.*.gpt2` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/pubmed_gpt-4.original.gpt2.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/pubmed_gpt-4.sampled.gpt2.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.5        0.63333333 0.48333333 0.43333333 0.53333333]
0.5166666666666666


In [17]:
# Mistral
# NOTE: this cell reads the `nllzs.fftnorm` files, not the
# `nlllogzs.fftnorm.circlemean` variant used by the other cells. The
# alternative inputs, currently disabled, would be:
#   '../data/gpt-4/pubmed_gpt-4.original.mistral.nlllogzs.fftnorm.circlemean.txt'
#   '../data/gpt-4/pubmed_gpt-4.sampled.mistral.nlllogzs.fftnorm.circlemean.txt'
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/pubmed_gpt-4.original.mistral.nllzs.fftnorm.txt')
x_samp = get_features(
    '../data/gpt-4/pubmed_gpt-4.sampled.mistral.nllzs.fftnorm.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.83333333 0.75       0.76666667 0.76666667 0.76666667]
0.7766666666666666


In [49]:
# Llama
# Features from the `pubmed_gpt-4.*.llama` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/pubmed_gpt-4.original.llama.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/pubmed_gpt-4.sampled.llama.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.81666667 0.75       0.78333333 0.78333333 0.75      ]
0.7766666666666666


In [7]:
# Llama-13b
# Features from the `pubmed_gpt-4.*.llama-13b` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/pubmed_gpt-4.original.llama-13b.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/pubmed_gpt-4.sampled.llama-13b.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.75       0.71666667 0.73333333 0.78333333 0.73333333]
0.7433333333333334


In [51]:
# GPT2-md-pubmed
# Features from the `pubmed_gpt-4.*.gpt2md-pubmed` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/pubmed_gpt-4.original.gpt2md-pubmed.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/pubmed_gpt-4.sampled.gpt2md-pubmed.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.48333333 0.63333333 0.45       0.51666667 0.56666667]
0.53


In [6]:
# GPT2xl, questions shuffled
# Features from the `pubmed_gpt-4.*.questionshuffled.gpt2xl` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/pubmed_gpt-4.original.questionshuffled.gpt2xl.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/pubmed_gpt-4.sampled.questionshuffled.gpt2xl.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.48333333 0.56666667 0.55       0.55       0.46666667]
0.5233333333333334


### Writing

In [40]:
# GPT2-xl
# Features from the `writing_gpt-4.*.gpt2xl` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/writing_gpt-4.original.gpt2xl.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/writing_gpt-4.sampled.gpt2xl.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.66666667 0.61666667 0.7        0.7        0.7       ]
0.6766666666666665


In [42]:
# GPT2-lg
# Features from the `writing_gpt-4.*.gpt2lg` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/writing_gpt-4.original.gpt2lg.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/writing_gpt-4.sampled.gpt2lg.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.66666667 0.51666667 0.63333333 0.6        0.56666667]
0.5966666666666667


In [41]:
# GPT2-md
# Features from the `writing_gpt-4.*.gpt2md` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/writing_gpt-4.original.gpt2md.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/writing_gpt-4.sampled.gpt2md.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.53333333 0.48333333 0.6        0.55       0.61666667]
0.5566666666666668


In [8]:
# GPT2
# Features from the `writing_gpt-4.*.gpt2` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/writing_gpt-4.original.gpt2.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/writing_gpt-4.sampled.gpt2.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.56666667 0.58333333 0.61666667 0.55       0.56666667]
0.5766666666666665


In [9]:
# Mistral
# Features from the `writing_gpt-4.*.mistral` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/writing_gpt-4.original.mistral.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/writing_gpt-4.sampled.mistral.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.65       0.63333333 0.6        0.65       0.61666667]
0.63


In [50]:
# Llama
# Features from the `writing_gpt-4.*.llama` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/writing_gpt-4.original.llama.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/writing_gpt-4.sampled.llama.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.65       0.68333333 0.58333333 0.55       0.6       ]
0.6133333333333334


### Xsum

In [10]:
# GPT2-xl
# Features from the `xsum_gpt-4.*.gpt2xl` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/xsum_gpt-4.original.gpt2xl.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/xsum_gpt-4.sampled.gpt2xl.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.73333333 0.66666667 0.65       0.73333333 0.75      ]
0.7066666666666667


In [11]:
# GPT2
# Features from the `xsum_gpt-4.*.gpt2` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/xsum_gpt-4.original.gpt2.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/xsum_gpt-4.sampled.gpt2.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the cross-validation below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Standardise, keep the 120 features ranked best by SelectKBest, fit an RBF SVM.
model = make_pipeline(
    StandardScaler(),
    SelectKBest(k=120),
    SVC(kernel='rbf', gamma='auto', C=1),
)

# Per-fold scores of a 5-fold cross-validation on all samples, then their mean.
scores = cross_val_score(model, x, y, cv=5)
print(scores)
print(scores.mean())

[0.63333333 0.61666667 0.65       0.61666667 0.63333333]
0.63


In [27]:
# Mistral
# Features from the `xsum_gpt-4.*.mistral` files:
# class 0 = `original` file, class 1 = `sampled` file.
x_orig = get_features(
    '../data/gpt-4/xsum_gpt-4.original.mistral.nlllogzs.fftnorm.circlemean.txt')
x_samp = get_features(
    '../data/gpt-4/xsum_gpt-4.sampled.mistral.nlllogzs.fftnorm.circlemean.txt')
y_orig = np.zeros(len(x_orig))
y_samp = np.ones(len(x_samp))

# Stack both classes into one feature matrix and one label vector.
x = np.concatenate((x_orig, x_samp), axis=0)
y = np.concatenate((y_orig, y_samp), axis=0)

# 80/20 split with a fixed seed; the sweep below does not consume it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Sweep the number of selected features (k = 1 .. 199) and remember the
# setting with the highest mean 5-fold cross-validation score.
best_avg_score = 0
best_k = 1
best_model = None

for k in tqdm(range(1, 200)):
    model = make_pipeline(
        StandardScaler(),
        SelectKBest(k=k),
        SVC(kernel='rbf', gamma='auto', C=1),
    )
    scores = cross_val_score(model, x, y, cv=5)
    avg_score = scores.mean()
    # Strict comparison: when several k tie, the smallest one is kept.
    if avg_score > best_avg_score:
        # NOTE(review): `model` is the object handed to cross_val_score, which
        # presumably fits clones only - confirm `best_model` is fitted before
        # using it for prediction.
        best_k, best_model, best_avg_score = k, model, avg_score

print('Best k:', best_k)
print('Best avg score:', best_avg_score)

100%|██████████| 199/199 [00:05<00:00, 35.88it/s]

Best k: 93
Best avg score: 0.6533333333333333



