In [169]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier



In [None]:
# Data: read the GPT-4 "original" unigram spectrum table and display it.
df_gpt4_orig = pd.read_csv(
    './writing_unigram_mincount=3/writing_gpt-4.original.unigram.fftnorm.txt'
)
df_gpt4_orig

In [None]:
# Flag (as 0/1) every row whose 'freq' is lower than the previous row's
# 'freq' (the first row is compared against 0), then take the running sum of
# the flags so that each run of rows between flags shares one id in 'sid'.
df_gpt4_orig['sdiff'] = (
    df_gpt4_orig['freq']
    .lt(df_gpt4_orig['freq'].shift(1, fill_value=0))
    .astype(int)
)
df_gpt4_orig['sid'] = df_gpt4_orig['sdiff'].cumsum()
df_gpt4_orig

In [None]:
# Collect the 'freq' and 'power' arrays of every 'sid' group, in group order.
by_sid = df_gpt4_orig.groupby('sid')
freqs = [grp['freq'].values for _, grp in by_sid]
features = [grp['power'].values for _, grp in by_sid]

print(len(features))
feature_sizes = list(map(len, features))
# The groups have different lengths, so they must be interpolated onto a
# common grid before they can be stacked into one matrix.
print(feature_sizes[:10])
print(max(feature_sizes), min(feature_sizes), np.mean(feature_sizes))

In [None]:
# Resample every (freq, power) pair onto one shared grid of interp_len
# evenly spaced points between 0 and 0.5.
interp_len = 500
new_freq = np.linspace(0, 0.5, interp_len)
features_interp = [
    np.interp(new_freq, freq_vals, power_vals)
    for freq_vals, power_vals in zip(freqs, features)
]

print(len(features_interp))
# A single value in this set means every resampled row has the same length.
print({len(row) for row in features_interp})

In [188]:
def get_features(spec_file: str, interp_len: int = 500) -> np.ndarray:
    """Load a spectrum file and resample each spectrum onto a common grid.

    The file is read with ``pd.read_csv`` and must provide ``freq`` and
    ``power`` columns. Spectra are stored back to back: a row whose ``freq``
    is lower than the previous row's ``freq`` starts the next spectrum.
    Each spectrum is linearly interpolated with ``np.interp`` onto
    ``interp_len`` evenly spaced frequencies between 0 and 0.5.

    Args:
        spec_file: Path to the CSV-formatted spectrum file.
        interp_len: Number of points in each resampled spectrum.

    Returns:
        Array of shape ``(n_spectra, interp_len)``. A file without data rows
        yields shape ``(0, interp_len)`` so the result can still be
        concatenated with other feature matrices.
    """
    df = pd.read_csv(spec_file)
    if df.empty:
        return np.empty((0, interp_len))

    # A drop in 'freq' marks the first row of a new spectrum. shift(1) leaves
    # NaN in the first row, so that comparison is False and the first row
    # simply opens the first spectrum. The running count of drops is the
    # spectrum id; it is kept as a standalone Series so no helper columns are
    # written into the frame.
    starts_new = df['freq'] < df['freq'].shift(1)
    sid = starts_new.cumsum().rename('sid')

    # The target grid is the same for every spectrum, so build it once.
    new_freq = np.linspace(0, 0.5, interp_len)

    features_interp = [
        np.interp(new_freq, group['freq'].values, group['power'].values)
        for _, group in df.groupby(sid)
    ]
    return np.array(features_interp)

In [215]:
# GPT-4 features: rows from the "original" file get label 0, rows from the
# "sampled" file get label 1. The commented-out calls load the unigram
# variants of the same two files instead of the gpt2xl ones.
# x_gpt4_orig = get_features('./writing_unigram_mincount=3/writing_gpt-4.original.unigram.fftnorm.txt')
x_gpt4_orig = get_features('../data/gpt-4/writing_gpt-4.original.gpt2xl.fftnorm.txt')
y_gpt4_orig = np.zeros(x_gpt4_orig.shape[0])
print(x_gpt4_orig.shape, y_gpt4_orig.shape)

# x_gpt4_samp = get_features('./writing_unigram_mincount=3/writing_gpt-4.sampled.unigram.fftnorm.txt')
# The result must be bound to x_gpt4_samp, the name every line below reads.
# Binding it to any other name raises NameError on a fresh kernel, or silently
# reuses a stale x_gpt4_samp left over from an earlier run.
x_gpt4_samp = get_features('../data/gpt-4/writing_gpt-4.sampled.gpt2xl.fftnorm.txt')
x_gpt4_sampled = x_gpt4_samp  # alias: keeps the longer name available to any cell that uses it
y_gpt4_samp = np.ones(x_gpt4_samp.shape[0])
print(x_gpt4_samp.shape, y_gpt4_samp.shape)

x_gpt4 = np.concatenate([x_gpt4_orig, x_gpt4_samp], axis=0)
y_gpt4 = np.concatenate([y_gpt4_orig, y_gpt4_samp], axis=0)

(150, 500) (150,)
(150, 500) (150,)


In [200]:
# GPT-3.5-turbo features (unigram spectra): label 0 for rows from the
# "original" file, label 1 for rows from the "sampled" file.
x_gpt3_orig = get_features(
    './writing_unigram_mincount=3/writing_gpt-3.5-turbo.original.unigram.fftnorm.txt'
)
y_gpt3_orig = np.zeros(len(x_gpt3_orig))
print(x_gpt3_orig.shape, y_gpt3_orig.shape)

x_gpt3_samp = get_features(
    './writing_unigram_mincount=3/writing_gpt-3.5-turbo.sampled.unigram.fftnorm.txt'
)
y_gpt3_samp = np.ones(len(x_gpt3_samp))
print(x_gpt3_samp.shape, y_gpt3_samp.shape)

# Stack both classes row-wise (concatenate joins along axis 0 by default).
x_gpt3 = np.concatenate((x_gpt3_orig, x_gpt3_samp))
y_gpt3 = np.concatenate((y_gpt3_orig, y_gpt3_samp))

(150, 500) (150,)
(150, 500) (150,)


In [205]:
# davinci features (unigram spectra): label 0 for rows from the "original"
# file, label 1 for rows from the "sampled" file.
x_davinci_orig = get_features(
    './writing_unigram_mincount=3/writing_davinci.original.unigram.fftnorm.txt'
)
y_davinci_orig = np.zeros(len(x_davinci_orig))
print(x_davinci_orig.shape, y_davinci_orig.shape)

x_davinci_samp = get_features(
    './writing_unigram_mincount=3/writing_davinci.sampled.unigram.fftnorm.txt'
)
y_davinci_samp = np.ones(len(x_davinci_samp))
print(x_davinci_samp.shape, y_davinci_samp.shape)

# Stack both classes row-wise (concatenate joins along axis 0 by default).
x_davinci = np.concatenate((x_davinci_orig, x_davinci_samp))
y_davinci = np.concatenate((y_davinci_orig, y_davinci_samp))

(150, 500) (150,)
(150, 500) (150,)


In [217]:
# Hold out 20% of the GPT-4 rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_gpt4,
    y_gpt4,
    test_size=0.2,
    random_state=42,
)
# Alternative: split the davinci data instead.
# x_train, x_test, y_train, y_test = train_test_split(
#     x_davinci, y_davinci, test_size=0.2, random_state=42)

for split_name, x_part, y_part in (
    ('train:', x_train, y_train),
    ('test:', x_test, y_test),
):
    print(split_name, x_part.shape, y_part.shape)

train: (240, 500) (240,)
test: (60, 500) (60,)


In [218]:
# Standardize the features, fit an RBF-kernel SVM on the training split and
# report accuracy on the held-out split.
model = make_pipeline(
    StandardScaler(),
    SVC(C=1, kernel='rbf', gamma='auto'),
).fit(x_train, y_train)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.7666666666666667


In [219]:
# Standardize the features, fit a gradient-boosting classifier on the training
# split and report accuracy on the held-out split.
# random_state is fixed so the reported accuracy is repeatable across kernel
# restarts; 42 matches the seed used for the train/test split.
model = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(random_state=42),
)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.6333333333333333


In [221]:
# Standardize the features, fit a two-hidden-layer MLP (100 and 200 units) on
# the training split and report accuracy on the held-out split.
model = make_pipeline(
    StandardScaler(),
    MLPClassifier(
        hidden_layer_sizes=(100, 200),
        max_iter=1000,
        random_state=1,
    ),
).fit(x_train, y_train)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# Alternative: 5-fold cross-validation over the full GPT-4 set.
# scores = cross_val_score(model, x_gpt4, y_gpt4, cv=5)
# print(scores.mean(), scores.std())

0.6666666666666666
