In [64]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from misc_utils import dataset_filtering
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import textstat
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report, make_scorer)
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from itertools import product

In [3]:
# Root directory that contains the `gutenberg` and `gutenberg_corpus_analysis`
# checkouts. It defaults to the original author's location; set the
# GUTENBERG_GIT_ROOT environment variable to run this notebook on another
# machine without editing the cell.
git_repo_path = os.environ.get('GUTENBERG_GIT_ROOT', '/Users/hunterworssam/Datascience')
gutenberg_repo_path = os.path.join(git_repo_path, 'gutenberg')
gutenberg_analysis_repo = os.path.join(git_repo_path, 'gutenberg_corpus_analysis')

In [4]:
# Load the CSV files
# Read the three pre-built splits from the current working directory.
# Unpacking order matches the file order: train, test, val.
train, test, val = (
    pd.read_csv(csv_name)
    for csv_name in ('final_train.csv', 'final_test.csv', 'final_val.csv')
)

In [5]:
train.shape

(1920, 9)

In [10]:
test.shape

(240, 10)

In [12]:
val.shape

(240, 10)

In [14]:
train.head(5)

Unnamed: 0.1,Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects
0,2439,PG12810,"Uncle Sam's Boys with Pershing's Troops: Or, D...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],78,"{'World War, 1914-1918 -- Juvenile fiction', '..."
1,2446,PG12819,"Dick Prescott's Second Year at West Point: Or,...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],94,{'United States Military Academy -- Juvenile f...
2,25920,PG40605,"The Motor Boat Club at Nantucket; or, The Myst...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],189,"{'Motorboats -- Juvenile fiction', 'Nantucket ..."
3,55435,PG8153,"The Young Engineers in Arizona; or, Laying Tra...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],190,"{'Civil engineers -- Fiction', 'Arizona -- Fic..."
4,32899,PG48863,"The Motor Boat Club off Long Island; or, A Dar...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],85,"{'Motorboats -- Juvenile fiction', 'Long Islan..."


In [16]:
test.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects
0,4,712,PG108,The Return of Sherlock Holmes,"Doyle, Arthur Conan",1859.0,1930.0,['en'],3348,"{'Detective and mystery stories, English', 'Ho..."
1,6,821,PG1101,The Second Part of King Henry the Sixth,"Shakespeare, William",1564.0,1616.0,['en'],131,"{'Historical drama', 'Great Britain -- History..."
2,12,1404,PG11641,Over There: War Scenes on the Western Front,"Bennett, Arnold",1867.0,1931.0,['en'],101,"{'Bennett, Arnold, 1867-1931', 'World War, 191..."
3,14,1525,PG11803,"U.S. Copyright Renewals, 1951 January - June",Library of Congress. Copyright Office,,,['en'],179,{'Copyright -- United States -- Catalogs'}
4,16,1572,PG11846,"U.S. Copyright Renewals, 1972 July - December",Library of Congress. Copyright Office,,,['en'],158,{'Copyright -- United States -- Catalogs'}


In [18]:
# Define a function to apply the word, line and token counts
def enrich_dataframe(df):
    """Attach word, unique-word, line and token counts to ``df``.

    Every count is looked up per value of the ``id`` column through the
    ``dataset_filtering`` helpers, which are pointed at the ``counts``,
    ``text`` and ``tokens`` sub-folders of ``<gutenberg_repo_path>/data``.

    The four columns are written onto ``df`` itself (in place) and the same
    frame is returned so the call can be used in an assignment.
    """
    data_root = os.path.join(gutenberg_repo_path, 'data')
    counts_dir = os.path.join(data_root, 'counts')
    texts_dir = os.path.join(data_root, 'text')
    tokens_dir = os.path.join(data_root, 'tokens')

    # (new column, lookup helper, folder the helper reads from);
    # the tuple order is the column insertion order.
    column_specs = (
        ('word_count', dataset_filtering.get_word_count, counts_dir),
        ('unique_word_count', dataset_filtering.get_unique_word_count, counts_dir),
        ('line_count', dataset_filtering.get_line_count, texts_dir),
        ('token_count', dataset_filtering.get_token_count, tokens_dir),
    )

    for column, lookup, folder in column_specs:
        # Bind lookup/folder as defaults so each lambda keeps its own pair.
        df[column] = df['id'].apply(
            lambda pid, lookup=lookup, folder=folder: lookup(pid, folder)
        )

    return df

Token count, unique word count and line count were not added because the nltk module was not downloaded properly at the time of the download.

In [21]:
# Apply to all datasets
# Enrich every split with the count columns (processed in order: train, val, test).
train, val, test = (enrich_dataframe(split) for split in (train, val, test))

# Report the resulting shapes.
for label, frame in (("Train shape:", train), ("Val shape:", val), ("Test shape:", test)):
    print(label, frame.shape)

Train shape: (1920, 13)
Val shape: (240, 14)
Test shape: (240, 14)


Add text and raw data from gutenberg import to the dataframes.

In [23]:
text_folder = os.path.join(gutenberg_repo_path, 'data', 'text')

# Function to load text for a given Project Gutenberg ID
def load_book_text(pg_id, folder=None):
    """Read ``<pg_id>_text.txt`` and return its content, or ``None`` if absent.

    Parameters
    ----------
    pg_id : str
        Project Gutenberg id used to build the file name ``<pg_id>_text.txt``.
    folder : str or path-like, optional
        Directory to read from. When omitted, the module-level ``text_folder``
        is used (looked up at call time), which is the original behaviour.

    Returns
    -------
    str or None
        The whole file decoded as UTF-8, or ``None`` when the file does not
        exist. Other I/O or decoding errors are not caught.
    """
    if folder is None:
        folder = text_folder
    filepath = os.path.join(folder, f'{pg_id}_text.txt')
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        # A missing book is reported as None so callers can count the gaps.
        return None

# Apply the function to each row
train['text'] = train['id'].apply(load_book_text)
val['text'] = val['id'].apply(load_book_text)
test['text'] = test['id'].apply(load_book_text)

In [24]:
raw_folder = os.path.join(gutenberg_repo_path, 'data', 'raw')

# Function to load text for a given Project Gutenberg ID
def load_book_text_raw(pg_id, folder=None):
    """Read ``<pg_id>_raw.txt`` and return its content, or ``None`` if absent.

    Parameters
    ----------
    pg_id : str
        Project Gutenberg id used to build the file name ``<pg_id>_raw.txt``.
    folder : str or path-like, optional
        Directory to read from. When omitted, the module-level ``raw_folder``
        is used (looked up at call time), which is the original behaviour.

    Returns
    -------
    str or None
        The whole file decoded as UTF-8, or ``None`` when the file does not
        exist. Other I/O or decoding errors are not caught.
    """
    if folder is None:
        folder = raw_folder
    filepath = os.path.join(folder, f'{pg_id}_raw.txt')
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        # A missing book is reported as None so callers can count the gaps.
        return None

# Apply the function to each row
train['raw'] = train['id'].apply(load_book_text_raw)
val['raw'] = val['id'].apply(load_book_text_raw)
test['raw'] = test['id'].apply(load_book_text_raw)

Confirm all rows have text data appended.

In [26]:
print("Missing raw files:", train['raw'].isnull().sum())

Missing raw files: 0


In [27]:
print("Missing text files:", train['text'].isnull().sum())

Missing text files: 0


In [29]:
missing_text_rows = train[train['text'].isnull()]

In [30]:
def remove_first_50_words(text):
    """Drop the first 50 whitespace-separated words of ``text``.

    The remaining words are re-joined with single spaces, so the original
    whitespace (newlines, runs of spaces) is not preserved. Texts with 50
    words or fewer become the empty string. Non-string values (``None``,
    ``NaN``) are handed back unchanged.
    """
    if isinstance(text, str):
        return ' '.join(text.split()[50:])
    return text

# Preprocess text column: drop the first 50 words of every book in each split.
# NOTE: 'text' is overwritten with a value derived from itself, so this cell is
# not idempotent — each re-run strips a further 50 words. Re-run the cell that
# loads 'text' before re-running this one.
train['text'] = train['text'].apply(remove_first_50_words)
val['text'] = val['text'].apply(remove_first_50_words)
test['text'] = test['text'].apply(remove_first_50_words)

#### Texts have been loaded in; now we can begin TF-IDF vectorization.

In [33]:
vectorizer = TfidfVectorizer(
    stop_words='english', # Removes common English words ("it", "and", "that", "is", ...) using scikit-learn's predefined English stop-word list.
    sublinear_tf=True, # Uses logarithmic term-frequency weighting, reducing the weight of extremely frequent terms and helping prevent domination by larger text files.
    max_features=15000, # Caps the vocabulary size; a trade-off between overfitting and computational requirements.
    ngram_range=(1,3)
)
# TODO: try iterating through max_features of 1000, 5000, 10000 and 20000 (fewer features could reduce overfitting).
# Rough max_features ranges to try per ngram_range:
#   (1, 1): 1,000 - 10,000
#   (1, 2): 5,000 - 20,000
#   (2, 3): 5,000 - 15,000 (use SVD)

In [34]:
X_train = vectorizer.fit_transform(train['text'])
X_val = vectorizer.transform(val['text'])
X_test = vectorizer.transform(test['text'])

Create a separate dataframe for the TF-IDF vectorization output.

In [36]:
X_train.shape

(1920, 15000)

In [37]:
feature_names = vectorizer.get_feature_names_out()
print(feature_names[:10])

['00' '000' '000 000' '10' '10 min' '10 min sd' '100' '101' '102' '103']


Add stylometric features to the TF-IDF feature array. Twelve features are added here, all related to punctuation frequency and sentence/word length averages.

In [43]:
# Function to extract stylometric features from a single text
def extract_stylometric_features(text):
    """Compute 12 stylometric features for a single text.

    Parameters
    ----------
    text : str
        Book text. Non-string or blank input yields an all-zero vector.

    Returns
    -------
    list
        Twelve numbers, in the order of ``stylo_feature_names``:

        1. average length of alphabetic tokens,
        2. average tokens per sentence,
        3. type-token ratio,
        4-10. frequency of each of ``. , ! ? ; : -`` per character of text,
        11. share of uppercase characters,
        12. share of digit characters.

    Notes
    -----
    The type-token ratio divides the number of distinct lower-cased
    *alphabetic* tokens by the number of *all* tokens (punctuation tokens
    included), so it is lower than a ratio computed over words only.
    NOTE(review): confirm whether that denominator is intended.
    """
    if not isinstance(text, str) or not text.strip():
        return [0] * 12  # one zero per feature returned below

    # Use NLTK tokenizers with explicit language call
    words = word_tokenize(text, language='english', preserve_line=True)
    sentences = sent_tokenize(text, language='english')

    # Filter alphabetic tokens once; reused for word length and vocabulary.
    alpha_words = [w for w in words if w.isalpha()]
    total_words = len(words)
    total_sentences = len(sentences)
    total_chars = len(text)  # > 0 here: blank text returned early above

    unique_words = len({w.lower() for w in alpha_words})

    avg_word_length = np.mean([len(w) for w in alpha_words]) if alpha_words else 0
    avg_sentence_length = total_words / total_sentences if total_sentences else 0
    type_token_ratio = unique_words / total_words if total_words else 0

    punctuation_marks = ".,!?;:-"
    punctuation_counts = Counter(c for c in text if c in punctuation_marks)
    punctuation_freqs = [punctuation_counts[p] / total_chars for p in punctuation_marks]

    uppercase_ratio = sum(1 for c in text if c.isupper()) / total_chars
    digit_ratio = sum(1 for c in text if c.isdigit()) / total_chars

    return [
        avg_word_length,
        avg_sentence_length,
        type_token_ratio,
        *punctuation_freqs,
        uppercase_ratio,
        digit_ratio
    ]

# List of feature names
stylo_feature_names = [
    "avg_word_length",
    "avg_sentence_length",
    "type_token_ratio",
    "period_freq", "comma_freq", "exclam_freq",
    "question_freq", "semicolon_freq", "colon_freq", "dash_freq",
    "uppercase_ratio", "digit_ratio"
]

# Apply to your dataset
def get_stylometric_features(df):
    """Build the stylometric feature table for every row of ``df``.

    Runs ``extract_stylometric_features`` on each value of ``df['text']`` and
    returns a new DataFrame with one row per input row and one column per
    entry of ``stylo_feature_names``. The result has a fresh 0..n-1 index,
    not the index of ``df``.
    """
    per_book_rows = df['text'].map(extract_stylometric_features).tolist()
    stylo_table = pd.DataFrame(per_book_rows, columns=stylo_feature_names)
    return stylo_table

In [44]:
# For train set
train_stylo = get_stylometric_features(train)

# For val and test
val_stylo = get_stylometric_features(val)
test_stylo = get_stylometric_features(test)

# Preview
print(train_stylo.head())

   avg_word_length  avg_sentence_length  type_token_ratio  period_freq  \
0         4.250333            15.664977          0.078897     0.010975   
1         4.248404            14.571429          0.081391     0.013303   
2         4.066894            20.021214          0.072704     0.012683   
3         4.139240            20.450896          0.075695     0.012977   
4         4.125742            21.993048          0.069692     0.012717   

   comma_freq  exclam_freq  question_freq  semicolon_freq  colon_freq  \
0    0.012713     0.001262       0.001098        0.000138    0.000208   
1    0.014442     0.001212       0.001377        0.000256    0.000176   
2    0.013631     0.001044       0.001168        0.000244    0.000173   
3    0.013200     0.001576       0.001345        0.000166    0.000083   
4    0.015411     0.000932       0.001211        0.000102    0.000185   

   dash_freq  uppercase_ratio  digit_ratio  
0   0.002203         0.027425     0.000093  
1   0.002391         0.030

Add common readability quantification metrics to the dataset. Our training set will now be 1920 x (TF-IDF value + 12 + 6)

In [46]:
def readability_metrics(text):
    """Compute six textstat readability scores for a single text.

    Parameters
    ----------
    text : str
        Book text. Non-string or blank input yields an all-zero vector, the
        same convention ``extract_stylometric_features`` uses, so that the two
        feature extractors accept the same inputs.

    Returns
    -------
    list
        Six numbers, in the order of ``readability_feature_names``: Flesch
        reading ease, Flesch-Kincaid grade, Gunning fog, SMOG index,
        Coleman-Liau index and automated readability index.
    """
    if not isinstance(text, str) or not text.strip():
        return [0] * 6  # one zero per score returned below

    return [
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
        textstat.gunning_fog(text),
        textstat.smog_index(text),
        textstat.coleman_liau_index(text),
        textstat.automated_readability_index(text)
    ]

readability_feature_names = [
    'flesch_reading_ease',
    'flesch_kincaid_grade',
    'gunning_fog',
    'smog_index',
    'coleman_liau_index',
    'automated_readability_index'
]

# Apply to a DataFrame
def get_readability_features(df):
    """Build the readability feature table for every row of ``df``.

    Runs ``readability_metrics`` on each value of ``df['text']`` and returns a
    new DataFrame with one row per input row and one column per entry of
    ``readability_feature_names``. The result has a fresh 0..n-1 index, not
    the index of ``df``.
    """
    per_book_scores = df['text'].map(readability_metrics).tolist()
    readability_table = pd.DataFrame(per_book_scores, columns=readability_feature_names)
    return readability_table

In [47]:
# Get readability feature matrices
train_readability = get_readability_features(train)
val_readability = get_readability_features(val)
test_readability = get_readability_features(test)

In [48]:
train_readability.head()

Unnamed: 0,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,coleman_liau_index,automated_readability_index
0,73.78,6.5,6.68,9.1,7.71,7.7
1,75.4,5.9,6.2,8.6,7.29,6.9
2,74.9,6.1,6.16,8.4,6.77,6.7
3,75.4,5.9,6.0,8.6,6.83,6.6
4,74.59,6.2,6.24,8.3,7.3,7.1


In [49]:
# Horizontally concatenate
train_extra = pd.concat([train_stylo, train_readability], axis=1)
val_extra = pd.concat([val_stylo, val_readability], axis=1)
test_extra = pd.concat([test_stylo, test_readability], axis=1)

### Features with larger numeric ranges dominate the distance calculation within kNN; therefore, we must scale our 18 additional features.

In [51]:
# Scale only the stylometric + readability features
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_extra)  # shape: (1920, 18)
val_scaled = scaler.transform(val_extra)
test_scaled = scaler.transform(test_extra)

### Way more features than observations. Let's implement truncated SVD, which is a principal component analysis for sparse matrices. We only want to implement this on the TF-IDF features since they are sparse. Note that this step was omitted for both NB and RF modeling.

In [53]:
# Iterate here through various numbers of components (100 - 1000)
# Reduce TF-IDF part only
svd = TruncatedSVD(n_components=300, random_state=42)
X_train_reduced = svd.fit_transform(X_train)
X_val_reduced   = svd.transform(X_val)
X_test_reduced  = svd.transform(X_test)

In [54]:
# Truncated SVD reduces dimensionality in a continuous, information-preserving way.
# It projects the 15,000-dimensional TF-IDF space onto (say) 300 components based on the variance and structure in the data.
# It handles sparsity and correlated features automatically, and works well for distance-based or margin-based models such as
# k-Nearest Neighbors (kNN), Logistic Regression and SVM; Random Forest can still benefit from the lower noise.

In [55]:
# Convert extra features to sparse format
train_extra_sparse = csr_matrix(train_scaled)
val_extra_sparse = csr_matrix(val_scaled)
test_extra_sparse = csr_matrix(test_scaled)

X_train_reduced = csr_matrix(X_train_reduced)
X_val_reduced   = csr_matrix(X_val_reduced)
X_test_reduced  = csr_matrix(X_test_reduced)

# Stack with TF-IDF matrices
X_train_combined = hstack([X_train_reduced, train_extra_sparse])
X_val_combined   = hstack([X_val_reduced, val_extra_sparse])
X_test_combined  = hstack([X_test_reduced, test_extra_sparse])

In [56]:
X_train_combined.shape

(1920, 318)

In [57]:
# Define labels from original dataset
y_train = train['author']
y_val = val['author']
y_test = test['author']

In [58]:
def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and report validation metrics.

    The model is fitted in place, used to predict ``X_val``, and accuracy plus
    weighted F1 / precision / recall are printed (``zero_division=0`` keeps
    classes without predictions from raising warnings).

    Returns
    -------
    tuple
        ``(model, y_pred)`` — the fitted model and its validation predictions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Shared options for the per-class metrics that need averaging.
    weighted_opts = {'average': 'weighted', 'zero_division': 0}
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, **weighted_opts)
    precision = precision_score(y_val, y_pred, **weighted_opts)
    recall = recall_score(y_val, y_pred, **weighted_opts)

    report_lines = (
        f"Model: {model.__class__.__name__}",
        f"Accuracy:  {acc:.4f}",
        f"F1 Score:  {f1:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall:    {recall:.4f}",
        "-" * 40,
    )
    for line in report_lines:
        print(line)

    return model, y_pred

In [59]:
knn = KNeighborsClassifier(
    n_neighbors=5,        # or try others: 3, 7, 10
    metric='euclidean',   # or 'manhattan', 'cosine'
    weights='uniform'     # or 'distance'
)

In [60]:
knn_model, y_pred_knn = evaluate_model(
    model=knn,
    X_train=X_train_combined,
    y_train=y_train,
    X_val=X_val_combined,
    y_val=y_val
)

Model: KNeighborsClassifier
Accuracy:  0.9167
F1 Score:  0.9100
Precision: 0.9273
Recall:    0.9167
----------------------------------------


#### Note the vast number of tuning possibilities: TF-IDF settings, SVD settings, +/- stylometric features and kNN settings.

In [66]:
# ----- Define Parameters -----
svd_components = [100, 300, 500, 800]  # Try different SVD reductions
k_values = [3, 5, 7, 10]
metrics = ['euclidean', 'cosine']
weights = ['uniform', 'distance']

# ----- Create parameter combinations -----
# Full grid, in the same order the nested loops below visit it.
param_combos = list(product(svd_components, k_values, metrics, weights))

results = []

# ----- Loop over combinations -----
# NOTE: this search uses the SVD-reduced TF-IDF features only; the scaled
# stylometric/readability features are not stacked in here.
for n_comp in svd_components:
    # Step 1: Apply SVD (on TF-IDF part only).
    # The projection depends only on n_comp, so it is fitted once per value
    # instead of once per (k, metric, weights) combination. The test split is
    # deliberately not transformed: it plays no part in model selection.
    # Loop-local names are used so the X_*_reduced matrices built in other
    # cells are not overwritten as a side effect of the search.
    svd = TruncatedSVD(n_components=n_comp, random_state=42)
    X_train_svd = svd.fit_transform(X_train)
    X_val_svd = svd.transform(X_val)

    for k, dist_metric, wt in product(k_values, metrics, weights):
        # Step 2: Build and train kNN
        knn = KNeighborsClassifier(
            n_neighbors=k,
            metric=dist_metric,
            weights=wt
        )
        knn.fit(X_train_svd, y_train)
        y_val_pred = knn.predict(X_val_svd)

        # Step 3: Evaluate on the validation split
        acc = accuracy_score(y_val, y_val_pred)
        f1 = f1_score(y_val, y_val_pred, average='weighted', zero_division=0)
        precision = precision_score(y_val, y_val_pred, average='weighted', zero_division=0)
        recall = recall_score(y_val, y_val_pred, average='weighted', zero_division=0)

        # Step 4: Save results
        results.append({
            'svd_components': n_comp,
            'k_neighbors': k,
            'distance_metric': dist_metric,
            'weights': wt,
            'accuracy': acc,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

# ----- Export to CSV -----
# Best configurations first (sorted by weighted F1 on the validation split).
df_knn_results = pd.DataFrame(results)
df_knn_results = df_knn_results.sort_values(by='f1', ascending=False)
df_knn_results.to_csv('knn_results.csv', index=False)

#### Now we can implement the top performing hyperparameter settings.

In [69]:
# 1. Apply Truncated SVD to TF-IDF matrices
#    (fitted on the training split only; val/test are projected with the same
#    fitted transform). Only the TF-IDF features are used here — the scaled
#    stylometric/readability features are not stacked in.
svd = TruncatedSVD(n_components=300, random_state=42)
X_train_reduced = svd.fit_transform(X_train)
X_val_reduced   = svd.transform(X_val)
X_test_reduced  = svd.transform(X_test)

# 2. Instantiate kNN with the chosen hyperparameters
#    NOTE(review): presumably the top row of knn_results.csv — confirm.
knn = KNeighborsClassifier(
    n_neighbors=3,
    metric='cosine',
    weights='distance'
)

# 3. Fit model
knn.fit(X_train_reduced, y_train)

# 4. Predict for each split
#    Predicting on the data the model was fitted on with weights='distance'
#    lets every training sample match itself at distance 0, so near-perfect
#    train metrics are expected and say nothing about generalisation.
y_train_pred = knn.predict(X_train_reduced)
y_val_pred   = knn.predict(X_val_reduced)
y_test_pred  = knn.predict(X_test_reduced)

# 5. Define evaluation function
def print_metrics(name, y_true, y_pred):
    """Print accuracy and weighted F1 / precision / recall for one split.

    ``name`` labels the split in the header line. The averaged metrics use
    ``average='weighted'`` and ``zero_division=0``. Nothing is returned.
    """
    weighted_opts = dict(average='weighted', zero_division=0)

    print(f"📊 {name} Set Metrics:")

    acc = accuracy_score(y_true, y_pred)
    print(f"Accuracy:  {acc:.4f}")

    f1 = f1_score(y_true, y_pred, **weighted_opts)
    print(f"F1 Score:  {f1:.4f}")

    precision = precision_score(y_true, y_pred, **weighted_opts)
    print(f"Precision: {precision:.4f}")

    recall = recall_score(y_true, y_pred, **weighted_opts)
    print(f"Recall:    {recall:.4f}")

    print("-" * 40)

# 6. Print metrics
print_metrics("Train", y_train, y_train_pred)
print_metrics("Validation", y_val, y_val_pred)
print_metrics("Test", y_test, y_test_pred)

📊 Train Set Metrics:
Accuracy:  1.0000
F1 Score:  1.0000
Precision: 1.0000
Recall:    1.0000
----------------------------------------
📊 Validation Set Metrics:
Accuracy:  0.9500
F1 Score:  0.9438
Precision: 0.9479
Recall:    0.9500
----------------------------------------
📊 Test Set Metrics:
Accuracy:  0.9375
F1 Score:  0.9344
Precision: 0.9565
Recall:    0.9375
----------------------------------------
