In [1]:
import pandas as pd
import os
from pathlib import Path
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [2]:
# Getting list of 20 different clusters of text

import random

df = pd.read_csv("master_stats.csv")
master_text_list = df['ID'].tolist()
prefixes = [name.replace('.txt', '') for name in master_text_list]

# Getting list of all 500 token chunk filenames

chunks_dir = Path('corpus_chunks')

chunk_names = [f for f in os.listdir(chunks_dir) if os.path.isfile(os.path.join(chunks_dir, f))]


files_by_prefix = {prefix: [] for prefix in prefixes}

# Iterate over all files in the directory and group them by prefix
for file_path in chunks_dir.iterdir():
    if file_path.is_file():
        # Extract the prefix (the part before the last underscore)
        file_name = file_path.name
        parts = file_name.rsplit('_', 1)  # Split from the right at the last underscore
        file_prefix = parts[0] if len(parts) > 1 else file_name  # Get the part before the last underscore

        # If the prefix is one of the predefined prefixes, add the file to the list
        if file_prefix in files_by_prefix:
            files_by_prefix[file_prefix].append(file_path)

## Initialize a list to store the sampled file names
sampled_files_list = []

# Randomly sample 100 files from each prefix group
for prefix, files in files_by_prefix.items():
    # Check if there are at least 100 files to sample
    if len(files) > 100:
        sampled = random.sample(files, 100)
    else:
        sampled = files  # If fewer than 100 files, sample all of them

    # Add the filenames of the sampled files to the list
    sampled_files_list.extend(file.name for file in sampled)


chunks_df = pd.read_csv("master_features_chunks.csv")
chunks_df['author'] = chunks_df['ID'].apply(lambda x: x.split('_')[0].lower())

sample_df = chunks_df.loc[chunks_df['ID'].isin(sampled_files_list)]
sample_df = sample_df.set_index('ID')
sample_df.head()

Unnamed: 0_level_0,nation,gender,category,mean_sen_len,male_pronouns,female_pronouns,TTR,lex_density,VADER_sentiment,concreteness,...,again,other,must,after,go,might,too,through,himself,author
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dickens_3787.txt,British/Irish,male,authentic,29.411765,0.013921,0.0,0.4942,0.535963,0.9962,2.506728,...,0.00232,0.0,0.0,0.0,0.0,0.0,0.00232,0.0,0.006961,dickens
Bronte_119.txt,British/Irish,female,authentic,20.833333,0.004988,0.0,0.633416,0.59601,0.8328,2.606356,...,0.0,0.0,0.0,0.0,0.002494,0.002494,0.0,0.002494,0.002494,bronte
Austen_424.txt,British/Irish,female,authentic,22.727273,0.02445,0.022005,0.418093,0.586797,0.935,2.560488,...,0.0,0.0,0.00489,0.002445,0.007335,0.0,0.002445,0.0,0.0,austen
Alcott_42.txt,American,female,authentic,20.08,0.021792,0.0,0.51816,0.539952,0.9897,2.559505,...,0.0,0.0,0.002421,0.0,0.004843,0.0,0.0,0.0,0.0,alcott
DICKENS_synthetic_combined_146.txt,British/Irish,male,synthetic,23.809524,0.018141,0.0,0.535147,0.562358,0.5106,2.622072,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006803,0.002268,dickens


In [3]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, Dickens_3787.txt to GASKELL_synthetic_combined_182.txt
Columns: 211 entries, nation to author
dtypes: float64(205), int64(2), object(4)
memory usage: 3.2+ MB


In [162]:
sample_df.columns.values

array(['nation', 'gender', 'category', 'mean_sen_len', 'male_pronouns',
       'female_pronouns', 'TTR', 'lex_density', 'VADER_sentiment',
       'concreteness', 'said', 'mr', 'little', 'time', 'like', 'know',
       'man', 'old', 'hand', 'come', 'miss', 'day', 'good', 'eye',
       'thought', 'way', 'think', 'face', 'sir', 'great', 'came', 'thing',
       'long', 'heart', 'away', 'young', 'went', 'look', 'word', 'lady',
       'life', 'dear', 'head', 'room', 'house', 'looked', 'night', 'mind',
       'shall', 'friend', 'tell', 'place', 'woman', 'child', 'took',
       'door', 'let', 'found', 'mother', 'home', 'got', 'gentleman',
       'father', 'saw', 'better', 'love', 'don', 'going', 'knew', 'boy',
       'people', 'right', 'hope', 'moment', 'year', 'world', 'voice',
       'left', 'poor', 'looking', 'asked', 'girl', 'felt', 'sat', 'new',
       'air', 'oh', 'round', 'want', 'having', 'soon', 'heard', 'mean',
       'stood', 'find', 'light', 'men', 'yes', 'told', 'hour', 'street',
 

In [163]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, Twain_2265.txt to ALCOTT_synthetic_combined_192.txt
Columns: 211 entries, nation to author
dtypes: float64(205), int64(2), object(4)
memory usage: 3.2+ MB


In [174]:
X = sample_df.drop(columns=['nation', 'gender', 'category', 'author'])  # Drop the label column to get the features

#Below testing how much the classifier degrades when only using mean_sentence_length as a feature
#X = sample_df.drop(columns=['nation', 'gender', 'category', 'author', 'male_pronouns','female_pronouns', 'TTR', 'lex_density',
       'concreteness', 'said', 'mr', 'little', 'time', 'like', 'know',
       'man', 'old', 'hand', 'come', 'miss', 'day', 'good', 'eye',
       'thought', 'way', 'think', 'face', 'sir', 'great', 'came', 'thing',
       'long', 'heart', 'away', 'young', 'went', 'look', 'word', 'lady',
       'life', 'dear', 'head', 'room', 'house', 'looked', 'night', 'mind',
       'shall', 'friend', 'tell', 'place', 'woman', 'child', 'took',
       'door', 'let', 'found', 'mother', 'home', 'got', 'gentleman',
       'father', 'saw', 'better', 'love', 'don', 'going', 'knew', 'boy',
       'people', 'right', 'hope', 'moment', 'year', 'world', 'voice',
       'left', 'poor', 'looking', 'asked', 'girl', 'felt', 'sat', 'new',
       'air', 'oh', 'round', 'want', 'having', 'soon', 'heard', 'mean',
       'stood', 'find', 'light', 'men', 'yes', 'told', 'hour', 'street',
       'seen', 'sure', 'morning', 'returned', 'manner', 'cried', 'kind',
       'replied', 'work', 'the', 'and', 'of', 'to', 'a', 'i', 'in',
       'that', 'was', 'it', 'he', 'her', 'his', 'you', 'with', 'as',
       'for', 'she', 'had', 'not', 'but', 'at', 'on', 'be', 'is', 'my',
       'him', 'have', 'me', 'so', 'all', 'by', 'this', 'which', 'they',
       'were', 'if', 'from', 'there', 'no', 'would', 'when', "'s", 'one',
       'or', 'an', 'do', 'what', 'we', 'been', 'could', 'up', 'out',
       'very', 'their', 'who', 'them', 'are', 'now', 'more', 'will',
       'your', 'into', 'upon', 'then', 'some', 'did', 'any', 'about',
       'than', 'can', 'down', "n't", 'much', 'such', 'see', 'before',
       'never', 'where', 'well', 'over', 'how', 'am', 'only', 'should',
       'made', 'say', 'its', 'has', 'own', 'here', 'again', 'other',
       'must', 'after', 'go', 'might', 'too', 'through', 'himself'])
y = sample_df['category']  # Target variable
# If 'category' is a string, convert it to numerical labels
y, original_categories = pd.factorize(y)



In [175]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Nearest Centroid model with shrinkage
model = NearestCentroid(shrink_threshold=0.1)  # Adjust shrink_threshold as needed

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

# Print the classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Save precision, recall, and f1 scores as variables
print(f'Weighted Precision: {precision}')
print(f'Weighted Recall: {recall}')
print(f'Weighted F1 Score: {f1}')

report = classification_report(y_test, y_pred)

Accuracy: 0.7175
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.75      0.71       188
           1       0.76      0.69      0.72       212

    accuracy                           0.72       400
   macro avg       0.72      0.72      0.72       400
weighted avg       0.72      0.72      0.72       400

Weighted Precision: 0.7210775700232784
Weighted Recall: 0.7175
Weighted F1 Score: 0.7176677605875917


In [176]:
type(report)

str

In [1]:
# Print the test results in the form 'index, category'
test_results = pd.DataFrame({'index': X_test.index, 'predicted_category': y_pred})


NameError: name 'pd' is not defined

In [178]:
# Convert y_test and y_pred back to the original string categories
y_test_original = pd.Series(original_categories[y_test], index=X_test.index)
y_pred_original = pd.Series(original_categories[y_pred], index=X_test.index)

# Create a DataFrame with the actual and predicted categories
y_test_df = pd.DataFrame({
    'actual_category': y_test_original,
    'predicted_category': y_pred_original
})

# Find the incorrectly labeled rows
incorrectly_labeled = y_test_df[y_test_df['actual_category'] != y_test_df['predicted_category']]
incorrectly_labeled = incorrectly_labeled.sort_values("actual_category")

# Print the incorrectly labeled rows with ID, actual category, and predicted category
print("Incorrectly labeled samples (with ID, actual category, and predicted category):")
print(incorrectly_labeled)



Incorrectly labeled samples (with ID, actual category, and predicted category):
                                   actual_category predicted_category
ID                                                                   
Alcott_145.txt                           authentic          synthetic
Bronte_1040.txt                          authentic          synthetic
Twain_267.txt                            authentic          synthetic
Griggs_320.txt                           authentic          synthetic
Hopkins_170.txt                          authentic          synthetic
...                                            ...                ...
GRIGGS_synthetic_combined_137.txt        synthetic          authentic
BRONTE_synthetic_combined_159.txt        synthetic          authentic
STOKER_synthetic_combined_20.txt         synthetic          authentic
DICKENS_synthetic_combined_119.txt       synthetic          authentic
DICKENS_synthetic_combined_52.txt        synthetic          authentic

[113 rows

In [179]:
authentic_mislabeled = []
synthetic_mislabeled = []

for index, row in incorrectly_labeled.iterrows():
    if row["actual_category"] == "authentic":
        authentic_mislabeled.append(index)
    else:
        synthetic_mislabeled.append(index)

print(authentic_mislabeled)
print(synthetic_mislabeled)
        

['Alcott_145.txt', 'Bronte_1040.txt', 'Twain_267.txt', 'Griggs_320.txt', 'Hopkins_170.txt', 'Austen_1359.txt', 'Griggs_321.txt', 'Alcott_1286.txt', 'Chesnutt_302.txt', 'Alcott_1322.txt', 'Austen_1276.txt', 'Austen_907.txt', 'Chesnutt_587.txt', 'Gaskell_1811.txt', 'Gaskell_1388.txt', 'Twain_205.txt', 'Alcott_905.txt', 'Twain_739.txt', 'Gaskell_584.txt', 'Alcott_1863.txt', 'Dickens_3675.txt', 'Bronte_241.txt', 'Gaskell_2630.txt', 'Chesnutt_445.txt', 'Griggs_96.txt', 'Austen_1028.txt', 'Gaskell_445.txt', 'Dickens_8516.txt', 'Austen_433.txt', 'Twain_332.txt', 'Austen_663.txt', 'Austen_341.txt', 'Hopkins_51.txt', 'Austen_1475.txt', 'Hopkins_35.txt', 'Dickens_2821.txt', 'Dickens_3904.txt', 'Stoker_809.txt', 'Bronte_1350.txt', 'Alcott_1978.txt', 'Austen_563.txt', 'Dickens_3550.txt', 'Chesnutt_609.txt', 'Twain_614.txt', 'Chesnutt_122.txt', 'Bronte_1312.txt', 'Dickens_2975.txt']
['GASKELL_synthetic_combined_126.txt', 'CHESNUTT_synthetic_combined_79.txt', 'ALCOTT_synthetic_combined_78.txt', 'AUS

In [180]:
import numpy as np

centroids = model.centroids_

# Calculate the importance of each feature as the mean absolute difference between centroids
# For binary classification, this is simply the difference between the two centroids
if centroids.shape[0] == 2:  # Binary classification
    feature_importance = np.abs(centroids[0] - centroids[1])
else:
    # For multi-class, calculate the mean absolute difference between all pairs of centroids
    feature_importance = np.mean(np.abs(centroids[:, np.newaxis] - centroids[np.newaxis, :]), axis=(0, 1))

# Create a DataFrame to hold the feature names and their corresponding importance scores
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': feature_importance
})

# Sort features by importance
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

# Print the feature importance
print("Feature Importance:")
print(feature_importance_df)

Feature Importance:
           feature  importance
0     mean_sen_len    4.565209
1  VADER_sentiment    0.087777


In [181]:
top_10_df = feature_importance_df[:10]
top_10_df.head(10)

top_10_features = dict(zip(top_10_df['feature'], top_10_df['importance']))

print(top_10_features)

{'mean_sen_len': 4.565209352467527, 'VADER_sentiment': 0.08777749548464464}


In [154]:
results_df = pd.DataFrame(columns=['samples', 'accuracy(ave_F1)', 'ave_precision', 'ave_recall', 'authentic_mislabeled', 'synthetic_mislabeled', 'top_10_features'])



In [155]:
results_df.head()

Unnamed: 0,samples,accuracy(ave_F1),ave_precision,ave_recall,authentic_mislabeled,synthetic_mislabeled,top_10_features


In [156]:
results_df.at[0, 'samples'] = sampled_files_list
results_df.at[0, 'accuracy(ave_F1)'] = f1
results_df.at[0, 'ave_precision'] = precision
results_df.at[0, 'ave_recall'] = recall
results_df.at[0, 'authentic_mislabeled'] = authentic_mislabeled
results_df.at[0, 'synthetic_mislabeled'] = synthetic_mislabeled
results_df.at[0, 'top_10_features'] = top_10_features

results_df.head()

Unnamed: 0,samples,accuracy(ave_F1),ave_precision,ave_recall,authentic_mislabeled,synthetic_mislabeled,top_10_features
0,"[ALCOTT_synthetic_combined_132.txt, ALCOTT_syn...",0.715345,0.721987,0.7175,"[Twain_740.txt, Twain_1485.txt, Twain_2581.txt...","[ALCOTT_synthetic_combined_23.txt, HOPKINS_syn...","{'mean_sen_len': 4.857105858499239, 'VADER_sen..."


In [157]:
#Create a new csv and print to first line

#results_df.to_csv('NSC_data.csv', index=False)

In [158]:
# Add the EXISTING CSV

results_df.to_csv('NSC_data.csv', mode='a', header=False, index=False)

In [9]:
a = pd.read_csv("NSC_data.csv")
a.tail()

Unnamed: 0,samples,accuracy(ave_F1),ave_precision,ave_recall,authentic_mislabeled,synthetic_mislabeled,top_10_features
96,"['ALCOTT_synthetic_combined_133.txt', 'ALCOTT_...",0.665,0.665,0.665,"['Twain_2136.txt', 'Alcott_602.txt', 'Dickens_...","['DICKENS_synthetic_combined_57.txt', 'HOPKINS...","{'mean_sen_len': 4.696841728642905, 'VADER_sen..."
97,"['ALCOTT_synthetic_combined_103.txt', 'ALCOTT_...",0.729973,0.730092,0.73,"['Twain_314.txt', 'Bronte_893.txt', 'Dickens_8...","['GRIGGS_synthetic_combined_52.txt', 'TWAIN_sy...","{'mean_sen_len': 4.436319726279997, 'VADER_sen..."
98,"['ALCOTT_synthetic_combined_149.txt', 'ALCOTT_...",0.798593,0.813593,0.8,"['Dickens_280.txt', 'Bronte_868.txt', 'Alcott_...","['BRONTE_synthetic_combined_167.txt', 'DICKENS...","{'mean_sen_len': 4.5814546493369726, 'VADER_se..."
99,"['ALCOTT_synthetic_combined_145.txt', 'ALCOTT_...",0.689907,0.690366,0.69,"['Austen_190.txt', 'Griggs_396.txt', 'Alcott_9...","['CHESNUTT_synthetic_combined_121.txt', 'CHESN...","{'mean_sen_len': 4.370718512007027, 'VADER_sen..."
100,"['ALCOTT_synthetic_combined_102.txt', 'ALCOTT_...",0.759952,0.760512,0.76,"['Alcott_2204.txt', 'Twain_2513.txt', 'Dickens...","['GRIGGS_synthetic_combined_131.txt', 'DICKENS...","{'mean_sen_len': 3.934935121492579, 'VADER_sen..."


In [None]:


# Save the updated DataFrame back to the CSV
