In [2]:
import os
import random
from pathlib import Path

import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [3]:
# Draw a random sample of chunk files for every source text and load the
# feature rows that belong to the sampled chunks.

# Maximum number of chunks drawn from each source text.
SAMPLES_PER_PREFIX = 100
# None draws a fresh random sample on every run; set an int to reproduce
# one particular sample exactly.
SAMPLE_SEED = None

# Getting list of 20 different clusters of text (one prefix per master text)
df = pd.read_csv("master_stats.csv")
master_text_list = df['ID'].tolist()
prefixes = [name.replace('.txt', '') for name in master_text_list]

# Getting list of all 500 token chunk filenames.
# The directory is scanned once; sorting removes the dependence on filesystem
# listing order so that a fixed SAMPLE_SEED always yields the same sample.
chunks_dir = Path('corpus_chunks')
chunk_paths = sorted(p for p in chunks_dir.iterdir() if p.is_file())
chunk_names = [p.name for p in chunk_paths]

# Group the chunk files by prefix (the part of the name before the last
# underscore). Files whose prefix is not a known master text are ignored.
files_by_prefix = {prefix: [] for prefix in prefixes}
for file_path in chunk_paths:
    # rsplit returns the whole name when it contains no underscore
    file_prefix = file_path.name.rsplit('_', 1)[0]
    if file_prefix in files_by_prefix:
        files_by_prefix[file_prefix].append(file_path)

# Randomly sample up to SAMPLES_PER_PREFIX files from each prefix group
rng = random.Random(SAMPLE_SEED)
sampled_files_list = []
for prefix, files in files_by_prefix.items():
    if len(files) > SAMPLES_PER_PREFIX:
        sampled = rng.sample(files, SAMPLES_PER_PREFIX)
    else:
        sampled = files  # Not enough files to sample from: take all of them

    # Only the file names are kept; they are matched against the 'ID' column below
    sampled_files_list.extend(file.name for file in sampled)

# Load the per-chunk features and derive the author from the ID
# (lower-cased text before the first underscore)
chunks_df = pd.read_csv("master_features_chunks.csv")
chunks_df['author'] = chunks_df['ID'].str.split('_', n=1).str[0].str.lower()

# Keep only the sampled chunks, indexed by their file name
sample_df = chunks_df.loc[chunks_df['ID'].isin(sampled_files_list)].set_index('ID')
sample_df.head()

Unnamed: 0_level_0,nation,gender,category,mean_sen_len,male_pronouns,female_pronouns,TTR,lex_density,VADER_sentiment,concreteness,...,again,other,must,after,go,might,too,through,himself,author
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Chesnutt_401.txt,American,male,authentic,20.958333,0.01847,0.0,0.583113,0.635884,-0.9533,2.747888,...,0.0,0.0,0.0,0.0,0.002639,0.0,0.002639,0.0,0.0,chesnutt
Austen_952.txt,British/Irish,female,authentic,20.32,0.01292,0.007752,0.503876,0.573643,0.9944,2.461802,...,0.0,0.005168,0.007752,0.005168,0.0,0.0,0.005168,0.0,0.0,austen
Stoker_555.txt,British/Irish,male,authentic,19.230769,0.004902,0.004902,0.512255,0.566176,0.9918,2.45856,...,0.0,0.0,0.002451,0.0,0.0,0.009804,0.007353,0.0,0.0,stoker
GRIGGS_synthetic_combined_168.txt,American,male,synthetic,26.315789,0.0,0.0,0.542222,0.52,-0.5383,2.447906,...,0.0,0.0,0.002222,0.0,0.0,0.0,0.0,0.002222,0.0,griggs
BRONTE_synthetic_combined_58.txt,British/Irish,female,synthetic,25.0,0.0,0.015945,0.537585,0.580866,-0.9674,2.567873,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006834,0.0,bronte


In [4]:
# Sanity check: summarise the sampled frame (row count, column count, dtypes, memory use)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, Chesnutt_401.txt to ALCOTT_synthetic_combined_192.txt
Columns: 211 entries, nation to author
dtypes: float64(205), int64(2), object(4)
memory usage: 3.2+ MB


In [28]:
# Empty results table; one row per experiment run is filled in further down.
RESULT_COLUMNS = [
    'samples',
    'accuracy',
    'authentic_mislabeled',
    'synthetic_mislabeled',
    'top_10_features',
]
results_df = pd.DataFrame(columns=RESULT_COLUMNS)

In [14]:
# Columns of sample_df used as classifier inputs (207 columns in this run's output).
# The first seven entries are chunk-level measures (sentence length, pronoun
# rates, TTR, lexical density, VADER sentiment, concreteness); every remaining
# entry is a column named after a single word or token
# (presumably that token's relative frequency in the chunk -- TODO confirm).
# The label/metadata columns nation, gender, category and author are left out.
feature_cols = ['mean_sen_len',
       'male_pronouns', 'female_pronouns', 'TTR', 'lex_density',
       'VADER_sentiment', 'concreteness', 'said', 'mr', 'little', 'time',
       'like', 'know', 'man', 'old', 'hand', 'come', 'miss', 'day',
       'good', 'eye', 'thought', 'way', 'think', 'face', 'sir', 'great',
       'came', 'thing', 'long', 'heart', 'away', 'young', 'went', 'look',
       'word', 'lady', 'life', 'dear', 'head', 'room', 'house', 'looked',
       'night', 'mind', 'shall', 'friend', 'tell', 'place', 'woman',
       'child', 'took', 'door', 'let', 'found', 'mother', 'home', 'got',
       'gentleman', 'father', 'saw', 'better', 'love', 'don', 'going',
       'knew', 'boy', 'people', 'right', 'hope', 'moment', 'year',
       'world', 'voice', 'left', 'poor', 'looking', 'asked', 'girl',
       'felt', 'sat', 'new', 'air', 'oh', 'round', 'want', 'having',
       'soon', 'heard', 'mean', 'stood', 'find', 'light', 'men', 'yes',
       'told', 'hour', 'street', 'seen', 'sure', 'morning', 'returned',
       'manner', 'cried', 'kind', 'replied', 'work', 'the', 'and', 'of',
       'to', 'a', 'i', 'in', 'that', 'was', 'it', 'he', 'her', 'his',
       'you', 'with', 'as', 'for', 'she', 'had', 'not', 'but', 'at', 'on',
       'be', 'is', 'my', 'him', 'have', 'me', 'so', 'all', 'by', 'this',
       'which', 'they', 'were', 'if', 'from', 'there', 'no', 'would',
       'when', "'s", 'one', 'or', 'an', 'do', 'what', 'we', 'been',
       'could', 'up', 'out', 'very', 'their', 'who', 'them', 'are', 'now',
       'more', 'will', 'your', 'into', 'upon', 'then', 'some', 'did',
       'any', 'about', 'than', 'can', 'down', "n't", 'much', 'such',
       'see', 'before', 'never', 'where', 'well', 'over', 'how', 'am',
       'only', 'should', 'made', 'say', 'its', 'has', 'own', 'here',
       'again', 'other', 'must', 'after', 'go', 'might', 'too', 'through',
       'himself']
# Feature matrix and target label (the 'category' column) for the sampled chunks
X = sample_df[feature_cols]
y = sample_df['category']

# Hold out 20% of the sample for evaluation; the fixed random_state makes the
# split repeatable for a given sample_df.
# NOTE(review): the split is not stratified and the features are passed to the
# SVM unscaled. Linear-SVM coefficients depend on feature scale -- confirm this
# is intended before comparing coefficient magnitudes across features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear-kernel SVM on the training portion
model = svm.SVC(kernel='linear').fit(X_train, y_train)

# Predicted labels for the held-out chunks
y_pred = model.predict(X_test)


In [15]:
# Share of held-out chunks whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)



Accuracy: 0.87


In [16]:
# Per-class precision / recall / F1 on the held-out chunks
report_text = classification_report(y_test, y_pred)
print('Classification Report:', report_text, sep='\n')



Classification Report:
              precision    recall  f1-score   support

   authentic       0.96      0.79      0.86       211
   synthetic       0.80      0.96      0.88       189

    accuracy                           0.87       400
   macro avg       0.88      0.87      0.87       400
weighted avg       0.88      0.87      0.87       400



In [17]:
# Boolean mask over the test set: True where the prediction disagrees with the label
incorrect_indices = y_test != y_pred

# Feature rows of the misclassified chunks, with the true and predicted labels
# attached as two extra columns (assign returns a new frame, X_test is untouched)
incorrectly_labeled = X_test.loc[incorrect_indices].assign(
    actual_category=y_test.loc[incorrect_indices],
    predicted_category=y_pred[incorrect_indices],
)

# Show just the two label columns, grouped by the true category
label_columns = ['actual_category', 'predicted_category']
print("Incorrectly labeled samples (correct/predicted):")
print(incorrectly_labeled[label_columns].sort_values('actual_category'))

Incorrectly labeled samples (correct/predicted):
                                 actual_category predicted_category
ID                                                                 
Griggs_503.txt                         authentic          synthetic
Bronte_729.txt                         authentic          synthetic
Austen_734.txt                         authentic          synthetic
Hopkins_35.txt                         authentic          synthetic
Austen_144.txt                         authentic          synthetic
Twain_1935.txt                         authentic          synthetic
Griggs_395.txt                         authentic          synthetic
Gaskell_2166.txt                       authentic          synthetic
Chesnutt_592.txt                       authentic          synthetic
Twain_2089.txt                         authentic          synthetic
Gaskell_2823.txt                       authentic          synthetic
Chesnutt_8.txt                         authentic          synthetic

In [18]:
# Split the IDs of the misclassified chunks by their true category.
# Anything whose true category is not "authentic" lands in the synthetic list.
is_authentic = (incorrectly_labeled["actual_category"] == "authentic").to_numpy()

authentic_mislabeled = incorrectly_labeled.index[is_authentic].tolist()
synthetic_mislabeled = incorrectly_labeled.index[~is_authentic].tolist()

print(authentic_mislabeled)
print(synthetic_mislabeled)

['Stoker_1585.txt', 'Dickens_3308.txt', 'Alcott_2505.txt', 'Alcott_1543.txt', 'Dickens_2498.txt', 'Stoker_188.txt', 'Gaskell_159.txt', 'Chesnutt_397.txt', 'Gaskell_37.txt', 'Dickens_7913.txt', 'Chesnutt_631.txt', 'Griggs_500.txt', 'Alcott_863.txt', 'Griggs_92.txt', 'Bronte_666.txt', 'Bronte_594.txt', 'Bronte_969.txt', 'Gaskell_619.txt', 'Gaskell_2251.txt', 'Bronte_737.txt', 'Alcott_1961.txt', 'Bronte_729.txt', 'Hopkins_35.txt', 'Austen_144.txt', 'Twain_1935.txt', 'Griggs_395.txt', 'Chesnutt_592.txt', 'Twain_2089.txt', 'Gaskell_2823.txt', 'Chesnutt_8.txt', 'Alcott_2714.txt', 'Twain_2739.txt', 'Alcott_205.txt', 'Gaskell_351.txt', 'Austen_552.txt', 'Bronte_1233.txt', 'Twain_316.txt', 'Gaskell_92.txt', 'Stoker_805.txt', 'Dickens_7168.txt', 'Gaskell_2166.txt', 'Austen_357.txt', 'Chesnutt_288.txt', 'Austen_734.txt', 'Griggs_503.txt']
['TWAIN_synthetic_combined_17.txt', 'AUSTEN_synthetic_combined_13.txt', 'TWAIN_synthetic_combined_190.txt', 'TWAIN_synthetic_combined_28.txt', 'TWAIN_synthetic_

In [22]:
# Rank the input features by the magnitude of their linear-SVM coefficient.
# The first row of coef_ is used; the sign is dropped, so the score says how
# strongly a feature weighs in, not towards which class.
feature_importances = abs(model.coef_[0])

# Pair every feature column name with its score, largest score first
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Display the DataFrame
print(feature_importance_df)

    Feature  Importance
107     the   10.082099
109      of    8.639379
111       a    7.065955
110      to    4.876597
112       i    4.086469
..      ...         ...
85     want    0.005662
97   street    0.002887
18      day    0.001008
149      's    0.000000
179     n't    0.000000

[207 rows x 2 columns]


In [25]:
# The ten highest-scoring rows of the importance table
top_10_df = feature_importance_df.head(10)

# Map each of those feature names to its importance score
top_10_features = {
    feature: importance
    for feature, importance in zip(top_10_df['Feature'], top_10_df['Importance'])
}

print(top_10_features)

{'the': 10.082099153229182, 'of': 8.639379456010932, 'a': 7.0659546201480525, 'to': 4.876597115049602, 'i': 4.086469184523478, 'he': 3.251434358914396, 'male_pronouns': 3.251434358914396, 'her': 3.1799090919003126, 'it': 2.778046705562153, 'had': 2.729513495160517}


In [29]:
# Collect this run's results into a one-row frame (nothing is written to disk here).
# The row is built from a single record instead of cell-by-cell enlargement, so
# the list and dict values are stored as-is in their cells.
run_record = {
    'samples': sampled_files_list,
    'accuracy': accuracy,
    'authentic_mislabeled': authentic_mislabeled,
    'synthetic_mislabeled': synthetic_mislabeled,
    'top_10_features': top_10_features,
}
results_df = pd.DataFrame([run_record], columns=list(run_record))

results_df.head()

Unnamed: 0,samples,accuracy,authentic_mislabeled,synthetic_mislabeled,top_10_features
0,"[ALCOTT_synthetic_combined_122.txt, ALCOTT_syn...",0.87,"[Stoker_1585.txt, Dickens_3308.txt, Alcott_250...","[TWAIN_synthetic_combined_17.txt, AUSTEN_synth...","{'the': 10.082099153229182, 'of': 8.6393794560..."


In [30]:
# Append this run's row to the cumulative results log.
# The header is written only when the log is being created (missing or empty
# file); otherwise the first result row would be read back as column names.
# Re-running this cell appends the same row again.
results_path = Path('SVM_data.csv')
write_header = (not results_path.exists()) or results_path.stat().st_size == 0
results_df.to_csv(results_path, mode='a', header=write_header, index=False)

Unnamed: 0,samples,accuracy,authentic_mislabeled,synthetic_mislabeled,top_10_features
0,"['ALCOTT_synthetic_combined_122.txt', 'ALCOTT_...",0.87,"['Stoker_1585.txt', 'Dickens_3308.txt', 'Alcot...","['TWAIN_synthetic_combined_17.txt', 'AUSTEN_sy...","{'the': 10.082099153229182, 'of': 8.6393794560..."
