In [1]:
import pandas as pd
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report



In [2]:
# Getting list of 20 different clusters of text

import random

df = pd.read_csv("master_stats.csv")
master_text_list = df['ID'].tolist()
prefixes = [name.replace('.txt', '') for name in master_text_list]

# Getting list of all 500 token chunk filenames

chunks_dir = Path('corpus_chunks')

chunk_names = [f for f in os.listdir(chunks_dir) if os.path.isfile(os.path.join(chunks_dir, f))]


files_by_prefix = {prefix: [] for prefix in prefixes}

# Iterate over all files in the directory and group them by prefix
for file_path in chunks_dir.iterdir():
    if file_path.is_file():
        # Extract the prefix (the part before the last underscore)
        file_name = file_path.name
        parts = file_name.rsplit('_', 1)  # Split from the right at the last underscore
        file_prefix = parts[0] if len(parts) > 1 else file_name  # Get the part before the last underscore

        # If the prefix is one of the predefined prefixes, add the file to the list
        if file_prefix in files_by_prefix:
            files_by_prefix[file_prefix].append(file_path)

## Initialize a list to store the sampled file names
sampled_files_list = []

# Randomly sample 100 files from each prefix group
for prefix, files in files_by_prefix.items():
    # Check if there are at least 100 files to sample
    if len(files) > 100:
        sampled = random.sample(files, 100)
    else:
        sampled = files  # If fewer than 100 files, sample all of them

    # Add the filenames of the sampled files to the list
    sampled_files_list.extend(file.name for file in sampled)


chunks_df = pd.read_csv("master_features_chunks.csv")
chunks_df['author'] = chunks_df['ID'].apply(lambda x: x.split('_')[0].lower())

sample_df = chunks_df.loc[chunks_df['ID'].isin(sampled_files_list)]
sample_df = sample_df.set_index('ID')
sample_df.head()

Unnamed: 0_level_0,nation,gender,category,mean_sen_len,male_pronouns,female_pronouns,TTR,lex_density,VADER_sentiment,concreteness,...,again,other,must,after,go,might,too,through,himself,author
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Twain_2265.txt,American,male,authentic,11.409091,0.009554,0.0,0.522293,0.547771,0.9815,2.707865,...,0.006369,0.003185,0.0,0.0,0.006369,0.0,0.0,0.0,0.006369,twain
GASKELL_synthetic_combined_140.txt,British/Irish,female,synthetic,27.777778,0.0,0.011111,0.573333,0.577778,0.9901,2.485116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004444,0.0,gaskell
GRIGGS_synthetic_combined_168.txt,American,male,synthetic,26.315789,0.0,0.0,0.542222,0.52,-0.5383,2.447906,...,0.0,0.0,0.002222,0.0,0.0,0.0,0.0,0.002222,0.0,griggs
Austen_613.txt,British/Irish,female,authentic,33.4,0.0,0.057554,0.501199,0.568345,-0.9435,2.231135,...,0.0,0.0,0.009592,0.0,0.0,0.002398,0.002398,0.0,0.0,austen
CHESNUTT_synthetic_combined_59.txt,American,male,synthetic,31.25,0.0,0.015217,0.530435,0.556522,0.9767,2.615499,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,chesnutt


In [3]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, Twain_2265.txt to GASKELL_synthetic_combined_182.txt
Columns: 211 entries, nation to author
dtypes: float64(205), int64(2), object(4)
memory usage: 3.2+ MB


In [79]:
X = sample_df.drop(columns=['nation', 'gender', 'category', 'author'])  # Drop the label column to get the features
y = sample_df['category']  # Target variable

# If 'category' is a string, convert it to numerical labels
y, original_categories = pd.factorize(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

# Print the classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Save precision, recall, and f1 scores as variables
print(f'Weighted Precision: {precision}')
print(f'Weighted Recall: {recall}')
print(f'Weighted F1 Score: {f1}')

report = classification_report(y_test, y_pred)

Accuracy: 0.844
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       513
           1       0.86      0.81      0.84       487

    accuracy                           0.84      1000
   macro avg       0.85      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000

Weighted Precision: 0.8448574132466706
Weighted Recall: 0.844
Weighted F1 Score: 0.8437634381857497


In [80]:
print(pd.factorize(y))

(array([0, 0, 0, ..., 0, 1, 0]), array([0, 1]))


## 0 = Synthetic (larger negative co-efficients indicate likilhood of synthetic)
## 1 = Authentic (larger positive co-efficients indicate liklihood of authentic)

In [81]:
# Convert y_test and y_pred back to the original string categories
y_test_original = pd.Series(original_categories[y_test], index=X_test.index)
y_pred_original = pd.Series(original_categories[y_pred], index=X_test.index)

# Create a DataFrame with the actual and predicted categories
y_test_df = pd.DataFrame({
    'actual_category': y_test_original,
    'predicted_category': y_pred_original
})

# Find the incorrectly labeled rows
incorrectly_labeled = y_test_df[y_test_df['actual_category'] != y_test_df['predicted_category']]
incorrectly_labeled = incorrectly_labeled.sort_values("actual_category")

# Print the incorrectly labeled rows with ID, actual category, and predicted category
print("Incorrectly labeled samples (with ID, actual category, and predicted category):")
print(incorrectly_labeled)



Incorrectly labeled samples (with ID, actual category, and predicted category):
                                   actual_category predicted_category
ID                                                                   
Hopkins_81.txt                           authentic          synthetic
Twain_2573.txt                           authentic          synthetic
Austen_1105.txt                          authentic          synthetic
Gaskell_644.txt                          authentic          synthetic
Chesnutt_565.txt                         authentic          synthetic
...                                            ...                ...
TWAIN_synthetic_combined_69.txt          synthetic          authentic
TWAIN_synthetic_combined_123.txt         synthetic          authentic
TWAIN_synthetic_combined_146.txt         synthetic          authentic
HOPKINS_synthetic_combined_73.txt        synthetic          authentic
DICKENS_synthetic_combined_139.txt       synthetic          authentic

[156 rows

In [95]:
incorrectly_labeled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 156 entries, Hopkins_81.txt to DICKENS_synthetic_combined_139.txt
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   actual_category     156 non-null    object
 1   predicted_category  156 non-null    object
dtypes: object(2)
memory usage: 3.7+ KB


In [82]:
authentic_mislabeled = []
synthetic_mislabeled = []

for index, row in incorrectly_labeled.iterrows():
    if row["actual_category"] == "authentic":
        authentic_mislabeled.append(index)
    else:
        synthetic_mislabeled.append(index)

print(authentic_mislabeled)
print(synthetic_mislabeled)

['Hopkins_81.txt', 'Twain_2573.txt', 'Austen_1105.txt', 'Gaskell_644.txt', 'Chesnutt_565.txt', 'Austen_674.txt', 'Chesnutt_89.txt', 'Griggs_307.txt', 'Gaskell_2386.txt', 'Alcott_277.txt', 'Twain_1036.txt', 'Twain_1677.txt', 'Twain_1278.txt', 'Chesnutt_593.txt', 'Twain_562.txt', 'Alcott_1216.txt', 'Gaskell_688.txt', 'Austen_610.txt', 'Austen_986.txt', 'Bronte_647.txt', 'Alcott_439.txt', 'Gaskell_496.txt', 'Bronte_1017.txt', 'Alcott_2535.txt', 'Austen_830.txt', 'Austen_1088.txt', 'Griggs_215.txt', 'Alcott_1547.txt', 'Alcott_1757.txt', 'Alcott_2251.txt', 'Austen_413.txt', 'Alcott_457.txt', 'Stoker_790.txt', 'Gaskell_2245.txt', 'Bronte_676.txt', 'Chesnutt_365.txt', 'Twain_2525.txt', 'Griggs_236.txt', 'Alcott_1486.txt', 'Alcott_2253.txt', 'Griggs_92.txt', 'Dickens_2498.txt', 'Hopkins_97.txt', 'Austen_1524.txt', 'Dickens_1719.txt', 'Bronte_1472.txt', 'Twain_1668.txt', 'Alcott_2247.txt', 'Austen_727.txt', 'Twain_2422.txt', 'Austen_1572.txt', 'Gaskell_466.txt', 'Dickens_3519.txt', 'Alcott_1232

In [83]:
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.coef_[0]  # For binary classification; use model.coef_ for multi-class
})

# Sort features by importance

feature_importance = feature_importance.reindex(feature_importance['importance'].abs().sort_values(ascending=False).index)

print("Feature Importance (sorted by absolute values):")
print("Postive values indicate positive correlation between feature and prediction, negative values a negative correlation. ")
print(feature_importance)


Feature Importance (sorted by absolute values):
Postive values indicate positive correlation between feature and prediction, negative values a negative correlation. 
    feature  importance
107     the   -5.577709
109      of   -4.685133
111       a   -2.900870
110      to    2.503878
112       i    2.010420
..      ...         ...
85     want    0.001875
97   street   -0.001853
49    woman   -0.000394
149      's    0.000000
179     n't    0.000000

[207 rows x 2 columns]


In [84]:
top_10_dict = dict(list(zip(feature_importance['feature'], feature_importance['importance']))[:10])
top_10_pos = (feature_importance[feature_importance['importance'] > 0].nlargest(10, 'importance').set_index('feature')['importance'].to_dict())
top_10_neg = (feature_importance[feature_importance['importance'] < 0].nsmallest(10, 'importance').set_index('feature')['importance'].to_dict())




In [85]:
top_10_neg

{'the': -5.577708606864234,
 'of': -4.685133375653078,
 'a': -2.900870165189069,
 'her': -1.279184979307204,
 'in': -1.2611651977069163,
 'lex_density': -1.1810828331368088,
 'TTR': -1.0615895885585094,
 'with': -0.9621009202387746,
 'concreteness': -0.8947561607370739,
 'like': -0.7402523051266985}

In [86]:
results_df = pd.DataFrame(columns=['samples', 'accuracy(ave_F1)', 'ave_precision', 'ave_recall', 'authentic_mislabeled', 'synthetic_mislabeled', 'top_10_features', 'top_10_pos_features', 'top_10_neg_features'])


results_df.at[0, 'samples'] = sampled_files_list
results_df.at[0, 'accuracy(ave_F1)'] = f1
results_df.at[0, 'ave_precision'] = precision
results_df.at[0, 'ave_recall'] = recall
results_df.at[0, 'authentic_mislabeled'] = authentic_mislabeled
results_df.at[0, 'synthetic_mislabeled'] = synthetic_mislabeled
results_df.at[0, 'top_10_features'] = top_10_dict
results_df.at[0, 'top_10_pos_features'] = top_10_pos
results_df.at[0, 'top_10_neg_features'] = top_10_neg

results_df.head()

Unnamed: 0,samples,accuracy(ave_F1),ave_precision,ave_recall,authentic_mislabeled,synthetic_mislabeled,top_10_features,top_10_pos_features,top_10_neg_features
0,"[ALCOTT_synthetic_combined_131.txt, ALCOTT_syn...",0.843763,0.844857,0.844,"[Hopkins_81.txt, Twain_2573.txt, Austen_1105.t...","[ALCOTT_synthetic_combined_61.txt, TWAIN_synth...","{'the': -5.577708606864234, 'of': -4.685133375...","{'to': 2.503877735269993, 'i': 2.0104201259329...","{'the': -5.577708606864234, 'of': -4.685133375..."


In [87]:
print(len(results_df.loc[0,'top_10_features']))
print(len(results_df.loc[0,'top_10_pos_features']))
print(len(results_df.loc[0,'top_10_neg_features']))

10
10
10


In [88]:
print(results_df.loc[0,'top_10_features'])

{'the': -5.577708606864234, 'of': -4.685133375653078, 'a': -2.900870165189069, 'to': 2.503877735269993, 'i': 2.0104201259329555, 'he': 1.6899772355675153, 'male_pronouns': 1.6899772355675153, 'it': 1.3596891282174441, 'her': -1.279184979307204, 'was': 1.2767693969622573}


In [89]:
#results_df.to_csv("log_regression.csv", index=False)
results_df.to_csv('log_regression.csv', mode='a', header=False, index=False)

In [90]:
a = pd.read_csv("log_regression.csv")
a.head()

Unnamed: 0,samples,accuracy(ave_F1),ave_precision,ave_recall,authentic_mislabeled,synthetic_mislabeled,top_10_features,top_10_pos_features,top_10_neg_features
0,"['ALCOTT_synthetic_combined_34.txt', 'ALCOTT_s...",0.829459,0.832166,0.83,"['Griggs_131.txt', 'Chesnutt_273.txt', 'Austen...","['ALCOTT_synthetic_combined_155.txt', 'AUSTEN_...","{'the': 5.5614976701593335, 'of': 4.4376037893...","{'the': 5.5614976701593335, 'of': 4.4376037893...","{'to': -2.3174725017573636, 'i': -1.9932841112..."
1,"['ALCOTT_synthetic_combined_131.txt', 'ALCOTT_...",0.843763,0.844857,0.844,"['Hopkins_81.txt', 'Twain_2573.txt', 'Austen_1...","['ALCOTT_synthetic_combined_61.txt', 'TWAIN_sy...","{'the': -5.577708606864234, 'of': -4.685133375...","{'to': 2.503877735269993, 'i': 2.0104201259329...","{'the': -5.577708606864234, 'of': -4.685133375..."


In [91]:
type(a.loc[1,'top_10_pos_features'])

str

In [92]:
print(a.loc[1,'top_10_pos_features'])

{'to': 2.503877735269993, 'i': 2.0104201259329555, 'he': 1.6899772355675153, 'male_pronouns': 1.6899772355675153, 'it': 1.3596891282174441, 'was': 1.2767693969622573, 'you': 1.1769141067446196, 'had': 1.07543058230499, 'not': 0.9568075606441202, 'have': 0.7723651466860788}


In [93]:
import ast

x = ast.literal_eval(a.loc[1, 'top_10_pos_features'])

len(x)

10

In [94]:
print(len(ast.literal_eval(a.loc[1,'top_10_features'])))
print(len(ast.literal_eval(a.loc[1,'top_10_pos_features'])))
print(len(ast.literal_eval(a.loc[1,'top_10_neg_features'])))
print(a.loc[1,'top_10_features'])

10
10
10
{'the': -5.577708606864234, 'of': -4.685133375653078, 'a': -2.900870165189069, 'to': 2.503877735269993, 'i': 2.0104201259329555, 'he': 1.6899772355675153, 'male_pronouns': 1.6899772355675153, 'it': 1.3596891282174441, 'her': -1.279184979307204, 'was': 1.2767693969622573}
