In [3]:
import pandas as pd
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [4]:
# Build the working sample: up to 100 chunk files per master text, then pull the
# matching feature rows out of master_features_chunks.csv.
# NOTE(review): no random seed is set in this cell, so each run draws a different sample.

import random

# Each master text ID with '.txt' stripped is the prefix shared by that text's chunk files
df = pd.read_csv("master_stats.csv")
master_text_list = df['ID'].tolist()
prefixes = [text_id.replace('.txt', '') for text_id in master_text_list]

# Names of all regular files in the chunk directory
chunks_dir = Path('corpus_chunks')
chunk_names = [
    entry
    for entry in os.listdir(chunks_dir)
    if os.path.isfile(os.path.join(chunks_dir, entry))
]

# One (initially empty) bucket of chunk paths per known prefix
files_by_prefix = dict((prefix, []) for prefix in prefixes)

for chunk_path in chunks_dir.iterdir():
    if not chunk_path.is_file():
        continue
    # Everything before the final underscore is the grouping key;
    # a name without any underscore is used whole.
    stem, sep, _tail = chunk_path.name.rpartition('_')
    group_key = stem if sep else chunk_path.name
    bucket = files_by_prefix.get(group_key)
    if bucket is not None:  # files with an unknown prefix are ignored
        bucket.append(chunk_path)

# Draw 100 file names at random from every bucket larger than 100;
# smaller buckets are taken in full.
sampled_files_list = []
for bucket in files_by_prefix.values():
    picked = random.sample(bucket, 100) if len(bucket) > 100 else bucket
    sampled_files_list += [chunk.name for chunk in picked]

# Per-chunk feature table; 'author' is the lower-cased part of the ID before the first underscore
chunks_df = pd.read_csv("master_features_chunks.csv")
chunks_df['author'] = chunks_df['ID'].map(lambda chunk_id: chunk_id.split('_', 1)[0].lower())

# Keep only the sampled chunks and index the result by chunk ID
in_sample = chunks_df['ID'].isin(sampled_files_list)
sample_df = chunks_df[in_sample].set_index('ID')
sample_df.head()

Unnamed: 0_level_0,nation,gender,category,mean_sen_len,male_pronouns,female_pronouns,TTR,lex_density,VADER_sentiment,concreteness,...,again,other,must,after,go,might,too,through,himself,author
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Stoker_1737.txt,British/Irish,male,authentic,15.71875,0.007692,0.0,0.484615,0.546154,0.9476,2.620188,...,0.007692,0.0,0.0,0.0,0.002564,0.0,0.0,0.0,0.0,stoker
DICKENS_synthetic_combined_144.txt,British/Irish,male,synthetic,27.777778,0.011111,0.0,0.52,0.535556,-0.9794,2.552137,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006667,0.0,dickens
GASKELL_synthetic_combined_140.txt,British/Irish,female,synthetic,27.777778,0.0,0.011111,0.573333,0.577778,0.9901,2.485116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004444,0.0,gaskell
Bronte_1160.txt,British/Irish,female,authentic,18.518519,0.038554,0.0,0.59759,0.583133,0.9686,2.488449,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00241,0.0,bronte
Gaskell_1892.txt,British/Irish,female,authentic,50.2,0.012077,0.021739,0.507246,0.543478,0.9515,2.480466,...,0.0,0.002415,0.0,0.002415,0.004831,0.0,0.0,0.004831,0.002415,gaskell


In [5]:
# Print a summary of the sampled frame: index range, column count, dtypes and memory use
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, Stoker_1737.txt to CHESNUTT_synthetic_combined_180.txt
Columns: 211 entries, nation to author
dtypes: float64(205), int64(2), object(4)
memory usage: 3.2+ MB


In [6]:
# Empty results table with one column per quantity recorded for a run:
# sampled chunk IDs, accuracy, mislabeled IDs per category, and the top-10 features.
results_df = pd.DataFrame(
    columns=[
        'samples',
        'accuracy',
        'authentic_mislabeled',
        'synthetic_mislabeled',
        'top_10_features',
    ]
)

In [7]:
# Columns of sample_df used as model inputs (207 names, order preserved).
feature_cols = [
    # summary measures computed per chunk
    'mean_sen_len', 'male_pronouns', 'female_pronouns', 'TTR', 'lex_density',
    'VADER_sentiment', 'concreteness',
    # per-word columns (these look like content words -- TODO confirm how they were chosen)
    'said', 'mr', 'little', 'time', 'like', 'know', 'man', 'old', 'hand', 'come',
    'miss', 'day', 'good', 'eye', 'thought', 'way', 'think', 'face', 'sir', 'great',
    'came', 'thing', 'long', 'heart', 'away', 'young', 'went', 'look', 'word', 'lady',
    'life', 'dear', 'head', 'room', 'house', 'looked', 'night', 'mind', 'shall', 'friend',
    'tell', 'place', 'woman', 'child', 'took', 'door', 'let', 'found', 'mother', 'home',
    'got', 'gentleman', 'father', 'saw', 'better', 'love', 'don', 'going', 'knew', 'boy',
    'people', 'right', 'hope', 'moment', 'year', 'world', 'voice', 'left', 'poor', 'looking',
    'asked', 'girl', 'felt', 'sat', 'new', 'air', 'oh', 'round', 'want', 'having',
    'soon', 'heard', 'mean', 'stood', 'find', 'light', 'men', 'yes', 'told', 'hour',
    'street', 'seen', 'sure', 'morning', 'returned', 'manner', 'cried', 'kind', 'replied', 'work',
    # per-word columns (these look like function words, plus the clitics 's and n't)
    'the', 'and', 'of', 'to', 'a', 'i', 'in', 'that', 'was', 'it',
    'he', 'her', 'his', 'you', 'with', 'as', 'for', 'she', 'had', 'not',
    'but', 'at', 'on', 'be', 'is', 'my', 'him', 'have', 'me', 'so',
    'all', 'by', 'this', 'which', 'they', 'were', 'if', 'from', 'there', 'no',
    'would', 'when', "'s", 'one', 'or', 'an', 'do', 'what', 'we', 'been',
    'could', 'up', 'out', 'very', 'their', 'who', 'them', 'are', 'now', 'more',
    'will', 'your', 'into', 'upon', 'then', 'some', 'did', 'any', 'about', 'than',
    'can', 'down', "n't", 'much', 'such', 'see', 'before', 'never', 'where', 'well',
    'over', 'how', 'am', 'only', 'should', 'made', 'say', 'its', 'has', 'own',
    'here', 'again', 'other', 'must', 'after', 'go', 'might', 'too', 'through', 'himself',
]

# Feature matrix and target labels (the 'category' column)
X = sample_df.loc[:, feature_cols]
y = sample_df['category']

In [8]:

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training rows (fit() returns the estimator)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted category for each held-out row
y_pred = model.predict(X_test)



In [9]:
# Share of held-out rows whose predicted category equals the true one
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')



Accuracy: 0.9975


In [10]:
# Per-category precision / recall / F1 on the held-out rows, printed under a heading
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')



Classification Report:
              precision    recall  f1-score   support

   authentic       1.00      1.00      1.00       204
   synthetic       1.00      0.99      1.00       196

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [11]:
# Boolean Series over the test rows: True where the prediction differs from the true label
incorrect_indices = y_test != y_pred

# Feature rows of the misclassified chunks, with the true and predicted labels attached
incorrectly_labeled = X_test.loc[incorrect_indices].assign(
    actual_category=y_test[incorrect_indices],
    predicted_category=y_pred[incorrect_indices],
)

# Show just the two label columns, ordered by the true category
print("Incorrectly labeled samples (correct/predicted):")
label_columns = ['actual_category', 'predicted_category']
print(incorrectly_labeled[label_columns].sort_values('actual_category'))


Incorrectly labeled samples (correct/predicted):
                                  actual_category predicted_category
ID                                                                  
AUSTEN_synthetic_combined_160.txt       synthetic          authentic


In [12]:
# Split the misclassified chunk IDs (the frame's index) by their true category:
# rows whose actual category is "authentic" go in one list, every other row in the other.
is_authentic = (incorrectly_labeled["actual_category"] == "authentic").to_numpy()

authentic_mislabeled = incorrectly_labeled.index[is_authentic].tolist()
synthetic_mislabeled = incorrectly_labeled.index[~is_authentic].tolist()

print(authentic_mislabeled)
print(synthetic_mislabeled)
        

[]
['AUSTEN_synthetic_combined_160.txt']


In [13]:
# Pair every feature column with the importance the fitted forest assigned to it,
# ordered from most to least important (original positional index is kept).
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

Feature Importance:
    feature  importance
109      of    0.082202
140   which    0.073789
126     not    0.066537
111       a    0.053919
134    have    0.047496
..      ...         ...
52     door    0.000044
71     year    0.000032
149      's    0.000000
179     n't    0.000000
89     mean    0.000000

[207 rows x 2 columns]


In [14]:
# First ten rows of feature_importance (which the preceding cell sorted by
# descending importance), i.e. the ten most important features.
top_10_df = feature_importance.iloc[:10]

# Map feature name -> importance. Series.to_dict() yields built-in Python floats,
# so the dict prints -- and is stringified when results are saved -- as plain numbers
# instead of NumPy scalar reprs such as np.float64(...) under NumPy >= 2.
top_10_features = top_10_df.set_index('feature')['importance'].to_dict()

print(top_10_features)

{'of': 0.08220234768085538, 'which': 0.07378894328056752, 'not': 0.06653713966394073, 'a': 0.053918695462191454, 'have': 0.04749570294360368, 'said': 0.045701326461201994, 'the': 0.03955301960829261, 'to': 0.02775982570686983, 'mean_sen_len': 0.02353624917801923, 'had': 0.0228034582600506}


In [15]:
# Store this run's outcome in row 0 of results_df, one cell per column
# (nothing is written to disk in this cell).
run_record = {
    'samples': sampled_files_list,
    'accuracy': accuracy,
    'authentic_mislabeled': authentic_mislabeled,
    'synthetic_mislabeled': synthetic_mislabeled,
    'top_10_features': top_10_features,
}
for column, value in run_record.items():
    results_df.at[0, column] = value

results_df.head()

Unnamed: 0,samples,accuracy,authentic_mislabeled,synthetic_mislabeled,top_10_features
0,"[ALCOTT_synthetic_combined_64.txt, ALCOTT_synt...",0.9975,[],[AUSTEN_synthetic_combined_160.txt],"{'of': 0.08220234768085538, 'which': 0.0737889..."


In [16]:
# Disabled one-off alternative: with to_csv's default write mode this call would
# replace random_forest_data.csv with the current results_df plus a header line.
#results_df.to_csv('random_forest_data.csv', index=False)

In [17]:
# Append this run's row to the cumulative results file.
# The header line is written only when the file is missing or empty; otherwise a
# first run would create a header-less CSV and pd.read_csv would later consume the
# first data row as column names. Appending to an existing file is unchanged.
results_path = Path('random_forest_data.csv')
write_header = not results_path.exists() or results_path.stat().st_size == 0
results_df.to_csv(results_path, mode='a', header=write_header, index=False)

In [18]:
# Load the saved results file back in and show its first rows as a sanity check
a = pd.read_csv(filepath_or_buffer="random_forest_data.csv")
a.head(n=5)

Unnamed: 0,samples,accuracy,authentic_mislabeled,synthetic_mislabeled,top_10_features
0,"['ALCOTT_synthetic_combined_64.txt', 'ALCOTT_s...",0.9975,[],['AUSTEN_synthetic_combined_160.txt'],"{'of': 0.08220234768085538, 'which': 0.0737889..."
