# **Connecting to Dataset on Drive**

In [2]:
from google.colab import drive
import shutil
import os
import sys

drive.mount('/content/drive')
#Copy of data and code
#Copy Data
drive_data = '/content/drive/My Drive/aesthetics_project'
colab_data = '/content/data'
if os.path.exists(drive_data):
    shutil.copytree(drive_data, colab_data, dirs_exist_ok=True)
    print("Copied data from Drive")

#Copy Source Code
drive_src = '/content/drive/My Drive/src'
colab_src = '/content/src'
if os.path.exists(drive_src):
    shutil.copytree(drive_src, colab_src, dirs_exist_ok=True)
    print("Copied src code from Drive")

#Added to Python Path
if '/content/src' not in sys.path:
    sys.path.append('/content/src')
    print("Added /content/src to Python path")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Copied data from Drive
Copied src code from Drive


# **Installing StyloMetrix and spaCy**

In [3]:
print("Installing StyloMetrix and spaCy models...")
!pip install stylo-metrix
!pip install spacy-transformers
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3.tar.gz
print("Installation complete!")

Installing StyloMetrix and spaCy models...
Collecting spacy-transformers
  Using cached spacy_transformers-1.3.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy-transformers)
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Using cached spacy_alignments-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers<4.50.0,>=3.4.0->spacy-transformers)
  Downloading huggingface_hub-0.36.2-py3-none-any.whl.metadata (15 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<4.50.0,>=3.4.0->spacy-transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading spacy_transformers-1.3.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (795 kB)
[2K   [90m━━━━━━━━━━━━━━

In [4]:
#Setup
import pandas as pd
import sys
sys.path.append('/content/src')

# **Extracting Features with StyloMetrix**

In [5]:
from feature_extraction.stylometrix_wrapper import StyloMetrixExtractor

print("Running StyloMetrix feature extraction...")

# Initializing custom OOP wrapper
extractor = StyloMetrixExtractor(lang='en')

# Running the extraction
features_df = extractor.extract(
    input_folder='/content/data/raw',
    output_csv='/content/data/features.csv'
)

print("Feature extraction complete!")

Running StyloMetrix feature extraction...
Initializing StyloMetrix for language 'en'...
Extracting features for 320 documents. This might take a minute...


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 320/320 [08:06<00:00,  1.52s/it]

Feature extraction complete!





In [6]:
# Verification that features were extracted
if os.path.exists('/content/data/features.csv'):
    features_df = pd.read_csv('/content/data/features.csv')
    print(f"Features extracted! Shape: {features_df.shape}")
    print(f"Number of stylometric features: {features_df.shape[1] - 1}")

    # Check of first 5 rows of features
    print("\nFeatures preview:")
    print(features_df.head())
else:
    print("Feature extraction failed")
    sys.exit(1)

Features extracted! Shape: (320, 198)
Number of stylometric features: 197

Features preview:
                    doc_id                                               text  \
0      GPT_description_067  As the sun begins its descent toward the horiz...   
1     Claude_narration_168  # The First Stroke\n\nUnit Seven had been assi...   
2         GPT_dialogue_055  **Title: A Walk in the Park**\n\n**Characters:...   
3        GPT_narration_001  In the quiet town of Verdant Grove, where the ...   
4  LLaMA_argumentation_103  **Argument For the Use of AI in Creative Writi...   

   POS_VERB  POS_NOUN   POS_ADJ   POS_ADV   POS_DET  POS_INTJ  POS_CONJ  \
0  0.165644  0.288344  0.092025  0.049080  0.184049  0.000000  0.042945   
1  0.212329  0.239726  0.061644  0.020548  0.095890  0.006849  0.034247   
2  0.174497  0.194631  0.114094  0.046980  0.100671  0.006711  0.060403   
3  0.221519  0.208861  0.082278  0.012658  0.126582  0.000000  0.056962   
4  0.171975  0.318471  0.133758  0.038217  0.

# **Merging with the Metadata**

In [7]:
import os
import sys
import shutil
import importlib
import pandas as pd

#Due to ghost folder problem, I forced Colab to pull the fresh, updated file from the Drive right before it merges.
drive_file = '/content/drive/My Drive/src/feature_extraction/preprocess.py'
local_file = '/content/src/feature_extraction/preprocess.py'

if os.path.exists(drive_file):
    shutil.copy2(drive_file, local_file)
    print("Pulled updated preprocess.py from Drive")

#Reloading the module so Python sees the new code
import feature_extraction.preprocess
importlib.reload(feature_extraction.preprocess)
from feature_extraction.preprocess import merge_features

print("Merging features with metadata...")

metadata_path = '/content/data/metadata.csv'
features_path = '/content/data/features.csv'

if os.path.exists(metadata_path):
    merged = merge_features(metadata_path, features_path)
    print(f"Merged successfully! Shape: {merged.shape}")

    #Safety: If it's still 0, show us EXACTLY why
    if merged.shape[0] == 0:
        print("\nDEBUG: The IDs still don't match. Let's look at them:")
        meta_df = pd.read_csv(metadata_path)
        feat_df = pd.read_csv(features_path)

        # Check if doc_id exists in metadata
        if 'doc_id' in meta_df.columns:
            print(f"Metadata IDs look like this : {meta_df['doc_id'].head(3).tolist()}")
        else:
            print(f"Metadata columns are        : {meta_df.columns.tolist()}")

        print(f"Feature IDs look like this  : {feat_df['doc_id'].head(3).tolist()}")
else:
    print("metadata.csv not found!")
    sys.exit(1)

Pulled updated preprocess.py from Drive
Merging features with metadata...
Merged successfully! Shape: (320, 201)


# **Saving the Dataset**

In [8]:
merged.to_csv('/content/data/merged_features.csv', index=False)
print("Dataset saved to /content/data/merged_features.csv")

Dataset saved to /content/data/merged_features.csv


# **Quick Overview**

In [9]:
#Quick exploration
print("\n=== Dataset Overview ===")
print(f"Total documents: {len(merged)}")
print(f"Models: {merged['model'].unique()}")
print(f"Genres: {merged['genre'].unique()}")


=== Dataset Overview ===
Total documents: 320
Models: ['GPT' 'LLaMA' 'Claude' 'Mistral']
Genres: ['narration' 'argumentation' 'dialogue' 'description']


In [10]:
#Show feature columns (excluding metadata)
metadata_cols = ['doc_id', 'model', 'genre', 'prompt']
feature_cols = [col for col in merged.columns if col not in metadata_cols]
print(f"\nNumber of stylometric features: {len(feature_cols)}")
print(f"Sample features: {feature_cols[:10]}")


Number of stylometric features: 197
Sample features: ['text', 'POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ', 'POS_CONJ', 'POS_PART', 'POS_NUM']


In [11]:
#Basic statistics
print("\n=== Documents per model ===")
print(merged['model'].value_counts())

print("\n=== Documents per genre ===")
print(merged['genre'].value_counts())


=== Documents per model ===
model
GPT        80
LLaMA      80
Claude     80
Mistral    80
Name: count, dtype: int64

=== Documents per genre ===
genre
narration        80
argumentation    80
dialogue         80
description      80
Name: count, dtype: int64


In [12]:
# Preview of the merged data
print("\n=== First 5 rows of merged data ===")
merged.head()


=== First 5 rows of merged data ===


Unnamed: 0,doc_id,model,genre,prompt,text,POS_VERB,POS_NOUN,POS_ADJ,POS_ADV,POS_DET,...,RE,ASF,ASM,OM,RCI,DMC,OR,QAS,PA,PR
0,GPT_narration_000,GPT,narration,Write a short story about a robot learning to ...,In a small workshop at the edge of a bustling ...,0.20122,0.256098,0.091463,0.02439,0.134146,...,0.0,0.0,0.006098,0.0,0.0,0.0,0.0,0.097561,0.0,0.0
1,GPT_narration_001,GPT,narration,Write a short story about a robot learning to ...,"In the quiet town of Verdant Grove, where the ...",0.221519,0.208861,0.082278,0.012658,0.126582,...,0.0,0.0,0.006329,0.0,0.0,0.006329,0.006329,0.082278,0.006329,0.0
2,GPT_narration_002,GPT,narration,Write a short story about a robot learning to ...,In a small workshop filled with the scent of o...,0.163399,0.261438,0.111111,0.019608,0.124183,...,0.0,0.0,0.0,0.0,0.0,0.006536,0.0,0.078431,0.0,0.0
3,GPT_narration_003,GPT,narration,Write a short story about a robot learning to ...,In a quiet workshop on the outskirts of a bust...,0.208589,0.239264,0.079755,0.02454,0.122699,...,0.0,0.0,0.006135,0.0,0.0,0.0,0.0,0.04908,0.0,0.0
4,GPT_narration_004,GPT,narration,Write a short story about a robot learning to ...,In a small workshop at the edge of a bustling ...,0.229299,0.254777,0.057325,0.019108,0.095541,...,0.0,0.0,0.006369,0.019108,0.0,0.0,0.0,0.057325,0.0,0.0


# **Saving results to Drive**

In [13]:
print("\nSaving results back to Drive...")
#saving merged features
merged.to_csv('/content/drive/My Drive/aesthetics_project/merged_features.csv', index=False)

#copy of for backup
!cp /content/data/features.csv "/content/drive/My Drive/aesthetics_project/features.csv"

print("All files saved to Drive:")
print("   - aesthetics_project/merged_features.csv")
print("   - aesthetics_project/features.csv")
print("   - aesthetics_project/results/ (will contain future plots)")


Saving results back to Drive...
All files saved to Drive:
   - aesthetics_project/merged_features.csv
   - aesthetics_project/features.csv
   - aesthetics_project/results/ (will contain future plots)
