# Senior Project: AI vs Human Text

## Data Collection

### Currently using my old writing assignments and a few miscellaneous essays.

## Data Preprocessing

In [1]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl.metadata (2.0 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Downloading lxml-5.1.0-cp38-cp38-win_amd64.whl.metadata (3.6 kB)
Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
   ---------------------------------------- 0.0/239.6 kB ? eta -:--:--
   ----- --------------------------------- 30.7/239.6 kB 640.0 kB/s eta 0:00:01
   ---------------------------------------- 239.6/239.6 kB 2.4 MB/s eta 0:00:00
Downloading lxml-5.1.0-cp38-cp38-win_amd64.whl (3.9 MB)
   ---------------------------------------- 0.0/3.9 MB ? eta -:--:--
   --------------------- ------------------ 2.1/3.9 MB 44.7 MB/s eta 0:00:01
   ---------------------------------------- 3.9/3.9 MB 49.4 MB/s eta 0:00:00
Installing collected packages: lxml, python-docx
Successfully installed lxml-5.1.0 python-docx-1.1.0


## Setup For Metadata Extraction

In [16]:
def get_word_count(text):
    """Return how many whitespace-delimited tokens ``text`` contains."""
    return sum(1 for _ in text.split())

def get_sentence_count(text):
    """Estimate the number of sentences in ``text``.

    A sentence boundary is a run of ``.``, ``!`` or ``?`` (optionally followed
    by closing quotes/brackets) that is followed by whitespace or the end of
    the text. Requiring that context keeps decimals such as ``3.14`` and
    dotted names such as ``example.com`` in one piece. Fragments without any
    letter or digit (e.g. a lone closing quote) are not counted.

    This is a heuristic: abbreviations followed by a space ("e.g. this")
    still count as a boundary.

    Args:
        text: The string to analyse.

    Returns:
        The number of sentence-like fragments as an ``int`` (0 for empty text).
    """
    import re

    boundary = r"""[.!?]+["'\u201d\u2019)\]]*(?:\s+|$)"""
    fragments = re.split(boundary, text)
    # Only fragments carrying real content count; this drops the empty tail
    # after the final terminator and punctuation-only leftovers.
    return sum(1 for fragment in fragments if any(ch.isalnum() for ch in fragment))

def get_avg_word_length(text):
    """Return the mean length of the whitespace-delimited tokens in ``text``.

    Returns 0 when ``text`` contains no tokens.
    """
    tokens = text.split()
    if not tokens:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

def get_paragraph_count(text):
    """Count the paragraphs in ``text``, where paragraphs are separated by blank lines.

    A blank line is a line that is empty or contains only whitespace, so
    Windows line endings (``\\r\\n\\r\\n``) and separator lines holding stray
    spaces are recognised in addition to a plain double newline.

    Args:
        text: The string to analyse.

    Returns:
        The number of non-empty paragraphs as an ``int`` (0 for empty text).
    """
    import re

    # "\n", optional whitespace (including "\r" and further newlines), "\n".
    blocks = re.split(r'\n\s*\n', text)
    return sum(1 for block in blocks if block.strip())

def get_lexical_diversity(text):
    """Return the type-token ratio of ``text`` (unique words / total words).

    Tokens are normalised before comparison: surrounding ASCII punctuation is
    stripped and case is folded, so "The", "the" and "the." count as the same
    word. Tokens that consist only of punctuation are ignored.

    Args:
        text: The string to analyse.

    Returns:
        A ratio in (0, 1], or 0 when ``text`` contains no words.
    """
    import string

    normalised = (word.strip(string.punctuation).casefold() for word in text.split())
    tokens = [token for token in normalised if token]
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

In [17]:
from docx import Document
import os
import pandas as pd

def docx_to_text_and_metadata(path, label):
    """Read one .docx file and return its text plus descriptive metadata.

    Args:
        path: Path to the .docx file to open.
        label: Class label stored unchanged in the result (e.g. 'Human', 'AI').

    Returns:
        A dict with the keys 'text', 'title', 'subject', 'word_count',
        'sentence_count', 'avg_word_length', 'paragraph_count',
        'lexical_diversity', 'file_name' and 'label'.

    Note:
        Only the paragraphs exposed by ``doc.paragraphs`` are read.
        NOTE(review): text inside tables, headers and footers is presumably
        not included - confirm against the python-docx documentation.
    """
    doc = Document(path)
    paragraph_texts = [paragraph.text for paragraph in doc.paragraphs]
    text = '\n'.join(paragraph_texts)

    # Accessing document properties
    title = doc.core_properties.title
    subject = doc.core_properties.subject

    # The text above is joined with single newlines, so a counter that looks
    # for blank-line separators cannot see the paragraph boundaries. Count the
    # non-empty docx paragraphs directly instead.
    paragraph_count = sum(1 for paragraph_text in paragraph_texts if paragraph_text.strip())

    return {
        'text': text,
        'title': title,
        'subject': subject,
        'word_count': get_word_count(text),
        'sentence_count': get_sentence_count(text),
        'avg_word_length': get_avg_word_length(text),
        'paragraph_count': paragraph_count,
        'lexical_diversity': get_lexical_diversity(text),
        'file_name': os.path.basename(path),  # Get the file name directly from the path
        'label': label  # Include the label in the returned data
    }

def process_essays_directory(docs_dir, label):
    """Extract text and metadata from every Word document in a directory.

    Files are selected by a case-insensitive '.docx' extension and processed
    in sorted name order so repeated runs produce rows in the same order.
    Word's temporary lock files (names starting with '~$', created while a
    document is open) and non-file entries are skipped because they are not
    readable documents.

    Args:
        docs_dir: Directory to scan (not recursive).
        label: Label passed through to every extracted record.

    Returns:
        A list with one metadata dict per processed document.
    """
    docs_data = []

    for file_name in sorted(os.listdir(docs_dir)):
        if not file_name.lower().endswith('.docx'):
            continue
        if file_name.startswith('~$'):
            continue

        file_path = os.path.join(docs_dir, file_name)
        if not os.path.isfile(file_path):
            continue

        docs_data.append(docx_to_text_and_metadata(file_path, label))  # Pass the label

    return docs_data

# Directories for human-written and AI-generated essays.
# The root folder defaults to the original Desktop location but can be
# redirected without editing the notebook by setting the ESSAYS_ROOT
# environment variable before starting the kernel.
essays_root = os.environ.get('ESSAYS_ROOT', r'\Users\Colin\OneDrive\Desktop')
human_docs_dir = os.path.join(essays_root, 'Human Essays SP')
ai_docs_dir = os.path.join(essays_root, 'AI Essays SP')

# Process each directory with the appropriate label
human_docs_data = process_essays_directory(human_docs_dir, 'Human')
ai_docs_data = process_essays_directory(ai_docs_dir, 'AI')

# Combine the data from both sources
all_docs_data = human_docs_data + ai_docs_data

# Convert to DataFrame and export to CSV (written relative to the current
# working directory, one row per document)
df_all_docs = pd.DataFrame(all_docs_data)
df_all_docs.to_csv('combined_docs_with_metadata.csv', index=False)
print("Document data exported to 'combined_docs_with_metadata.csv'.")

Document data exported to 'combined_docs_with_metadata.csv'.


## Find the Directory Where the .csv File Is Stored

In [9]:
import os

# Look up the kernel's current working directory and print it. Relative paths,
# such as the CSV file name passed to to_csv(), are resolved against this
# directory (assuming it was not changed between cells).
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

Current working directory: C:\Users\Colin
