In [1]:
import pandas as pd

In [2]:
master_df = pd.read_csv("master_stats.csv")

In [3]:
master_df.head()


In [4]:
master_df['ID']

In [39]:
import os
from pathlib import Path
from nltk.tokenize import word_tokenize

# Set up directories
corpus_dir = Path('corpus')
chunks_dir = Path('corpus_chunks')

# Create chunks_dir if it does not exist
chunks_dir.mkdir(exist_ok=True)

# Sample DataFrame - in practice, load it from your source
# master_df = pd.read_csv('path_to_master_df.csv')  # or however you obtain the DataFrame


# Function to read a file and tokenize the text
def read_and_tokenize_file(file_path):
    """Return the word tokens of the UTF-8 text file at ``file_path``.

    The whole file is read into memory and handed to NLTK's
    ``word_tokenize``; the resulting token list is returned unchanged.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return word_tokenize(raw_text)

# Function to save a chunk to a file
def save_chunk(tokens, original_filename, chunk_id, output_dir=None):
    """Write one chunk of tokens to ``<stem>_<chunk_id>.txt``.

    Args:
        tokens: Iterable of token strings; they are joined with single spaces.
        original_filename: Name of the source file the chunk was cut from.
            Every occurrence of ``'.txt'`` is removed to build the stem of
            the output file name.
        chunk_id: Identifier appended to the stem after an underscore.
        output_dir: Directory to write into. When omitted, the notebook-level
            ``chunks_dir`` is used.
    """
    # Build the name from the argument, not from a global loop variable, so
    # the function gives the right name no matter where it is called from.
    stripped_filename = original_filename.replace('.txt', '')
    target_dir = chunks_dir if output_dir is None else Path(output_dir)
    chunk_filename = target_dir / f"{stripped_filename}_{chunk_id}.txt"
    with open(chunk_filename, 'w', encoding='utf-8') as file:
        file.write(' '.join(tokens))

# Process each file listed in master_df: tokenize it and write it back out
# as consecutive chunk files via save_chunk.
for file_id in master_df['ID']:
    # Each ID is used as a path relative to corpus_dir.
    file_path = corpus_dir / file_id
    
    # Read and tokenize file
    tokens = read_and_tokenize_file(file_path)
    
    # Chunking the tokens into 500-token parts
    # (the last chunk of a file holds the remainder and may be shorter)
    chunk_size = 500
    for i in range(0, len(tokens), chunk_size):
        chunk_tokens = tokens[i:i+chunk_size]
        # Chunk ids are 1-based: offsets 0, 500, 1000, ... map to 1, 2, 3, ...
        chunk_id = i // chunk_size + 1
        save_chunk(chunk_tokens, file_id, chunk_id)

print("Processing complete.")

In [6]:
!pwd

In [43]:
master_text_list = master_df['ID'].tolist()

In [45]:
master_text_list

In [40]:
# List all file names in the directory
chunk_names = [f for f in os.listdir(chunks_dir) if os.path.isfile(os.path.join(chunks_dir, f))]

In [41]:
chunk_names[:20]

In [46]:
# Count how many chunk files exist for each source text.
#
# Chunk files are named "<stem>_<chunk_id>.txt", so the source stem is
# everything before the last underscore. Stems are compared for equality
# rather than by substring, so that e.g. "essay_1" does not also pick up
# the chunks of "essay_10".
chunks_per_stem = {}
for subtext in chunk_names:
    stripped_subtext = subtext.replace('.txt', '')
    chunk_stem = stripped_subtext.rsplit('_', 1)[0]
    chunks_per_stem[chunk_stem] = chunks_per_stem.get(chunk_stem, 0) + 1

for text in master_text_list:
    stripped_text = text.replace('.txt', '')
    # Texts with no chunk files at all are reported with a count of 0.
    count = chunks_per_stem.get(stripped_text, 0)
    print(text, count)

In [47]:
from pathlib import Path
import nltk

# Download the tokenizer models if not already done
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Folder holding the chunk files to inspect
directory = Path('corpus_chunks')  # Replace with your directory path

# Names of the chunk files that tokenize to fewer than 500 tokens
shorts = []

for file_path in directory.iterdir():
    # Anything that is not a regular file (e.g. a sub-directory) is skipped
    if not file_path.is_file():
        continue

    # Load the chunk and count its tokens
    text = file_path.read_text(encoding='utf-8')
    tokens = word_tokenize(text)
    num_tokens = len(tokens)

    # Remember the file name when the chunk is shorter than 500 tokens
    if num_tokens < 500:
        shorts.append(file_path.name)

# Report every short chunk that was found
print("Files with fewer than 500 tokens:", shorts)

In [79]:
import random


# Define the folder containing the files
directory = Path('corpus_chunks')  # Replace with your directory path

# Define the list of prefixes (the initial strings before the underscore)
prefixes = [name.replace('.txt', '') for name in master_text_list]

print(prefixes)

In [80]:
# One (initially empty) bucket of chunk paths per known prefix
files_by_prefix = dict((prefix, []) for prefix in prefixes)

# Walk the chunk directory and drop each file into the bucket of its prefix
for file_path in directory.iterdir():
    if not file_path.is_file():
        continue

    # The prefix is everything before the last underscore; a name that has
    # no underscore at all is used unchanged.
    file_name = file_path.name
    head, underscore, _tail = file_name.rpartition('_')
    file_prefix = head if underscore else file_name

    # Files whose prefix is not one of the predefined prefixes are ignored
    bucket = files_by_prefix.get(file_prefix)
    if bucket is not None:
        bucket.append(file_path)

In [81]:
# Remove the short chunk files (those collected in `shorts`) from every prefix
# group so that they cannot be picked by the sampling step.
# `shorts` holds file names, so each path is compared by its `.name`; the
# dictionary keys are prefixes and would never match a file name.
short_names = set(shorts)
files_by_prefix = {
    prefix: [path for path in paths if path.name not in short_names]
    for prefix, paths in files_by_prefix.items()
}

In [82]:

# Initialize a list to store the sampled file names
sampled_files_list = []

# Use a dedicated, seeded generator so that re-running the notebook draws
# the same sample without touching the state of the global `random` module.
sampler = random.Random(42)

# Randomly sample up to 100 files from each prefix group
for prefix, files in files_by_prefix.items():
    # Directory listing order is arbitrary; sorting first makes the seeded
    # draw pick the same files on every run and every machine.
    ordered_files = sorted(files)

    if len(ordered_files) > 100:
        sampled = sampler.sample(ordered_files, 100)
    else:
        sampled = ordered_files  # 100 files or fewer: keep all of them

    # Add the filenames of the sampled files to the list
    sampled_files_list.extend(file.name for file in sampled)

print(sampled_files_list)


In [83]:
len(sampled_files_list)

In [84]:
samples_df = pd.DataFrame(sampled_files_list, columns=['ID'])

In [85]:
samples_df.head()

In [86]:


# Define a function to apply the condition
def check_synthetic(value):
    """Return 'synthetic' when ``value`` contains that substring, else 'authentic'."""
    return 'synthetic' if 'synthetic' in value else 'authentic'

# Apply the function to the 'ID' column and create a new column 'category'
samples_df['category'] = samples_df['ID'].apply(check_synthetic)

samples_df.head(-50)

In [62]:
samples_df.to_csv("samples_labels.csv")

In [87]:
# Load the content words, one per line.
# The encoding is stated explicitly so the result does not depend on the
# platform default, and blank lines are skipped so they do not end up as an
# empty-named column when the words are turned into DataFrame columns.
with open('content_words.txt', 'r', encoding='utf-8') as file:
    content_words = [line.strip() for line in file if line.strip()]

In [88]:
print(content_words)

In [89]:
# Load the stop words, one per line.
# The encoding is stated explicitly so the result does not depend on the
# platform default, and blank lines are skipped so they do not end up as an
# empty-named column when the words are turned into DataFrame columns.
with open('top_stops.txt', 'r', encoding='utf-8') as file:
    top_stops = [line.strip() for line in file if line.strip()]


In [90]:
print(top_stops)

In [91]:
# Add one empty column per stop word to the DataFrame. Every cell starts as
# missing; an existing column with the same name would be overwritten.
for word in top_stops:
    samples_df[word] = pd.NA  # Use pd.NA to represent missing values

samples_df.head()

In [92]:
# Add one empty column per content word to the DataFrame. Every cell starts
# as missing; an existing column with the same name would be overwritten.
for word in content_words:
    samples_df[word] = pd.NA  # Use pd.NA to represent missing values

samples_df.head()

In [93]:
samples_df.info()

In [None]:
import os
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure necessary resources are downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Function to get the WordNet POS tag
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word`` tagged on its own.

    The word is tagged in isolation with ``nltk.pos_tag`` and the first
    letter of the resulting tag selects the constant: J -> adjective,
    N -> noun, V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    _, treebank_tag = nltk.pos_tag([word])[0]
    initial = treebank_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

# Fill every word column of samples_df with the relative frequency of that
# word (after lemmatization) in the chunk file named by the row's 'ID'.
from collections import Counter  # only needed by this cell

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the folder where the files are located
folder_path = 'corpus_chunks'

# Every column except the two metadata columns is a word to be counted
word_columns = [column for column in samples_df.columns if column not in ['ID', 'category']]

# The lemma of a column name does not depend on the file, so compute it once
column_lemmas = {
    column: lemmatizer.lemmatize(column.lower(), get_wordnet_pos(column))
    for column in word_columns
}

# Read, tokenize and lemmatize each file exactly once, then fill all of its
# word columns from a single frequency table. Doing this per file instead of
# per (column, file) pair gives the same numbers with far less work.
for index, file_id in samples_df['ID'].items():
    file_path = os.path.join(folder_path, file_id)

    # Read the content of the file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Tokenize the content
    tokens = word_tokenize(content)

    # Lemmatize each word in the content (each token is POS-tagged on its own)
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower(), get_wordnet_pos(token)) for token in tokens]

    # Occurrences of every lemma in this file; missing lemmas count as 0
    lemma_counts = Counter(lemmatized_tokens)

    # Calculate the total number of tokens
    total_tokens = len(lemmatized_tokens)

    for column in word_columns:
        token_count = lemma_counts[column_lemmas[column]]

        # Calculate the relative frequency (0 for an empty file)
        relative_frequency = token_count / total_tokens if total_tokens > 0 else 0

        # Store the relative frequency in the original column
        samples_df.at[index, column] = relative_frequency

# Now, samples_df will have the relative frequency directly in the original columns
print(samples_df)

