In [None]:
## imports 

import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import treebank
from nltk import pos_tag
import string
import warnings
from collections import Counter
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F

from nltk.help import upenn_tagset

# Fetch the NLTK resources this notebook asks for. Each package is requested
# exactly once (the original list requested 'punkt' twice).
for resource in ('punkt', 'tagsets', 'averaged_perceptron_tagger', 'treebank'):
    nltk.download(resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [5]:
# Loading in corpus csv
# The first CSV column becomes the DataFrame index (index_col=0). Later cells
# read the "text" column of every row and write one feature column per metric.

master_df = pd.read_csv("data/whole_corpus.csv", index_col=0)

# Mean Sentence Length #

In [6]:
# Punctuation tokens to drop before counting words: every character of
# string.punctuation followed by three extra quote tokens.
punc_list = [*string.punctuation, '``', '"', "''"]




['_Chapter I_   JEAN MUIR  "Has she come?"', '"No, Mamma, not yet."', '"I wish it were well over.', 'The thought of it worries and excites me.', 'A cushion for my back, Bella."', 'And poor, peevish Mrs. Coventry sank into an easy chair with a nervous sigh and the air of a martyr, while her pretty daughter hovered about her with affectionate solicitude.', '"Who are they talking of, Lucia?"', 'asked the languid young man lounging on a couch near his cousin, who bent over her tapestry work with a happy smile on her usually haughty face.', '"The new governess, Miss Muir.', 'Shall I tell you about her?"', '"No, thank you.', 'I have an inveterate aversion to the whole tribe.', 'I\'ve often thanked heaven that I had but one sister, and she a spoiled child, so that I have escaped the infliction of a governess so long."', '"How will you bear it now?"']


In [7]:
# Calculating mean sentence length (non-punctuation tokens per sentence)

# Float placeholder instead of None so the feature column is numeric rather
# than object-typed.
master_df["mean_sen_len"] = float("nan")

# Set membership is a constant-time lookup; the list would be scanned per token.
punc_tokens = set(punc_list)

for index, row in master_df.iterrows():
    text = row["text"]
    sentences = sent_tokenize(text)
    no_sentences = len(sentences)
    # Words are all tokens that are not in the punctuation list.
    tokens = [token for token in word_tokenize(text) if token not in punc_tokens]
    word_count = len(tokens)
    # A text with no sentences would otherwise raise ZeroDivisionError;
    # record 0 for it, matching the convention of the other feature cells.
    mean_len = word_count / no_sentences if no_sentences else 0
    master_df.at[index, "mean_sen_len"] = mean_len

    # Progress message every 10000 rows (relies on the index labels being integers).
    if (index + 1) % 10000 == 0:
        print(f"Processed {index + 1} sentences out of {len(master_df)}")
    

Processed 10000 sentences out of 86882
Processed 20000 sentences out of 86882
Processed 30000 sentences out of 86882
Processed 40000 sentences out of 86882
Processed 50000 sentences out of 86882
Processed 60000 sentences out of 86882
Processed 70000 sentences out of 86882
Processed 80000 sentences out of 86882


Unnamed: 0,id,text,category,author,book/prompt,model,nation,gender,race,mean_sen_len
0,1,"_Chapter I_ JEAN MUIR ""Has she come?"" ""No, ...",authentic,alcott,behind a mask,authentic,American,female,white,10.785714
1,2,"asked Lucia. ""Leave the house while she is in ...",authentic,alcott,behind a mask,authentic,American,female,white,16.0
2,3,"She is a nice person, I dare say, and when onc...",authentic,alcott,behind a mask,authentic,American,female,white,14.6


# Sentiment Scores #

In [18]:
# Downloading model for sentiment scores and creating function for sentiment analysis

# Checkpoint name; the tokenizer and the classification model are both loaded from it.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

def sentiment_score(text):
    """Return a signed sentiment score for ``text``.

    The text is tokenized (with truncation enabled), passed through ``model``,
    and the resulting logits are converted to probabilities with a softmax.

    Args:
        text: The string to score.

    Returns:
        float: probability of class index 1 minus probability of class index 0,
        so the value lies in [-1, 1]. Index 1 is presumably the positive label
        of this checkpoint -- confirm against ``model.config.id2label``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built per call.
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = F.softmax(logits, dim=-1)
    return (probabilities[0][1] - probabilities[0][0]).item()


In [19]:
# Calculating sentiment scores

batch_size = 1000
# Float placeholder instead of None so the scores are stored in a numeric
# column rather than an object-typed one.
master_df["sentiment"] = float("nan")

# Rows are scored one at a time; the batches only pace the progress messages.
for start in range(0, len(master_df), batch_size):
    end = min(start + batch_size, len(master_df))

    # Iterate over the text column directly instead of building a full row
    # Series for every record.
    for index, text in master_df["text"].iloc[start:end].items():
        master_df.at[index, "sentiment"] = sentiment_score(text)

    # Print progress update after each batch
    print(f"Processed {end} sentences out of {len(master_df)}")
    

Processed 1000 sentences out of 86882
Processed 2000 sentences out of 86882
Processed 3000 sentences out of 86882
Processed 4000 sentences out of 86882
Processed 5000 sentences out of 86882
Processed 6000 sentences out of 86882
Processed 7000 sentences out of 86882
Processed 8000 sentences out of 86882
Processed 9000 sentences out of 86882
Processed 10000 sentences out of 86882
Processed 11000 sentences out of 86882
Processed 12000 sentences out of 86882
Processed 13000 sentences out of 86882
Processed 14000 sentences out of 86882
Processed 15000 sentences out of 86882
Processed 16000 sentences out of 86882
Processed 17000 sentences out of 86882
Processed 18000 sentences out of 86882
Processed 19000 sentences out of 86882
Processed 20000 sentences out of 86882
Processed 21000 sentences out of 86882
Processed 22000 sentences out of 86882
Processed 23000 sentences out of 86882
Processed 24000 sentences out of 86882
Processed 25000 sentences out of 86882
Processed 26000 sentences out of 8

# Gendered Pronouns #

In [20]:
# Relative frequency male and female pronouns

# Float columns: the values written below are fractions, and writing a float
# into an integer-typed column is deprecated in recent pandas releases.
master_df["male_pronouns"] = 0.0
master_df["female_pronouns"] = 0.0

male_pronouns = ["him", "his", "he", "himself"]
female_pronouns = ["her", "hers", "she", "herself"]

batch_size = 10000

# Process in batches (the batches only pace the progress messages)
for start in range(0, len(master_df), batch_size):
    end = min(start + batch_size, len(master_df))

    for index, text in master_df["text"].iloc[start:end].items():
        # Tokenize, lowercase and remove non alphabetic tokens
        tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
        total_tokens = len(tokens)
        # One counting pass over the tokens instead of one list scan per pronoun.
        token_counts = Counter(tokens)

        # Calculate male pronoun relative frequency (0 when there are no tokens)
        male_count = sum(token_counts[pronoun] for pronoun in male_pronouns)
        master_df.at[index, "male_pronouns"] = male_count / total_tokens if total_tokens else 0.0

        # Calculate female pronoun relative frequency (0 when there are no tokens)
        female_count = sum(token_counts[pronoun] for pronoun in female_pronouns)
        master_df.at[index, "female_pronouns"] = female_count / total_tokens if total_tokens else 0.0

    # Print progress update
    print(f"Processed {end} sentences out of {len(master_df)}")




Processed 10000 sentences out of 86882
Processed 20000 sentences out of 86882
Processed 30000 sentences out of 86882
Processed 40000 sentences out of 86882
Processed 50000 sentences out of 86882
Processed 60000 sentences out of 86882
Processed 70000 sentences out of 86882
Processed 80000 sentences out of 86882
Processed 86882 sentences out of 86882


Unnamed: 0,id,text,category,author,book/prompt,model,nation,gender,race,mean_sen_len,sentiment,male_pronouns,female_pronouns
0,1,"_Chapter I_ JEAN MUIR ""Has she come?"" ""No, ...",authentic,alcott,behind a mask,authentic,American,female,white,10.785714,0.913496,0.006803,0.047619
1,2,"asked Lucia. ""Leave the house while she is in ...",authentic,alcott,behind a mask,authentic,American,female,white,16.0,0.973164,0.016529,0.057851
2,3,"She is a nice person, I dare say, and when onc...",authentic,alcott,behind a mask,authentic,American,female,white,14.6,-0.986233,0.0,0.071429


# Type-Token Ratio #

In [21]:
# Calculating type-token ratio

# Initialize the TTR column as float: the ratios written below are fractions,
# and writing a float into an integer-typed column is deprecated in recent pandas.
master_df["TTR"] = 0.0

batch_size = 10000

# Process in batches (the batches only pace the progress messages)
for start in range(0, len(master_df), batch_size):
    end = min(start + batch_size, len(master_df))

    for index, text in master_df["text"].iloc[start:end].items():
        # Lowercased alphabetic tokens only
        tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]

        types = set(tokens)  # distinct word forms
        total_tokens = len(tokens)
        ttr = len(types) / total_tokens if total_tokens > 0 else 0.0  # Avoid division by zero
        master_df.at[index, "TTR"] = ttr

    # Print progress update after each batch
    print(f"Processed {end} sentences out of {len(master_df)}")


Processed 10000 sentences out of 86882
Processed 20000 sentences out of 86882
Processed 30000 sentences out of 86882
Processed 40000 sentences out of 86882
Processed 50000 sentences out of 86882
Processed 60000 sentences out of 86882
Processed 70000 sentences out of 86882
Processed 80000 sentences out of 86882
Processed 86882 sentences out of 86882


Unnamed: 0,id,text,category,author,book/prompt,model,nation,gender,race,mean_sen_len,sentiment,male_pronouns,female_pronouns,TTR
0,1,"_Chapter I_ JEAN MUIR ""Has she come?"" ""No, ...",authentic,alcott,behind a mask,authentic,American,female,white,10.785714,0.913496,0.006803,0.047619,0.70068
1,2,"asked Lucia. ""Leave the house while she is in ...",authentic,alcott,behind a mask,authentic,American,female,white,16.0,0.973164,0.016529,0.057851,0.710744
2,3,"She is a nice person, I dare say, and when onc...",authentic,alcott,behind a mask,authentic,American,female,white,14.6,-0.986233,0.0,0.071429,0.635714


# Lexical Density #

In [23]:
# Calculating lexical density (share of tokens tagged as noun, verb, adjective or adverb)

# Float column: the densities written below are fractions, and writing a float
# into an integer-typed column is deprecated in recent pandas.
master_df["lex_density"] = 0.0

# Tags counted as lexical words. Defined once, outside the loop, because the
# set does not change from row to row.
lexical_pos = {'NN', 'NNS', 'NNP', 'NNPS',  # Nouns
               'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # Verbs
               'JJ', 'JJR', 'JJS',  # Adjectives
               'RB', 'RBR', 'RBS'}  # Adverbs

for index, row in master_df.iterrows():
    text = row["text"]
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]  # Tokenize and filter alphabetic words
    # NOTE(review): tokens are lowercased before tagging; this may affect how
    # the tagger treats capitalised words such as proper nouns -- confirm intended.
    pos_tags = pos_tag(tokens)

    lexical_words = [word for word, pos in pos_tags if pos in lexical_pos]
    total_words = len(tokens)
    # A text with no alphabetic tokens would otherwise raise ZeroDivisionError.
    lexical_density = len(lexical_words) / total_words if total_words > 0 else 0.0
    master_df.at[index, "lex_density"] = lexical_density

    # Progress message every 10000 rows (relies on the index labels being integers).
    if (index + 1) % 10000 == 0:
        print(f"Processed {index + 1} sentences out of {len(master_df)}")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/claudiac/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Processed 10000 sentences out of 86882
Processed 20000 sentences out of 86882
Processed 30000 sentences out of 86882
Processed 40000 sentences out of 86882
Processed 50000 sentences out of 86882
Processed 60000 sentences out of 86882
Processed 70000 sentences out of 86882
Processed 80000 sentences out of 86882


Unnamed: 0,id,text,category,author,book/prompt,model,nation,gender,race,mean_sen_len,sentiment,male_pronouns,female_pronouns,TTR,lex_density
0,1,"_Chapter I_ JEAN MUIR ""Has she come?"" ""No, ...",authentic,alcott,behind a mask,authentic,American,female,white,10.785714,0.913496,0.006803,0.047619,0.70068,0.544218
1,2,"asked Lucia. ""Leave the house while she is in ...",authentic,alcott,behind a mask,authentic,American,female,white,16.0,0.973164,0.016529,0.057851,0.710744,0.561983
2,3,"She is a nice person, I dare say, and when onc...",authentic,alcott,behind a mask,authentic,American,female,white,14.6,-0.986233,0.0,0.071429,0.635714,0.6


# Relative Frequency of High Frequency Function Words #

In [8]:
# Function-word list, one entry per line of the file. The context manager
# closes the file handle as soon as the contents have been read.
with open("data/top_stops_new.txt", encoding="utf-8") as stops_file:
    top_stops = stops_file.read().splitlines()
top_stops

['its',
 'during',
 'between',
 'how',
 'see',
 'be',
 "isn't",
 'off',
 'must',
 'but',
 "couldn't",
 'ours',
 'a',
 'about',
 'all',
 'any',
 'i',
 'our',
 'here',
 "aren't",
 'and',
 'ourselves',
 'itself',
 'on',
 'under',
 'one',
 "you'll",
 'too',
 'this',
 'after',
 'then',
 'should',
 "that'll",
 'me',
 'why',
 'your',
 'until',
 "won't",
 'further',
 'you',
 "needn't",
 "mightn't",
 'they',
 "don't",
 'each',
 'same',
 'had',
 'just',
 "wouldn't",
 'my',
 'into',
 'that',
 'are',
 "you've",
 'than',
 'do',
 'as',
 'the',
 'them',
 'there',
 'does',
 'some',
 'themselves',
 "weren't",
 "you'd",
 'through',
 'below',
 'in',
 'don',
 "wasn't",
 'an',
 'were',
 'for',
 'has',
 'very',
 'before',
 'or',
 'what',
 "hadn't",
 "doesn't",
 "shan't",
 'having',
 'no',
 'not',
 'well',
 'will',
 'over',
 'which',
 'yourselves',
 'once',
 'am',
 'above',
 'of',
 'other',
 "it's",
 'is',
 'have',
 'much',
 'out',
 'would',
 'by',
 'again',
 "hasn't",
 'myself',
 'down',
 'could',
 'theirs'

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [29]:
# Relative frequency of each word in top_stops, one column per word.

# Define batch size
batch_size = 10000

# Create the columns up front as floats so every row has a value (0.0 when the
# word does not occur) instead of relying on implicit column creation.
# NOTE(review): tokens are filtered with isalpha(), so list entries containing
# an apostrophe (e.g. "isn't") can never match and their columns stay 0.0.
stop_words = set(top_stops)
for stop_word in dict.fromkeys(top_stops):  # de-duplicated, original order kept
    master_df[stop_word] = 0.0

# Process in batches (the batches only pace the progress messages)
for start in range(0, len(master_df), batch_size):
    end = min(start + batch_size, len(master_df))

    for index, text in master_df["text"].iloc[start:end].items():
        tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
        total_tokens = len(tokens)

        # One counting pass per text instead of one list scan per stop word.
        # With no tokens the Counter is empty, so the division is never
        # reached and the row keeps its 0.0 defaults.
        for word, count in Counter(tokens).items():
            if word in stop_words:
                master_df.at[index, word] = count / total_tokens

    print(f"Processed {end} sentences out of {len(master_df)}")


Processed 10000 sentences out of 86882
Processed 20000 sentences out of 86882
Processed 30000 sentences out of 86882
Processed 40000 sentences out of 86882
Processed 50000 sentences out of 86882
Processed 60000 sentences out of 86882
Processed 70000 sentences out of 86882
Processed 80000 sentences out of 86882
Processed 86882 sentences out of 86882


Unnamed: 0,id,text,category,author,book/prompt,model,nation,gender,race,mean_sen_len,...,if,upon,didn't,both,say,said,ask,asked,reply,replied
0,1,"_Chapter I_ JEAN MUIR ""Has she come?"" ""No, ...",authentic,alcott,behind a mask,authentic,American,female,white,10.785714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006803,0.0,0.0
1,2,"asked Lucia. ""Leave the house while she is in ...",authentic,alcott,behind a mask,authentic,American,female,white,16.0,...,0.016529,0.0,0.0,0.0,0.008264,0.0,0.0,0.008264,0.0,0.0
2,3,"She is a nice person, I dare say, and when onc...",authentic,alcott,behind a mask,authentic,American,female,white,14.6,...,0.0,0.0,0.0,0.0,0.007143,0.007143,0.0,0.0,0.0,0.0
3,4,"""Too late, Bella, the train was in some time a...",authentic,alcott,behind a mask,authentic,American,female,white,13.272727,...,0.0,0.0,0.0,0.0,0.007042,0.007042,0.0,0.0,0.0,0.0
4,5,"We'll stand by poor little Muir, won't we?"" An...",authentic,alcott,behind a mask,authentic,American,female,white,18.125,...,0.007143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Relative Frequencies of Parts of Speech #

In [33]:
# POS tags whose relative frequencies are computed, grouped by word class.
# Extend any group (or add a new one) to track more tags.
all_pos_tags = (
    ['NN', 'NNS', 'NNP', 'NNPS']                    # nouns
    + ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']     # verbs
    + ['JJ', 'JJR', 'JJS']                          # adjectives
    + ['RB', 'RBR', 'RBS']                          # adverbs
    + ['PRP', 'PRP$', 'IN', 'DT']                   # pronouns, prepositions, determiners
)


In [34]:
# Every tracked tag starts at 0.0, so tags that never occur in a text keep a
# zero frequency.
for tag_name in all_pos_tags:
    master_df[tag_name] = 0.0

# Per text: tag the lowercased alphabetic tokens and store each tracked tag's
# share of the token count.
for index, row in master_df.iterrows():
    words = [tok.lower() for tok in word_tokenize(row["text"]) if tok.isalpha()]
    n_words = len(words)

    # Tally only the tags that have a column.
    tag_totals = Counter()
    for _, tag in pos_tag(words):
        if tag in all_pos_tags:
            tag_totals[tag] += 1

    # Write the relative frequency of each tag that occurred.
    for tag, n_tag in tag_totals.items():
        if n_words > 0:
            master_df.at[index, tag] = n_tag / n_words
        else:
            master_df.at[index, tag] = 0

    # Progress report every 10000 rows
    if (index + 1) % 10000 == 0:
        print(f"Processed {index + 1} texts out of {len(master_df)}")


Processed 10000 texts out of 86882
Processed 20000 texts out of 86882
Processed 30000 texts out of 86882
Processed 40000 texts out of 86882
Processed 50000 texts out of 86882
Processed 60000 texts out of 86882
Processed 70000 texts out of 86882
Processed 80000 texts out of 86882
   id                                               text   category  author  \
0   1  _Chapter I_   JEAN MUIR  "Has she come?" "No, ...  authentic  alcott   
1   2  asked Lucia. "Leave the house while she is in ...  authentic  alcott   
2   3  She is a nice person, I dare say, and when onc...  authentic  alcott   

     book/prompt      model    nation  gender   race mean_sen_len  ...  \
0  behind a mask  authentic  American  female  white    10.785714  ...   
1  behind a mask  authentic  American  female  white         16.0  ...   
2  behind a mask  authentic  American  female  white         14.6  ...   

         JJ       JJR  JJS        RB       RBR  RBS       PRP      PRP$  \
0  0.122449  0.000000  0.0  0.0

In [35]:
# Write the corpus together with every feature column added above to disk.
master_df.to_csv("data/master_feature_matrix.csv")