In [None]:
# List the contents of corpus_chunks/, the folder the later cells read chunk files from.
!ls corpus_chunks

In [1]:
import pandas as pd

In [2]:
# Start from an empty frame; feature columns are added to it cell by cell below.
samples_df = pd.DataFrame()

In [3]:
# Create the ID column; it is filled with the kept chunk filenames in a later cell.
samples_df["ID"] = None

In [4]:
# Display the first rows of samples_df.
samples_df.head()

In [5]:
from pathlib import Path
import os

import nltk
from nltk.tokenize import word_tokenize

# Download the tokenizer models if not already done
nltk.download('punkt')

# Path to the folder containing files
folder_path = 'corpus_chunks'

# Chunks with fewer than this many word tokens are set aside in `shorts`
MIN_TOKENS = 500

# Regular files only, in sorted order: os.listdir() also returns sub-directories
# (which open() cannot read) and makes no promise about ordering.
filenames = sorted(
    name for name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, name))
)

# `shorts` collects filenames below the token threshold, `ids` the ones that are kept
shorts = []
ids = []

for filename in filenames:
    # The context manager closes each handle as soon as the text has been read.
    # NOTE(review): relies on the platform default text encoding — confirm the corpus encoding.
    with open(os.path.join(folder_path, filename), 'r') as handle:
        text = handle.read()
    tokens = word_tokenize(text)
    num_tokens = len(tokens)
    if num_tokens < MIN_TOKENS:
        shorts.append(filename)
    else:
        ids.append(filename)

print(ids[:20])

In [20]:
# One row per kept chunk: the ID column holds the filenames collected in `ids`.
samples_df['ID'] = ids

In [21]:
# Display the first rows of samples_df.
samples_df.head()

In [12]:
# Order the rows alphabetically by chunk ID (ascending is the default)
samples_df = samples_df.sort_values("ID")

In [22]:
# Display the first rows of samples_df.
samples_df.head()

In [23]:
# Make the chunk ID the row index; set_index returns a new frame, which is rebound here
samples_df = samples_df.set_index("ID")

In [24]:
# Display the first rows of samples_df.
samples_df.head()

In [55]:
british = ["dickens", "stoker", "austen", "bronte", "gaskell"]
american = ["alcott", "hopkins", "twain", "griggs", "chesnutt"]

# The author key is the part of each ID before the first underscore, lower-cased.
author_keys = samples_df.index.to_series().astype(str).str.split("_").str[0].str.lower()

# Build the whole column as strings in one step. Seeding it with the integer 0 and
# then writing labels into it row by row forces an int -> object upcast on assignment,
# which recent pandas versions flag as deprecated.
# Any key not listed in `british` is labelled "American"; `american` is not consulted.
# NOTE(review): an unrecognised prefix is therefore labelled "American" — confirm that
# every ID starts with one of the ten listed surnames.
samples_df["nation"] = author_keys.isin(british).map(
    {True: "British/Irish", False: "American"}
)

In [58]:
# Helper used to label each chunk as synthetic or authentic
def check_synthetic(value):
    """Return 'synthetic' when `value` contains 'synthetic', otherwise 'authentic'."""
    return 'synthetic' if 'synthetic' in value else 'authentic'

# Classify every row from its index label (the chunk ID) and store the result in 'category'
id_labels = samples_df.index.to_series()
samples_df['category'] = id_labels.map(check_synthetic)

# head(-50) shows every row except the last 50
samples_df.head(-50)

In [35]:
from nltk.tokenize import sent_tokenize

# Mean sentence length = word tokens per sentence, for every chunk in samples_df.
# Iterate over samples_df.index rather than `filenames`: `.loc[label, col] = value`
# with a label that is not in the index appends a new row, which would silently
# re-add the chunks that were filtered out for having fewer than 500 tokens.
mean_sen_len_by_file = {}
for file in samples_df.index:
    # The context manager closes the handle once the text has been read
    with open(os.path.join("corpus_chunks", file), "r") as handle:
        text = handle.read()
    sentences = sent_tokenize(text)
    tokens = word_tokenize(text)
    mean_sen_len_by_file[file] = len(tokens) / len(sentences)

# Assign the column in one step as floats (aligned on the index) instead of seeding
# an integer column with 0 and overwriting it cell by cell.
samples_df["mean_sen_len"] = pd.Series(mean_sen_len_by_file, dtype="float64")
    

In [36]:
male_pronouns = ["him", "his", "he"]

# Relative frequency of ALL listed pronouns together, per chunk.
# Writing the result inside a per-pronoun loop would overwrite the cell on every pass
# and leave only the frequency of the last pronoun in the list, so the counts are
# summed before dividing.
# Iterate over samples_df.index rather than `filenames`: `.loc`/label assignment for a
# file that is not in the index would append it as a new row.
male_pronoun_freq = {}
for file in samples_df.index:
    with open(os.path.join("corpus_chunks", file), "r") as handle:
        text = handle.read()
    # Lower-cased alphabetic tokens only
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    pronoun_count = sum(tokens.count(x) for x in male_pronouns)
    male_pronoun_freq[file] = pronoun_count / len(tokens)

# One float column, aligned on the index
samples_df["male_pronouns"] = pd.Series(male_pronoun_freq, dtype="float64")

In [38]:
female_pronouns = ["her", "hers", "she"]

# Relative frequency of ALL listed pronouns together, per chunk.
# Writing the result inside a per-pronoun loop would overwrite the cell on every pass
# and leave only the frequency of the last pronoun in the list, so the counts are
# summed before dividing.
# Iterate over samples_df.index rather than `filenames`: `.loc`/label assignment for a
# file that is not in the index would append it as a new row.
female_pronoun_freq = {}
for file in samples_df.index:
    with open(os.path.join("corpus_chunks", file), "r") as handle:
        text = handle.read()
    # Lower-cased alphabetic tokens only
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    pronoun_count = sum(tokens.count(x) for x in female_pronouns)
    female_pronoun_freq[file] = pronoun_count / len(tokens)

# One float column, aligned on the index
samples_df["female_pronouns"] = pd.Series(female_pronoun_freq, dtype="float64")

In [40]:
# Type-token ratio: distinct lower-cased alphabetic word forms / total such tokens.
# Iterate over samples_df.index rather than `filenames`: `.loc[label, col] = value`
# with a label that is not in the index appends a new row, which would bring back
# the chunks excluded for being too short.
ttr_by_file = {}
for file in samples_df.index:
    # The context manager closes the handle once the text has been read
    with open(os.path.join("corpus_chunks", file), "r") as handle:
        text = handle.read()
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    types = set(tokens)
    ttr_by_file[file] = len(types) / len(tokens)

# One float column, aligned on the index (no integer seed column to upcast)
samples_df["TTR"] = pd.Series(ttr_by_file, dtype="float64")

In [42]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

# Penn Treebank tags counted as lexical (content) words; built once, outside the loop
lexical_pos = {'NN', 'NNS', 'NNP', 'NNPS',  # Nouns
               'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # Verbs
               'JJ', 'JJR', 'JJS',  # Adjectives
               'RB', 'RBR', 'RBS'}  # Adverbs

# Lexical density = lexical words / all alphabetic words, per chunk.
# Iterate over samples_df.index rather than `filenames`: label assignment for a file
# that is not in the index would append it as a new row.
lex_density_by_file = {}
for file in samples_df.index:
    with open(os.path.join("corpus_chunks", file), "r") as handle:
        text = handle.read()
    # Tag the original token sequence (case and punctuation intact) and filter to
    # alphabetic words afterwards: the tagger uses capitalisation and neighbouring
    # tokens, so tagging lower-cased, punctuation-stripped input mislabels words
    # such as the pronoun "I".
    pos_tags = pos_tag(word_tokenize(text))
    word_tags = [pos for word, pos in pos_tags if word.isalpha()]
    lexical_count = sum(1 for pos in word_tags if pos in lexical_pos)
    lex_density_by_file[file] = lexical_count / len(word_tags)

# One float column, aligned on the index
samples_df["lex_density"] = pd.Series(lex_density_by_file, dtype="float64")

In [43]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Build the analyzer once; constructing it inside the loop repeats its set-up for every file
analyzer = SentimentIntensityAnalyzer()

# Iterate over samples_df.index rather than `filenames`: label assignment for a file
# that is not in the index would append it as a new row.
vader_by_file = {}
for file in samples_df.index:
    # The context manager closes the handle once the text has been read
    with open(os.path.join("corpus_chunks", file), "r") as handle:
        text = handle.read()
    # Keep only the 'compound' entry of the score dictionary
    vader_by_file[file] = analyzer.polarity_scores(text)['compound']

# One float column, aligned on the index
samples_df["VADER_sentiment"] = pd.Series(vader_by_file, dtype="float64")

samples_df['VADER_sentiment']

In [54]:
women_writers = ["alcott", "gaskell", "austen", "bronte", "hopkins"]
male_writers = ["dickens", "stoker", "twain", "griggs", "chesnutt"]

# The author key is the part of each ID before the first underscore, lower-cased.
author_keys = samples_df.index.to_series().astype(str).str.split("_").str[0].str.lower()

# Build the whole column as strings in one step. Seeding it with the integer 0 and
# then writing labels into it row by row forces an int -> object upcast on assignment,
# which recent pandas versions flag as deprecated.
# Any key not listed in `women_writers` is labelled "male"; `male_writers` is not consulted.
# NOTE(review): an unrecognised prefix is therefore labelled "male" — confirm that
# every ID starts with one of the ten listed surnames.
samples_df["gender"] = author_keys.isin(women_writers).map(
    {True: "female", False: "male"}
)

In [48]:
concreteness_df = pd.read_excel("brysbaert_concreteness.xlsx")

# Lower-cased 'Word' -> its 'Conc.M' value
concreteness_dict = dict(zip(concreteness_df['Word'].str.lower(), concreteness_df['Conc.M']))

# Average 'Conc.M' over the tokens of each chunk that appear in the lookup table.
# Iterate over samples_df.index rather than `filenames`: `.loc[label, col] = value`
# with a label that is not in the index appends a new row.
concreteness_by_file = {}
for file in samples_df.index:
    # The context manager closes the handle once the text has been read
    with open(os.path.join("corpus_chunks", file), "r") as handle:
        text = handle.read()
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    # Tokens without an entry are skipped
    concreteness_scores = [concreteness_dict[token] for token in tokens if token in concreteness_dict]
    # A chunk with no rated token gets NaN instead of raising ZeroDivisionError
    if concreteness_scores:
        avg_concreteness = sum(concreteness_scores) / len(concreteness_scores)
    else:
        avg_concreteness = float("nan")
    concreteness_by_file[file] = avg_concreteness

# One float column, aligned on the index
samples_df["concreteness"] = pd.Series(concreteness_by_file, dtype="float64")

In [49]:
# List the column names present in samples_df at this point.
samples_df.columns.values

In [66]:
# Inspect the concreteness column.
samples_df["concreteness"]

In [67]:
# Read the content-word list (one word per line); the handle is closed by the with-block
with open("content_words.txt", "r") as handle:
    content_words_file = handle.read()

# splitlines() plus the blank filter avoids the empty-string entry that split("\n")
# leaves behind when the file ends with a newline; that entry would otherwise end up
# as a feature column named "" when columns are created from this list.
content_words = [line.strip() for line in content_words_file.splitlines() if line.strip()]
content_words[:20]

In [69]:
from collections import Counter

# Unique, non-empty content words in their original order (a blank entry would
# otherwise become a column named "")
feature_words = [word for word in dict.fromkeys(content_words) if word]

# Relative frequency of every content word in every chunk.
# Iterate over samples_df.index rather than `filenames`: `.loc[label, col] = value`
# with a label that is not in the index appends a new row.
freq_rows = {}
for file in samples_df.index:
    # The context manager closes the handle once the text has been read
    with open(os.path.join("corpus_chunks", file), "r") as handle:
        text = handle.read()
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    # Count all tokens once instead of calling list.count() for every word
    counts = Counter(tokens)
    total_tokens = len(tokens)
    freq_rows[file] = [counts[word] / total_tokens for word in feature_words]

content_word_freqs = pd.DataFrame.from_dict(freq_rows, orient="index", columns=feature_words)

# Write one float column per word, aligned on the index.
# NOTE(review): a word equal to an existing column name (e.g. "nation") overwrites
# that column — confirm the word list does not collide with the metadata columns.
for word in feature_words:
    samples_df[word] = content_word_freqs[word]

In [68]:
# Read the stop-word list (one word per line); the handle is closed by the with-block
with open("top_stops.txt", "r") as handle:
    top_stops_file = handle.read()

# splitlines() plus the blank filter avoids the empty-string entry that split("\n")
# leaves behind when the file ends with a newline; that entry would otherwise end up
# as a feature column named "" when columns are created from this list.
top_stops = [line.strip() for line in top_stops_file.splitlines() if line.strip()]
top_stops[:20]

In [73]:
from collections import Counter

# Unique, non-empty stop words in their original order (a blank entry would
# otherwise become a column named "")
stop_feature_words = [word for word in dict.fromkeys(top_stops) if word]

# Relative frequency of every stop word in every chunk.
# Iterate over samples_df.index rather than `filenames`: `.loc[label, col] = value`
# with a label that is not in the index appends a new row.
stop_freq_rows = {}
for file in samples_df.index:
    # The context manager closes the handle once the text has been read
    with open(os.path.join("corpus_chunks", file), "r") as handle:
        text = handle.read()
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    # Count all tokens once instead of calling list.count() for every word
    counts = Counter(tokens)
    total_tokens = len(tokens)
    stop_freq_rows[file] = [counts[word] / total_tokens for word in stop_feature_words]

stop_word_freqs = pd.DataFrame.from_dict(stop_freq_rows, orient="index", columns=stop_feature_words)

# Write one float column per word, aligned on the index.
# NOTE(review): a word equal to an existing column name overwrites that column —
# confirm the stop-word list does not collide with columns created earlier.
for word in stop_feature_words:
    samples_df[word] = stop_word_freqs[word]

In [74]:
# Summarise the frame: row count, columns, dtypes and memory use.
samples_df.info()

In [75]:
# List every column name in the finished feature table.
samples_df.columns.values

In [76]:
# Write the feature table to CSV in the working directory; with to_csv's defaults the
# row index is written as the first column.
samples_df.to_csv("master_features_chunks.csv")

In [77]:
# Display the first rows of the exported table.
samples_df.head()