### Importing necessory packages and libraries

In [13]:
import nltk
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score,make_scorer
from nltk.corpus import brown
from statistics import stdev, mean
from collections import Counter

### Function for loading brown corpus and assigning labels as fiction and non-fiction

In [14]:
def load_brown_corpus():
    """Load and process Brown corpus data"""
    fiction_categories = ['fiction', 'mystery', 'romance', 'adventure', 'science_fiction']
    nonfiction_categories = ['news', 'reviews', 'hobbies', 'government', 'learned']
    
    texts = []
    labels = []
    
    for category in brown.categories():
        if category in fiction_categories:
            for fileid in brown.fileids(category):
                words = brown.words(fileid)
                texts.append(' '.join(words))
                labels.append(1) # fiction as 1
        elif category in nonfiction_categories:
            for fileid in brown.fileids(category):
                words = brown.words(fileid)
                texts.append(' '.join(words))
                labels.append(0) #non-fiction as 0
    
    return texts, labels

### Extracting most significat features (only adverb_adjective ratio and adjective_pronoun ratio )

In [16]:
def extract_two_features(text):
    """Extract POS tag ratios (only two most significant features)"""
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    
    adverb_count = sum(1 for _, tag in pos_tags if tag.startswith('RB'))
    adjective_count = sum(1 for _, tag in pos_tags if tag.startswith('JJ'))
    pronoun_count = sum(1 for _, tag in pos_tags if tag.startswith(('PRP', 'WP')))
    
    if adjective_count>0:
        adv_adj_ratio = adverb_count / adjective_count 
    else:
        adv_adj_ratio=0
    if pronoun_count>0:
        adj_pro_ratio = adjective_count / pronoun_count 
    else:
        adj_pro_ratio=0
    
    return [adv_adj_ratio, adj_pro_ratio]

### Extracting Low level features as mentioned in the text

In [17]:

def extract_low_level_features(text):
    """Extract all low-level features"""
    # Tokenization
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    
    # Basic calculations
    sentence_lengths = [len(nltk.word_tokenize(sent)) for sent in sentences]
    word_lengths = [len(word) for word in words if word.isalnum()]
    
    # Token/type calculations
    tokens = [word.lower() for word in words]
    types = set(tokens)
    token_type_ratio = len(tokens) / len(types) if types else 0
    
    # Punctuation calculations
    punct_tokens = [word for word in words if not word.isalnum()]
    punct_types = set(punct_tokens)
    punct_ratio = len(punct_tokens) / len(punct_types) if punct_types else 0
    
    # Count specific punctuation marks
    punct_counts = Counter(punct_tokens)
    hyphen_count = punct_counts['-']
    quote_count = punct_counts['"'] + punct_counts["'"]
    exclaim_count = punct_counts['!']
    question_count = punct_counts['?']
    
    return [
        mean(sentence_lengths) if sentence_lengths else 0,  # Avg sentence length
        mean(word_lengths) if word_lengths else 0,  # Avg word length
        stdev(sentence_lengths) if len(sentence_lengths) > 1 else 0,  # StdDev sentence length
        stdev(word_lengths) if len(word_lengths) > 1 else 0,  # StdDev word length
        token_type_ratio,  # Avg token/type
        0,  # StdDev token/type (calculated over windows in practice)
        punct_ratio,  # Avg token/type (punctuation)
        0,  # StdDev token/type punctuation (calculated over windows in practice)
        hyphen_count / (quote_count + 1),  # Hyphen/Quote
        hyphen_count / (exclaim_count + 1),  # Hyphen/Exclamation
        quote_count / (question_count + 1)  # Quote/Question
    ]

### Extracting pos-based features

In [18]:
def extract_pos_features(text):
    """Extract POS-based features"""
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    
    # Count POS tags
    adverb_count = sum(1 for _, tag in pos_tags if tag.startswith('RB'))
    adjective_count = sum(1 for _, tag in pos_tags if tag.startswith('JJ'))
    noun_count = sum(1 for _, tag in pos_tags if tag.startswith('NN'))
    verb_count = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
    pronoun_count = sum(1 for _, tag in pos_tags if tag.startswith(('PRP', 'WP')))
    
    # Calculate ratios (adding 1 to denominators to avoid division by zero)
    features = [
        adverb_count / (noun_count + 1),      # Adverb/Noun
        adverb_count / (pronoun_count + 1),    # Adverb/Pronoun
        adjective_count / (verb_count + 1),    # Adjective/Verb
        noun_count / (verb_count + 1),         # Noun/Verb
        verb_count / (pronoun_count + 1),      # Verb/Pronoun
        adverb_count / (adjective_count + 1),  # Adverb/Adjective
        adjective_count / (pronoun_count + 1), # Adjective/Pronoun
        noun_count / (pronoun_count + 1)       # Noun/Pronoun
    ]
    
    return features

### Extracting 6 Features as mentioned in the paper 
- Avg sentence length, 
- Avg word length, 
- Hyphen/Quote, 
- Adverb/Adjective,  
- Adjective/Pronoun, 
- Noun/Pronoun

In [19]:
def extract_six_features(text):
    """Extract the 6 specified features"""
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    sentences = nltk.sent_tokenize(text)
    
    # Basic features
    sentence_lengths = [len(nltk.word_tokenize(sent)) for sent in sentences]
    word_lengths = [len(word) for word in tokens if word.isalnum()]
    
    # Punctuation counts
    punct_counts = Counter(token for token in tokens if not token.isalnum())
    hyphen_count = punct_counts['-']
    quote_count = punct_counts['"'] + punct_counts["'"]
    
    # POS counts
    adverb_count = sum(1 for _, tag in pos_tags if tag.startswith('RB'))
    adjective_count = sum(1 for _, tag in pos_tags if tag.startswith('JJ'))
    noun_count = sum(1 for _, tag in pos_tags if tag.startswith('NN'))
    pronoun_count = sum(1 for _, tag in pos_tags if tag.startswith(('PRP', 'WP')))
    
    return [
        mean(sentence_lengths) if sentence_lengths else 0,  # Avg sentence length
        mean(word_lengths) if word_lengths else 0,  # Avg word length
        hyphen_count / (quote_count + 1),  # Hyphen/Quote
        adverb_count / (adjective_count + 1),  # Adverb/Adjective
        adjective_count / (pronoun_count + 1),  # Adjective/Pronoun
        noun_count / (pronoun_count + 1)  # Noun/Pronoun
    ]

### selecting which type of fetures to be extracted 

In [20]:
def extract_features_by_type(text, feature_type='all'):
    """Extract features based on specified type"""
    if feature_type == 'two':
        return extract_two_features(text)
    elif feature_type == 'low_level':
        return extract_low_level_features(text)
    elif feature_type == 'nineteen':
        return extract_low_level_features(text) + extract_pos_features(text)
    elif feature_type == 'six':
        return extract_six_features(text)
    else:
        raise ValueError("Invalid feature type. Choose 'two', 'low_level', 'nineteen', or 'six'")


### Running the model on given datasets as arguments and giving performance matrices after ten-fold cross validation

In [21]:
def run_experiment(X, y):
    """Run experiment with cross-validation using Logistic Regression"""
    
    # Initialize model
    model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
    
    # Create stratified 10-fold cross-validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    # Custom scoring functions
    f1_nonfiction_scorer = make_scorer(f1_score, pos_label=0)
    f1_fiction_scorer = make_scorer(f1_score, pos_label=1)
    # Perform cross-validation for each metric
    accuracy_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    f1_nonfiction_scores = cross_val_score(model, X, y, cv=cv, scoring=f1_nonfiction_scorer)
    f1_fiction_scores = cross_val_score(model, X, y, cv=cv, scoring=f1_fiction_scorer)
    
    # Calculate baseline accuracy for each fold
    baseline_accuracies = []
    for train_idx, test_idx in cv.split(X, y):
        y_test = y[test_idx]
        baseline = max(np.sum(y_test == 0) / len(y_test),
                    np.sum(y_test == 1) / len(y_test))
        baseline_accuracies.append(baseline)
    
    return {
        'accuracy': np.mean(accuracy_scores),
        'accuracy_std': np.std(accuracy_scores),
        'f1_nonfiction': np.mean(f1_nonfiction_scores),
        'f1_nonfiction_std': np.std(f1_nonfiction_scores),
        'f1_fiction': np.mean(f1_fiction_scores),
        'f1_fiction_std': np.std(f1_fiction_scores),
        'baseline': np.mean(baseline_accuracies),
        'baseline_std': np.std(baseline_accuracies)
    }

### Deviding datasets similar to given in paper and feeding to the model, storing output

In [22]:
def run_experiments_for_feature_set(brown_texts, brown_labels, bnc_texts, bnc_labels, feature_type):
    """Run all experiments for a given feature set"""
    print(f"\nExtracting {feature_type} features...")
    brown_features = np.array([extract_features_by_type(text, feature_type) for text in brown_texts])
    bnc_features = np.array([extract_features_by_type(text, feature_type) for text in bnc_texts])
    
    # Run experiments
    print(f"Running experiments for {feature_type} features...")
    
    # 1. Brown Corpus only
    results_brown = run_experiment(brown_features, brown_labels)
    
    # 2. Brown + BNC combined
    combined_features = np.vstack((brown_features, bnc_features))
    combined_labels = np.concatenate((brown_labels, bnc_labels))
    results_combined = run_experiment(combined_features, combined_labels)
    
    # 3. Train on Brown, test on BNC
    model = LogisticRegression(penalty='l1', solver='liblinear')
    model.fit(brown_features, brown_labels)
    bnc_pred = model.predict(bnc_features)
    results_bnc = {
        'accuracy': accuracy_score(bnc_labels, bnc_pred),
        'f1_nonfiction': f1_score(bnc_labels, bnc_pred, pos_label=0),
        'f1_fiction': f1_score(bnc_labels, bnc_pred, pos_label=1),
        'baseline': len(bnc_labels[bnc_labels == 0])/len(bnc_labels)
    }
    
    return {
        'brown': results_brown,
        'combined': results_combined,
        'bnc': results_bnc
    }

### Printing the results

In [23]:
def print_results(all_results):
    """Print results for all feature types and experiments in a organized way"""
    feature_names = {
        'two': 'Original Two Features (Adverb/Adjective, Adjective/Pronoun)',
        'low_level': 'All Low Level Features (11 features)',
        'nineteen': '19 Features (Combined)',
        'six': '6 Selected Features'
    }
    
    for feature_type, results in all_results.items():
        print(f"\n{'='*80}")
        print(f"Results for {feature_names[feature_type]}:")
        print('='*80)
        
        print("\nBrown Corpus (60-40 split):")
        print(f"Accuracy: {100*results['brown']['accuracy']:.2f} ± {results['brown']['accuracy_std']:.2f}")
        print(f"F1 (non-fiction): {results['brown']['f1_nonfiction']:.4f} ± {results['brown']['f1_nonfiction_std']:.4f}")
        print(f"F1 (fiction): {results['brown']['f1_fiction']:.4f} ± {results['brown']['f1_fiction_std']:.4f}")
        print(f"Baseline: {100*results['brown']['baseline']:.2f} ± {results['brown']['baseline_std']:.2f}")
        
        print("\nBrown + BNC combined:")
        print(f"Accuracy: {100*results['combined']['accuracy']:.2f} ± {results['combined']['accuracy_std']:.2f}")
        print(f"F1 (non-fiction): {results['combined']['f1_nonfiction']:.4f} ± {results['combined']['f1_nonfiction_std']:.4f}")
        print(f"F1 (fiction): {results['combined']['f1_fiction']:.4f} ± {results['combined']['f1_fiction_std']:.4f}")
        print(f"Baseline: {100*results['combined']['baseline']:.2f} ± {results['combined']['baseline_std']:.2f}")
        
        print("\nTrain on Brown, Test on BNC:")
        print(f"Accuracy: {100*results['bnc']['accuracy']:.2f}")
        print(f"F1 (non-fiction): {results['bnc']['f1_nonfiction']:.4f}")
        print(f"F1 (fiction): {results['bnc']['f1_fiction']:.4f}")
        print(f"Baseline: {100*results['bnc']['baseline']:.2f}")

### Main execution

In [24]:
print("Loading data...")
brown_texts, brown_labels = load_brown_corpus()

df=pd.read_csv("baby_bnc.csv") #pre processed labeled csv file
bnc_texts=df["text"].tolist()
df['label'] = df['label'].map({'fiction': 1, 'non-fiction': 0}) #our csv has labels as fiction or non-fiction
bnc_labels = df['label'].tolist()

brown_labels = np.array(brown_labels)
bnc_labels = np.array(bnc_labels)

all_results = {}

feature_types = ['two', 'low_level', 'nineteen', 'six']

for feature_type in feature_types:
    all_results[feature_type] = run_experiments_for_feature_set(
        brown_texts, brown_labels, bnc_texts, bnc_labels, feature_type
    )

# Print all results in an organized way
print_results(all_results)

Loading data...

Extracting two features...
Running experiments for two features...

Extracting low_level features...
Running experiments for low_level features...

Extracting nineteen features...
Running experiments for nineteen features...

Extracting six features...
Running experiments for six features...

Results for Original Two Features (Adverb/Adjective, Adjective/Pronoun):

Brown Corpus (60-40 split):
Accuracy: 97.51 ± 0.03
F1 (non-fiction): 0.9801 ± 0.0221
F1 (fiction): 0.9665 ± 0.0357
Baseline: 63.89 ± 0.01

Brown + BNC combined:
Accuracy: 97.89 ± 0.02
F1 (non-fiction): 0.9830 ± 0.0185
F1 (fiction): 0.9722 ± 0.0304
Baseline: 62.53 ± 0.01

Train on Brown, Test on BNC:
Accuracy: 100.00
F1 (non-fiction): 1.0000
F1 (fiction): 1.0000
Baseline: 54.55

Results for All Low Level Features (11 features):

Brown Corpus (60-40 split):
Accuracy: 94.46 ± 0.04
F1 (non-fiction): 0.9562 ± 0.0324
F1 (fiction): 0.9244 ± 0.0536
Baseline: 63.89 ± 0.01

Brown + BNC combined:
Accuracy: 95.25 ± 0.03