# 🧠 Spooky Author Identification

## Objective:
###### Predict the author of horror story excerpts written by Edgar Allan Poe, Mary Shelley, or H.P. Lovecraft.

###### The dataset consists of text samples extracted from public domain works by these three iconic authors. To create the dataset, longer texts were segmented into smaller excerpts—primarily sentences—using the MaxEnt sentence tokenizer from CoreNLP. As a result, some fragments may not be complete sentences.

###### Your goal is to build a model that can accurately determine which author wrote each excerpt in the test set.

###### Dataset Files:

###### train.csv — contains labeled training excerpts with corresponding author labels (EAP, HPL, MWS)

###### test.csv — contains unlabeled excerpts for which author predictions are required




In [None]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from nltk import pos_tag, bigrams, FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

%matplotlib inline

In [None]:
# Load the labeled training excerpts; the relative path means train.csv must
# sit in the current working directory.
train = pd.read_csv("train.csv")
# Preview the first rows (shown as the cell output).
train.head()

In [None]:
# Summary statistics of the training frame (shown as the cell output).
train.describe()

In [None]:
# Basic per-excerpt statistics derived from the raw text.
# Tokens come from the lower-cased text; sentences from the original text.
train['tokens'] = train['text'].str.lower().apply(lambda x: TextBlob(x).words)
train['nb_tokens'] = train['tokens'].apply(len)
# Mean token length in characters. np.mean([]) returns NaN (and emits a
# RuntimeWarning), so an excerpt that yields no tokens gets 0.0 instead of
# leaking NaN into the model features.
train['word_length'] = train['tokens'].apply(
    lambda x: np.mean([len(word) for word in x]) if len(x) else 0.0
)
train['sentences'] = train['text'].apply(lambda x: TextBlob(x).sentences)
train['nb_sentences'] = train['sentences'].apply(len)

In [None]:
# Per-author totals: 'sum' is the total token count, 'count' the number of
# excerpts, and 'mean_words' their ratio.
grouped = train.groupby('author')['nb_tokens'].agg(['sum', 'count'])
grouped['mean_words'] = grouped['sum'] / grouped['count']

# Two stacked bar charts sharing the author axis.
f, axarr = plt.subplots(2, sharex=True)
panels = [
    ('count', 'Number of extracts per author'),
    ('mean_words', 'Mean amount of words per author'),
]
for ax, (column, title) in zip(axarr, panels):
    grouped[column].plot(kind='bar', ax=ax, title=title)
plt.tight_layout()
plt.show()

In [None]:
# NLTK's English stop words, extended with two extra tokens to discard.
stop_words = set(stopwords.words('english')) | {'one', "'s"}


def _drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    return [w for w in tokens if w not in stop_words]


train['useful'] = train['tokens'].apply(_drop_stop_words)

In [None]:
# Single WordNet lemmatizer instance, used by lemmatize_with_new_tags below.
wordnet_lemmatizer = WordNetLemmatizer()

def translate_tag_pos(tup):
    """Map a ``(word, tag)`` pair to ``(word, wordnet_pos)``.

    The tag is matched by its leading letter: N -> 'n', V -> 'v', R -> 'r',
    J -> 'a'. Returns ``None`` when the tag starts with none of these.
    """
    word, tag = tup[0], tup[1]
    prefix_to_pos = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}
    pos = next(
        (mapped for prefix, mapped in prefix_to_pos.items() if tag.startswith(prefix)),
        None,
    )
    if pos is None:
        return None
    return (word, pos)

def lemmatize_with_new_tags(tags):
    """Lemmatize a sequence of ``(word, tag)`` pairs.

    Words whose tag translates via ``translate_tag_pos`` are lemmatized with
    that part of speech; all other words are kept unchanged. Returns a list
    with one entry per input pair, in the same order.
    """
    def to_lemma(tagged):
        mapped = translate_tag_pos(tagged)
        if not mapped:
            # No usable part of speech for this tag: keep the word as-is.
            return tagged[0]
        return wordnet_lemmatizer.lemmatize(mapped[0], pos=mapped[1])

    return [to_lemma(tagged) for tagged in tags]

# POS-tag the stop-word-filtered tokens, then lemmatize them with the
# translated tags.
train['tags'] = train['useful'].apply(pos_tag)
train['lemma'] = train['tags'].apply(lemmatize_with_new_tags)


def _vocab_wealth(row):
    """Return the number of distinct lemmas as a percentage of all tokens.

    Returns 0.0 for a row without tokens instead of raising
    ZeroDivisionError.
    """
    nb_tokens = len(row['tokens'])
    if not nb_tokens:
        return 0.0
    return len(set(row['lemma'])) / nb_tokens * 100


train['vocab_wealth'] = train.apply(_vocab_wealth, axis=1)

In [None]:
# NOTE(review): the tables below are built from every row of `train`,
# including rows that are later held out for validation, so the validation
# report is likely optimistic. Confirm whether the split should happen first.


def _excerpt_lemmas(author):
    """Return the lemma lists (one list per excerpt) for one author label."""
    return train.loc[train['author'] == author, 'lemma'].tolist()


def _flatten(excerpts):
    """Concatenate the per-excerpt lists into one flat list.

    A single comprehension is linear; ``sum(lists, [])`` re-copies the
    accumulated list at every step and is quadratic in corpus size.
    """
    return [word for excerpt in excerpts for word in excerpt]


def _bigram_freq(excerpts):
    """Count bigrams inside each excerpt only.

    Running ``bigrams`` over the flattened corpus would also pair the last
    lemma of one excerpt with the first lemma of the next, counting word
    pairs that never occur in the text.
    """
    return FreqDist(bg for excerpt in excerpts for bg in bigrams(excerpt))


eap_excerpts = _excerpt_lemmas("EAP")
hpl_excerpts = _excerpt_lemmas("HPL")
mws_excerpts = _excerpt_lemmas("MWS")

# Flat per-author lemma lists.
EAP = _flatten(eap_excerpts)
HPL = _flatten(hpl_excerpts)
MWS = _flatten(mws_excerpts)

# Unigram frequency tables.
hpl_freq = FreqDist(HPL)
poe_freq = FreqDist(EAP)
mws_freq = FreqDist(MWS)

# Bigram frequency tables.
hpl_bg_freq = _bigram_freq(hpl_excerpts)
poe_bg_freq = _bigram_freq(eap_excerpts)
mws_bg_freq = _bigram_freq(mws_excerpts)

def compute_author_features(row):
    """Score one row's lemmas against each author's frequency tables.

    For each author, ``*_word_count`` is the mean table frequency of the
    row's lemmas and ``*_bigram_count`` the mean table frequency of its
    bigrams. A row with no lemmas (or no bigrams) scores 0 for that group.

    Returns a ``pd.Series`` with the six features.
    """
    words = row['lemma']
    pairs = list(bigrams(words))

    def mean_frequency(table, items):
        # Average of the table counts over the items; 0 when there are none.
        if not items:
            return 0
        return sum(table[item] for item in items) / len(items)

    return pd.Series({
        'hpl_word_count': mean_frequency(hpl_freq, words),
        'poe_word_count': mean_frequency(poe_freq, words),
        'mws_word_count': mean_frequency(mws_freq, words),
        'hpl_bigram_count': mean_frequency(hpl_bg_freq, pairs),
        'poe_bigram_count': mean_frequency(poe_bg_freq, pairs),
        'mws_bigram_count': mean_frequency(mws_bg_freq, pairs),
    })

# Compute the frequency features row by row and append them as new columns.
author_features = train.apply(compute_author_features, axis=1)
train = pd.concat([train, author_features], axis=1)

In [None]:
# Encode the author labels as integers for the classifier.
encoder = preprocessing.LabelEncoder()
train['target'] = encoder.fit_transform(train['author'])

features = ['nb_tokens', 'word_length', 'nb_sentences', 'vocab_wealth',
            'hpl_word_count', 'poe_word_count', 'mws_word_count',
            'hpl_bigram_count', 'poe_bigram_count', 'mws_bigram_count']

# Fixed seed so the split, the forest, and therefore the report are
# reproducible from one run to the next.
RANDOM_STATE = 42

# Hold out 25% for validation; stratify so each author keeps the same share
# in both parts.
x_train, x_validate, y_train, y_validate = train_test_split(
    train[features],
    train['target'],
    test_size=0.25,
    stratify=train['target'],
    random_state=RANDOM_STATE,
)
rfc = RandomForestClassifier(
    n_estimators=800,
    max_depth=20,
    max_features='sqrt',
    n_jobs=-1,  # fit the trees on all available cores
    random_state=RANDOM_STATE,
)
rfc.fit(x_train, y_train)

# Attach the predictions to a copy of the held-out rows for inspection.
prediction = rfc.predict(x_validate)
validate = train.loc[x_validate.index].copy()
validate['target_predict'] = prediction
validate['predicted_author'] = encoder.inverse_transform(validate['target_predict'])

print(classification_report(validate['target'], validate['target_predict'], target_names=encoder.classes_))