In [1]:
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from textblob import Word
from collections import Counter

In [2]:
# Reading the given data
df=pd.read_csv("all.csv")
df.head()

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [3]:
df.info()
# Only 2 null data appears in the whole df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573 entries, 0 to 572
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   author     573 non-null    object
 1   content    573 non-null    object
 2   poem name  571 non-null    object
 3   age        573 non-null    object
 4   type       573 non-null    object
dtypes: object(5)
memory usage: 22.5+ KB


In [4]:
# Determining indexes of null values
np.where(df['poem name'].isnull())[0]

array([354, 489], dtype=int64)

In [5]:
# Dropping Null values
df.dropna(inplace=True)

In [6]:
def processing(df,col,content=1):
    # Casefold
    df[col]=df[col].apply(lambda x: ' '.join(i.lower() for i in x.split()))

    # Punctuation marks
    df[col]=df[col].apply(lambda x: ''.join([i for i in x  if i.isalnum() or i==' ']))

    # Numbers
    df[col]=df[col].str.replace('\d', '')

    # Cleaning stopwords
    sw=stopwords.words('english')
    df[col]=df[col].apply(lambda x: ' '.join([i for i in x.split() if i not in sw]))
    
    if content: # if the col is content of the poem
        # Less used words
        sozler=pd.Series((' '.join(df[col].values)).split()).value_counts().tail(1000)

        df[col]=df[col].apply(lambda x: ' '.join(i for i in x.split() if i not in sozler.index))

    # Cleaning 1 letter words (abbr. like words)
    df[col]=df[col].apply(lambda x: ' '.join(i for i in x.split() if len(i)>1))

    # lemmatizing
    df[col]=df[col].apply(lambda x: ' '.join([Word(i).lemmatize() for i in x.split()]))
    
    return df

In [7]:
# Raw Content
df.content[0]

'Let the bird of loudest lay\r\nOn the sole Arabian tree\r\nHerald sad and trumpet be,\r\nTo whose sound chaste wings obey.\r\n\r\nBut thou shrieking harbinger,\r\nFoul precurrer of the fiend,\r\nAugur of the fever\'s end,\r\nTo this troop come thou not near.\r\n\r\nFrom this session interdict\r\nEvery fowl of tyrant wing,\r\nSave the eagle, feather\'d king;\r\nKeep the obsequy so strict.\r\n\r\nLet the priest in surplice white,\r\nThat defunctive music can,\r\nBe the death-divining swan,\r\nLest the requiem lack his right.\r\n\r\nAnd thou treble-dated crow,\r\nThat thy sable gender mak\'st\r\nWith the breath thou giv\'st and tak\'st,\r\n\'Mongst our mourners shalt thou go.\r\n\r\nHere the anthem doth commence:\r\nLove and constancy is dead;\r\nPhoenix and the Turtle fled\r\nIn a mutual flame from hence.\r\n\r\nSo they lov\'d, as love in twain\r\nHad the essence but in one;\r\nTwo distincts, division none:\r\nNumber there in love was slain.\r\n\r\nHearts remote, yet not asunder;\r\nDis

In [8]:
df=processing(df,"content")
df=processing(df,"poem name",0)

In [9]:
# Cleaned Content
df.content[0]

'let bird loudest lay sole arabian tree herald sad trumpet whose sound chaste wing obey thou shrieking harbinger foul precurrer fiend augur fever end troop come thou near session interdict every fowl tyrant wing save eagle featherd king keep obsequy strict let priest surplice white defunctive music deathdivining swan lest requiem lack right thou trebledated crow thy sable gender makst breath thou givst takst mongst mourner shalt thou go anthem doth commence love constancy dead phoenix turtle fled mutual flame hence lovd love twain essence one two distincts division none number love slain heart remote yet asunder distance space seen twixt turtle queen wonder love shine turtle saw right flaming phoenix sight either others mine property thus appalled self single nature double name neither two one called reason confounded saw division grow together yet either neither simple well compounded cried true twain seemeth concordant one love reason reason none part remain whereupon made threne pho

In [10]:
# Taking the distinct values of author column
authors=df.author.value_counts().index


# Feature Engineering

In [11]:
# Counting the most used words in each content
results = Counter()
cont_words=[]

for i in df.content:
    results.update(i.split())
    cont_words.append(results.most_common()[0][0])
    results.clear()

cont_words

['love',
 'queen',
 'vice',
 'knight',
 'see',
 'pure',
 'praisd',
 'go',
 'thou',
 'wolde',
 'ye',
 'love',
 'thou',
 'day',
 'bene',
 'thy',
 'thou',
 'made',
 'one',
 'thou',
 'doth',
 'horse',
 'want',
 'sport',
 'theyr',
 'thou',
 'merrily',
 'love',
 'fire',
 'thy',
 'shall',
 'fie',
 'green',
 'ye',
 'thy',
 'gut',
 'praisd',
 'swan',
 'thou',
 'cuckoo',
 'rain',
 'thy',
 'thou',
 'spring',
 'go',
 'crooked',
 'love',
 'love',
 'black',
 'fish',
 'like',
 'soul',
 'love',
 'love',
 'doth',
 'strait',
 'doth',
 'seek',
 'think',
 'live',
 'day',
 'bene',
 'thy',
 'doth',
 'arise',
 'note',
 'made',
 'come',
 'sun',
 'thou',
 'like',
 'thy',
 'thou',
 'thy',
 'love',
 'lullaby',
 'heart',
 'thy',
 'thou',
 'love',
 'poet',
 'might',
 'one',
 'much',
 'mee',
 'theyr',
 'hee',
 'thy',
 'love',
 'eye',
 'thy',
 'see',
 'pure',
 'sometime',
 'love',
 'youth',
 'thing',
 'happy',
 'new',
 'lyke',
 'made',
 'new',
 'ye',
 'love',
 'ye',
 'fayre',
 'absence',
 'world',
 'earth',
 'loe',


In [12]:
# New column for the mostly used words in each content
df["most_word"]=cont_words
df.head()

Unnamed: 0,author,content,poem name,age,type,most_word
0,WILLIAM SHAKESPEARE,let bird loudest lay sole arabian tree herald ...,phoenix turtle,Renaissance,Mythology & Folklore,love
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,sir chamber coming writing fairy queen praysai...,epilogue,Renaissance,Mythology & Folklore,queen
2,THOMAS BASTARD,vice run beyond old men saw far authentically ...,book epigram,Renaissance,Mythology & Folklore,vice
3,EDMUND SPENSER,lo man whose muse whilome maske time taught lo...,faerie queene book canto,Renaissance,Mythology & Folklore,knight
4,RICHARD BARNFIELD,long longd see love againe still wisht never c...,sonnet,Renaissance,Mythology & Folklore,see


In [13]:
df.most_word.value_counts()
# 252 distinct words, which means some words can be most common in several contents

love       57
thy        31
thou       24
michael    10
ezra        9
           ..
house       1
disdain     1
bronze      1
shadow      1
kiss        1
Name: most_word, Length: 252, dtype: int64

In [14]:
# Counting the most common word in the names of poems for each author
counter = Counter()
name_words=[]
aut_poems={}
for i in df.author.value_counts().index:
    aut_poems[i]=[]

for i in df.author.value_counts().index:
    dff=df[df["author"]==i]["poem name"]
    dff.str.split().apply(counter.update)
    aut_poems[i].extend([counter.most_common()[0][0]])
    counter.clear()

In [15]:
# Each author has most common word that is used in the naming of poems
aut_poems

{'WILLIAM SHAKESPEARE': ['sonnet'],
 'SIR PHILIP SIDNEY': ['stella'],
 'JOHN DONNE': ['elegy'],
 'EDMUND SPENSER': ['amoretti'],
 'WILLIAM BUTLER YEATS': ['song'],
 'SIR THOMAS WYATT': ['love'],
 'EZRA POUND': ['canto'],
 'CARL SANDBURG': ['momus'],
 'THOMAS CAMPION': ['follow'],
 'WALLACE STEVENS': ['idea'],
 'HART CRANE': ['bridge'],
 'SARA TEASDALE': ['since'],
 'D. H. LAWRENCE': ['love'],
 'EN JONSON': ['celia'],
 'PAUL LAURENCE DUNBAR': ['love'],
 'IVOR GURNEY': ['england'],
 'LOUISE BOGAN': ['song'],
 'EDGAR LEE MASTERS': ['sarah'],
 'MICHAEL ANANIA': ['motet'],
 'SIR WALTER RALEGH': ['praisd'],
 'LADY MARY WROTH': ['pamphilia'],
 'MARJORIE PICKTHALL': ['wife'],
 'ARCHIBALD MACLEISH': ['ancestral'],
 'HUGH MACDIARMID': ['gairmscoile'],
 'SAMUEL DANIEL': ['delia'],
 'ELINOR WYLIE': ['full'],
 'QUEEN ELIZABETH I': ['fair'],
 'CHRISTOPHER MARLOWE': ['hero'],
 'KENNETH SLESSOR': ['ac'],
 'LOUIS UNTERMEYER': ['magic'],
 'E. E. CUMMINGS': ['carry'],
 'THOMAS BASTARD': ['book'],
 'GEORG

In [16]:
# New column poem_name_word -- for the most used word in poem names for each author
df["poem_name_word"]="0"
for i in aut_poems.keys():
    df.loc[df["author"]==i,"poem_name_word"]=aut_poems[i][0]


In [17]:
# New column poem_check -- if the word in poem_name_word is used in the poem_name
df["poem_check"]="0"
for i in df.author.value_counts().index:
    df.loc[df['author']==i,"poem_check"]=df.loc[df['author']==i, 'poem name'].str.contains(aut_poems[i][0]) 


In [18]:
df.head()

Unnamed: 0,author,content,poem name,age,type,most_word,poem_name_word,poem_check
0,WILLIAM SHAKESPEARE,let bird loudest lay sole arabian tree herald ...,phoenix turtle,Renaissance,Mythology & Folklore,love,sonnet,False
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,sir chamber coming writing fairy queen praysai...,epilogue,Renaissance,Mythology & Folklore,queen,epilogue,True
2,THOMAS BASTARD,vice run beyond old men saw far authentically ...,book epigram,Renaissance,Mythology & Folklore,vice,book,True
3,EDMUND SPENSER,lo man whose muse whilome maske time taught lo...,faerie queene book canto,Renaissance,Mythology & Folklore,knight,amoretti,False
4,RICHARD BARNFIELD,long longd see love againe still wisht never c...,sonnet,Renaissance,Mythology & Folklore,see,sonnet,True


In [19]:
#Showing the number of each poem in dataframe
df["poem name"].value_counts()

pamphilia amphilanthus                             6
sonnet                                             5
                                                   4
canto iv                                           3
astrophil stella sad step moon thou climbst sky    3
                                                  ..
amoretti lxvi happy blessing ye                    1
goodmorrow                                         1
ala madam stealing kiss                            1
political prisoner                                 1
carib isle                                         1
Name: poem name, Length: 495, dtype: int64

In [20]:
# Number of each content in df
con_val=df.content.value_counts() 

In [21]:
# New column num_content -- assigning the number of each content in dataframe
df["num_content"]=0
for i in range(len(con_val)):
    df.loc[df['content']==con_val.index[i],"num_content"]=con_val.values[i]


In [22]:
df.head()

Unnamed: 0,author,content,poem name,age,type,most_word,poem_name_word,poem_check,num_content
0,WILLIAM SHAKESPEARE,let bird loudest lay sole arabian tree herald ...,phoenix turtle,Renaissance,Mythology & Folklore,love,sonnet,False,2
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,sir chamber coming writing fairy queen praysai...,epilogue,Renaissance,Mythology & Folklore,queen,epilogue,True,1
2,THOMAS BASTARD,vice run beyond old men saw far authentically ...,book epigram,Renaissance,Mythology & Folklore,vice,book,True,1
3,EDMUND SPENSER,lo man whose muse whilome maske time taught lo...,faerie queene book canto,Renaissance,Mythology & Folklore,knight,amoretti,False,1
4,RICHARD BARNFIELD,long longd see love againe still wisht never c...,sonnet,Renaissance,Mythology & Folklore,see,sonnet,True,2


In [23]:
# Dropping content and poem name columns to avoid overfitting as most of their values are unique
df.drop(["content","poem name"],axis=1,inplace=True)


In [24]:
df.shape #Shape of dataframe after processing

(571, 7)

# Modeling

In [25]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMRegressor
import re
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

from sklearn .preprocessing import LabelEncoder

from sklearn.neural_network import MLPClassifier

In [26]:
#Encoding author column for prediction
df["author"]=LabelEncoder().fit_transform(df['author'])

In [27]:
# Method for classification models
def class_model(X, Y, algo, test_size=0.20):
    
    X_train, X_test, Y_train, Y_test=train_test_split(X, Y, test_size=test_size, random_state=42)
    algo.fit(X_train, Y_train)
    Y_train_pred=algo.predict(X_train)

    print(type(algo).__name__)
    Y_test_pred=algo.predict(X_test)
    test_acc=accuracy_score(Y_test, Y_test_pred)
    print("Test Score: {}".format(test_acc))
    print('###################################')
    return (type(algo).__name__, test_acc)

In [28]:
X=df.drop("author",axis=1)
Y=df[["author"]]

X=pd.get_dummies(X,drop_first=True)


In [29]:
X.shape #New shape of X 

(571, 313)

In [30]:
#Models of classification
models=[KNeighborsClassifier(),
        RandomForestClassifier(),
        svm.SVC(gamma="scale"),
        DecisionTreeClassifier(),
        LogisticRegression(),
        GaussianNB(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        XGBClassifier(),
        svm.LinearSVC(),
        MLPClassifier(hidden_layer_sizes=(100),max_iter=400)]
results={ 'model':[], 'test_score':[]}

In [31]:
for m in models:
    model,score=class_model(X,Y,m)
    results["model"].append(model)
    results["test_score"].append(score)

KNeighborsClassifier
Test Score: 0.808695652173913
###################################
RandomForestClassifier
Test Score: 0.9391304347826087
###################################
SVC
Test Score: 0.8782608695652174
###################################
DecisionTreeClassifier
Test Score: 0.9043478260869565
###################################
LogisticRegression
Test Score: 0.9043478260869565
###################################
GaussianNB
Test Score: 0.9304347826086956
###################################
AdaBoostClassifier
Test Score: 0.5130434782608696
###################################
GradientBoostingClassifier
Test Score: 0.9391304347826087
###################################
XGBClassifier
Test Score: 0.8695652173913043
###################################
LinearSVC
Test Score: 0.9478260869565217
###################################
MLPClassifier
Test Score: 0.9478260869565217
###################################


In [32]:
results_df=pd.DataFrame(results)
results_df.sort_values("test_score",ascending=False)

Unnamed: 0,model,test_score
9,LinearSVC,0.947826
10,MLPClassifier,0.947826
1,RandomForestClassifier,0.93913
7,GradientBoostingClassifier,0.93913
5,GaussianNB,0.930435
3,DecisionTreeClassifier,0.904348
4,LogisticRegression,0.904348
2,SVC,0.878261
8,XGBClassifier,0.869565
0,KNeighborsClassifier,0.808696


# Hypertuning

In [33]:
# Hyperparameters of some models
model_parameters={
    'Support Vector Machine': {
        'model': svm.SVC(gamma='scale'),
        'params': {'C': list(range(0,10))+ [0.1,0.5,0.001,0.0001],
                   'kernel': ['rbf','linear', 'poly']}},
    'Random Forest Classifier': {
        'model': RandomForestClassifier(),
        'params': {'n_estimators': list(range(10,30,5)),
                   'verbose': [0, 1],
                   'criterion': ['gini', 'entropy']}},
    'XGBoost Classifier': {
        'model': XGBClassifier(verbosity = 0),
        'params': {'gamma': list(range(0, 3)), 
                   'max_depth': list(range(1, 10)),
                   'eta': [0,1, 0.3, 0.5, 0.7, 1]}},
}

In [34]:
# Method for tuning with GridSearchCV
def classification_models(X,Y, model_params):
    
    results=[]
    index=0
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=33)
    for model_name, mp in model_params.items():
        
        clf=GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=True)        
        
        clf.fit(X_train, Y_train)
        Y_train_pred=clf.predict(X_test)
        print("#########################")
        print("Model:: ",model_name)
        print("Score:: ",clf.best_score_)
        index+=1
        results.append({"Model":model_name,"Score":clf.best_score_,"Params":clf.best_params_})

    return results

In [35]:
results_tuned=classification_models(X,Y,model_parameters)

#########################
Model::  Support Vector Machine
Score::  0.9078595317725753


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Us

#########################
Model::  Random Forest Classifier
Score::  0.9100573339703775
#########################
Model::  XGBoost Classifier
Score::  0.802627806975633


In [36]:
# Results after tuning
results_tuned

[{'Model': 'Support Vector Machine',
  'Score': 0.9078595317725753,
  'Params': {'C': 1, 'kernel': 'linear'}},
 {'Model': 'Random Forest Classifier',
  'Score': 0.9100573339703775,
  'Params': {'criterion': 'gini', 'n_estimators': 20, 'verbose': 1}},
 {'Model': 'XGBoost Classifier',
  'Score': 0.802627806975633,
  'Params': {'eta': 0.5, 'gamma': 0, 'max_depth': 2}}]