In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Loading Data
data = pd.read_csv('poems.csv')

In [3]:
data = data.sample(frac=1)
data.head()

Unnamed: 0,Poem,Genre
998,All in green went my love ridingon a great hor...,Love
668,Who would decry instruments— when grasses ever...,Nature
104,"Listen, children: Your father is dead. From hi...",Death
967,"Dear pretty youth, unveil your eyes, How can y...",Love
337,makes me think plurality. Maybe I can love you...,Audio


In [4]:
data['Genre'].value_counts()

Audio     254
Nature    253
Death     250
Love      242
Name: Genre, dtype: int64

In [5]:
import nltk
import string

# Function for removing punctuations from poems
def rem_punc(text):
    punctuation = string.punctuation + '“”|”|,'
    for i in punctuation:
        text = text.replace(i,"")
    return text

def to_lower(text):
    return text.lower()

# Converting poems to lowercase
data['Poem'] = data['Poem'].apply(to_lower)

# Removing punctuations from poems
data['Poem'] = data['Poem'].apply(rem_punc)

data.head()

Unnamed: 0,Poem,Genre
998,all in green went my love ridingon a great hor...,Love
668,who would decry instruments— when grasses ever...,Nature
104,listen children your father is dead from his o...,Death
967,dear pretty youth unveil your eyes how can you...,Love
337,makes me think plurality maybe i can love you ...,Audio


In [6]:
from wordcloud import WordCloud
from ipywidgets import interact
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

genre_list = ['Death', 'Audio', 'Nature', 'Love']

@interact
def plot_word_cloud(genre=genre_list):
    
    sample_data = data[data['Genre'] == genre]
    text = ' '.join(sample_data['Poem'].tolist())
    wordcloud = WordCloud(max_font_size=60, stopwords=ENGLISH_STOP_WORDS).generate(text)
        
    plt.figure(figsize=(12,12))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title('WordCloud for {}'.format(genre))

interactive(children=(Dropdown(description='genre', options=('Death', 'Audio', 'Nature', 'Love'), value='Death…

In [7]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [8]:
X = data['Poem'] #Column for Feature
y = data['Genre'] #Column for Prediction

In [9]:
for stop_words in [None, "english"]:

    vect = CountVectorizer(ngram_range=(1,1), stop_words=stop_words)
    data_dtm = vect.fit_transform(X)
    print('The shape of our dtm with stop_words={} is: {}'.format(stop_words, data_dtm.shape))

The shape of our dtm with stop_words=None is: (999, 10510)
The shape of our dtm with stop_words=english is: (999, 10240)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
vect = CountVectorizer(ngram_range=(1,2), stop_words=ENGLISH_STOP_WORDS)

X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

clf = MultinomialNB()

clf.fit(X_train_dtm, y_train)
y_pred = clf.predict(X_test_dtm)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       Audio       0.49      0.25      0.33        76
       Death       0.33      0.53      0.41        53
        Love       0.38      0.34      0.36        62
      Nature       0.38      0.46      0.42        59

   micro avg       0.38      0.38      0.38       250
   macro avg       0.39      0.39      0.38       250
weighted avg       0.40      0.38      0.37       250



In [12]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

lancast_stemmer = LancasterStemmer()
porter_stemmer = PorterStemmer()

In [13]:
data['Lyrics_Lancast_Stem'] = data['Poem'].apply(lambda x: ' '.join([lancast_stemmer.stem(y) if y not in ENGLISH_STOP_WORDS 
                                                                     else y for y in x.split()]))
data['Lyrics_Porter_Stem'] = data['Poem'].apply(lambda x: ' '.join([porter_stemmer.stem(y) if y not in ENGLISH_STOP_WORDS 
                                                                    else y for y in x.split()]))

In [18]:
print('Lancaster Stemming: {}\nPorter Stemming: {}'.format(data.loc[1, 'Lyrics_Lancast_Stem'], data.loc[1, 'Lyrics_Porter_Stem']))

Lancaster Stemming: light out along the land light out upon the sea the night must put her hid hand o’er peac town where childr sleep and peac ship that dark creep across the wav as if they were not freeth dragon of the air the hellhound of the deep lurk and prowl everywhere go for to seek their helpless prey not
Porter Stemming: light out along the land light out upon the sea the night must put her hide hand o’er peac town where children sleep and peac ship that darkli creep across the wave as if they were not freeth dragon of the air the hellhound of the deep lurk and prowl everywhere go forth to seek their helpless prey not


In [19]:
lancast_vect = CountVectorizer(ngram_range=(1,1), stop_words=ENGLISH_STOP_WORDS)
porter_vect = CountVectorizer(ngram_range=(1,1), stop_words=ENGLISH_STOP_WORDS)

data_dtm_lancast = lancast_vect.fit_transform(data['Lyrics_Lancast_Stem'])
data_dtm_porter = porter_vect.fit_transform(data['Lyrics_Porter_Stem'])

print('The shape of our dtm after lancaster stemming is: {}'.format(data_dtm_lancast.shape))
print('The shape of our dtm after porter stemming is: {}'.format(data_dtm_porter.shape))

The shape of our dtm after lancaster stemming is: (999, 7851)
The shape of our dtm after porter stemming is: (999, 8558)


In [20]:

X = data['Lyrics_Lancast_Stem']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=1)

pipe = make_pipeline(CountVectorizer(stop_words=ENGLISH_STOP_WORDS), MultinomialNB())

params = {
    'countvectorizer__ngram_range':[(1,1), (1,2), (1,3)],
    'countvectorizer__min_df':[0, 2, 3, 4, 5, 6, 7, 8]
}

grid = GridSearchCV(pipe, params, cv=5)
grid.fit(X_train, y_train)
print("Best CV Score: {:.2f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_params_))
print("Score for test sample held out: {}".format(accuracy_score(y_test, grid.predict(X_test))))

Best CV Score: 0.42
Best parameters: {'countvectorizer__min_df': 0, 'countvectorizer__ngram_range': (1, 1)}
Score for test sample held out: 0.39




In [21]:
X = data['Lyrics_Porter_Stem']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=1)

pipe = make_pipeline(CountVectorizer(stop_words=ENGLISH_STOP_WORDS), MultinomialNB())

params = {
    'countvectorizer__ngram_range':[(1,1), (1,2), (1,3)],
    'countvectorizer__min_df':[0, 2, 3, 4, 5, 6, 7, 8]
}

grid = GridSearchCV(pipe, params, cv=5)
grid.fit(X_train, y_train)
print("Best CV Score: {:.2f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_params_))
print("Score for test sample held out: {}".format(accuracy_score(y_test, grid.predict(X_test))))

Best CV Score: 0.44
Best parameters: {'countvectorizer__min_df': 5, 'countvectorizer__ngram_range': (1, 1)}
Score for test sample held out: 0.38
