In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Ignoring deprecation warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Loading the csv
data = pd.read_csv('poem_dataset.csv')
data.head()

Unnamed: 0,poem,theme
0,"I was sympathetic to language, but often it sh...",Love
1,"When, at the end, the children wanted to add ...",Love
2,turns out there are more planets than stars mo...,Love
3,We walk through clouds wrapped in ancient symb...,Love
4,Love me stupid. Love me terrible. And when I a...,Love


In [3]:
# Counting no. of data for each genre
data['theme'].value_counts()

Love        367
Nature      362
Death       336
Identity    292
Name: theme, dtype: int64

In [4]:
import string
from nltk.corpus import stopwords
from textblob import Word

stop = stopwords.words('english')
newStopWord = ['like','yet']
stop.extend(newStopWord)

# Removing stop words from poems
data['FilterPoem'] = data['poem'].str.split(' ').apply(lambda x: ' '.join(k for k in x if k not in stop))

# Converting poems to lowercase
data['FilterPoem'] = data['FilterPoem'].apply(lambda x: x.lower())

# Removing punctuations and all digits from poems
filterString = string.punctuation + '“”|”' + string.digits
data['FilterPoem'] = data['FilterPoem'].apply(lambda x: x.translate(str.maketrans(filterString,' '*len(filterString),'')))
data['FilterPoem'] = data['FilterPoem'].replace('\s+', ' ', regex=True)

# Lemmatizing all words in the poems
data['FilterPoem'] = data['FilterPoem'].apply(lambda x: "".join([Word(word).lemmatize() for word in x]))


data.head()

Unnamed: 0,poem,theme,FilterPoem
0,"I was sympathetic to language, but often it sh...",Love,i sympathetic language often shrugged kept lov...
1,"When, at the end, the children wanted to add ...",Love,when end children wanted add glitter valentine...
2,turns out there are more planets than stars mo...,Love,turns planets stars places land burned i alway...
3,We walk through clouds wrapped in ancient symb...,Love,we walk clouds wrapped ancient symbols we desc...
4,Love me stupid. Love me terrible. And when I a...,Love,love stupid love terrible and i mountain rathe...


In [5]:
from wordcloud import WordCloud
from ipywidgets import interact

theme_list = ['Love', 'Nature', 'Death', 'Identity']

# Generating wordcloud for visualization of highly recurring words for each theme
@interact
def plot_word_cloud(theme=theme_list):
    
    sample_data = data[data['theme'] == theme]
    text = ' '.join(sample_data['FilterPoem'].tolist())
    wordcloud = WordCloud(max_font_size=60, stopwords=stop).generate(text)
        
    plt.figure(figsize=(12,12))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title('WordCloud for {}'.format(theme))

interactive(children=(Dropdown(description='theme', options=('Love', 'Nature', 'Death', 'Identity'), value='Lo…

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
vector = CountVectorizer(min_df = 2,max_df = 0.95, stop_words=stop)

# Defining X and y for the data
X = data['FilterPoem']
y = data['theme']

# Generating train and test set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.30,random_state = 0)

X_train_dtm = vector.fit_transform(X_train)
X_test_dtm = vector.transform(X_test)

In [8]:
# Decomposing the matrix for reducing high dimensionality
PCA = KernelPCA(n_components = 4)

X_train_dm = PCA.fit_transform(X_train_dtm)
X_test_dm = PCA.transform(X_test_dtm)

In [9]:
Y_train = X_train_dm.argmax(axis = 1)
Y_test = X_test_dm.argmax(axis = 1)

In [10]:
# Defining Logistic Regression classifier
LRClassifier = make_pipeline(TfidfVectorizer(min_df = 2,max_df = 0.95),
                             FunctionTransformer(lambda x: x.todense(), accept_sparse=True, validate=False),
                             LogisticRegression())

LRClassifier.fit(X_train,Y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smoot...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [11]:
# Predicting the values using Logistic Regression
LRPred = LRClassifier.predict(X_test)

print("Accuracy (Logistic Regression): {}%".format(accuracy_score(Y_test,LRPred) * 100))

Accuracy (Logistic Regression): 65.19607843137256%


In [12]:
# Defining Bernoulli Naive Bayes
BNBClassifier = make_pipeline(TfidfVectorizer(min_df = 2,max_df = 0.95),
                              FunctionTransformer(lambda x: x.todense(), accept_sparse=True, validate=False),
                              BernoulliNB())

BNBClassifier.fit(X_train,Y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smoot...te=False)), ('bernoullinb', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))])

In [13]:
# Predicting the values using Bernoulli Naive Bayes
BNBPred = BNBClassifier.predict(X_test)

print("Accuracy (Bernoulli Naive Bayes): {}%".format(accuracy_score(Y_test,BNBPred) * 100))

Accuracy (Bernoulli Naive Bayes): 71.07843137254902%
