In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Loading Data
data = pd.read_csv('poems.csv')
data.head()

Unnamed: 0,Poem,Genre
0,On moonlit heath and lonesome bank The sheep b...,Death
1,"“Lights out"" along the land, “Lights out” upon...",Death
2,I am a garden of red tulips And late daffodils...,Death
3,"New moon in midheaven, in Libra.The hermit wie...",Death
4,All the hills and vales along Earth is burstin...,Death


In [3]:
data['Genre'].value_counts()

Audio     254
Nature    253
Death     250
Love      242
Name: Genre, dtype: int64

In [4]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop = stopwords.words('english')
porter_stemmer = PorterStemmer()

# Removing stop words from poems
data['FilterPoem'] = data['Poem'].str.split(' ').apply(lambda x: ' '.join(k for k in x if k not in stop))

# Converting poems to lowercase
data['FilterPoem'] = data['FilterPoem'].apply(lambda x: x.lower())

# Removing punctuations and all digits from poems
filterString = string.punctuation + '“”|”' + string.digits
data['FilterPoem'] = data['FilterPoem'].apply(lambda x: x.translate(str.maketrans(filterString,' '*len(filterString),'')))
data['FilterPoem'] = data['FilterPoem'].replace('\s+', ' ', regex=True)

# Stemming all words in the poems
data['FilterPoem'] = data['FilterPoem'].apply(lambda x: ' '.join([porter_stemmer.stem(y) if y not in stop 
                                                                    else y for y in x.split()]))

data.head()

Unnamed: 0,Poem,Genre,FilterPoem
0,On moonlit heath and lonesome bank The sheep b...,Death,on moonlit heath lonesom bank the sheep besid ...
1,"“Lights out"" along the land, “Lights out” upon...",Death,light out along land light out upon sea the ni...
2,I am a garden of red tulips And late daffodils...,Death,i garden red tulip and late daffodil bay hedg ...
3,"New moon in midheaven, in Libra.The hermit wie...",Death,new moon midheaven libra the hermit wield two ...
4,All the hills and vales along Earth is burstin...,Death,all hill vale along earth burst song and singe...


In [5]:
from wordcloud import WordCloud
from ipywidgets import interact

genre_list = ['Death', 'Audio', 'Nature', 'Love']

@interact
def plot_word_cloud(genre=genre_list):
    
    sample_data = data[data['Genre'] == genre]
    text = ' '.join(sample_data['Poem'].tolist())
    wordcloud = WordCloud(max_font_size=60, stopwords=stop).generate(text)
        
    plt.figure(figsize=(12,12))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title('WordCloud for {}'.format(genre))

interactive(children=(Dropdown(description='genre', options=('Death', 'Audio', 'Nature', 'Love'), value='Death…

In [6]:
X = data['FilterPoem']
y = data['Genre']

In [7]:
# Importing required Libraries
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.30,random_state = 0)

# Vectorization-->
vector = CountVectorizer(stop_words = 'english')

In [9]:
dtm1 = vector.fit_transform(X_train)
dtm2 = vector.transform(X_test)

In [10]:
#n_components = 4 for 4 genres
PCA = KernelPCA(n_components = 4)

dm1 = PCA.fit_transform(dtm1)
dm2 = PCA.transform(dtm2)

In [11]:
# Prediction of classes according to PCA
Y_train = dm1.argmax(axis = 1)
Y_test = dm2.argmax(axis = 1)

In [14]:
LRClassifier = Pipeline([
                ("Tfidf", TfidfVectorizer(min_df = 2,max_df = 0.95)),
                ("ToDense",FunctionTransformer(lambda x: x.todense(), accept_sparse=True, validate=False)),
                ("Classifier",LogisticRegression())
])

LRClassifier.fit(X_train,Y_train)

Pipeline(memory=None,
     steps=[('Tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [15]:
LRPred = LRClassifier.predict(X_test)

print(f"Accuracy (Logistic Regression): {accuracy_score(Y_test,LRPred) * 100}%")

Accuracy (Logistic Regression): 70.33333333333334%


In [None]:
SGDClassifier = Pipeline([
                ("Tfidf", TfidfVectorizer(min_df = 2,max_df = 0.95)),
                ("ToDense",FunctionTransformer(lambda x: x.todense(), accept_sparse=True, validate=False)),
                ("Classifier",SGDClassifier(max_iter=5, tol=None))
])

SGDClassifier.fit(X_train,Y_train)

In [None]:
SGDPred = SGDClassifier.predict(X_test)

print(f'Accuracy (SGDClassifier): {accuracy_score(Y_test,SGDPred) * 100}%')