In [None]:
!pip install Keras-Preprocessing
!!pip install keras-tuner --upgrade



['Collecting keras-tuner',
 '  Downloading keras_tuner-1.4.6-py3-none-any.whl (128 kB)',
 '\x1b[?25l     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.0/128.9 kB\x1b[0m \x1b[31m?\x1b[0m eta \x1b[36m-:--:--\x1b[0m',
 '\x1b[2K     \x1b[91m━━━\x1b[0m\x1b[90m╺\x1b[0m\x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m10.2/128.9 kB\x1b[0m \x1b[31m?\x1b[0m eta \x1b[36m-:--:--\x1b[0m',
 '\x1b[2K     \x1b[91m━━━━━━━━━━━━\x1b[0m\x1b[91m╸\x1b[0m\x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m41.0/128.9 kB\x1b[0m \x1b[31m526.3 kB/s\x1b[0m eta \x1b[36m0:00:01\x1b[0m',
 '\x1b[2K     \x1b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m\x1b[91m╸\x1b[0m\x1b[90m━━━━━━━━━━━\x1b[0m \x1b[32m92.2/128.9 kB\x1b[0m \x1b[31m892.2 kB/s\x1b[0m eta \x1b[36m0:00:01\x1b[0m',
 '\x1b[2K     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m128.9/128.9 kB\x1b[0m \x1b[31m1.1 MB/s\x1b[0m eta \x1b[36m0:00:00\x1b[0m',
 'Collecting kt-legacy (from keras-tuner)',
 '  Downloading kt_legacy

In [30]:
# import libraries
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

In [32]:
# import data and visualize the first 5 rows
poems_df = pd.read_csv('https://raw.githubusercontent.com/BhavanishDhamnaskar/poem_d/main/Poem_classification%20-%20train_data.csv')
poems_df.head()

Unnamed: 0,Genre,Poem
0,Music,
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...


In [33]:
print("Dataset size: ", len(poems_df))
print("Dataset distinct labels: ", set(poems_df['Genre']))

poems_df = poems_df.dropna()

Dataset size:  841
Dataset distinct labels:  {'Environment', 'Death', 'Music', 'Affection'}


In [34]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [40]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [41]:
poems = poems_df['Poem'].to_list()

cleaned_poems_tokens = []
cleaned_poems = []

for poem in poems:
    # tokenize
    tokens = word_tokenize(poem)

    #remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    #stemming
    stemmer = SnowballStemmer('english')
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    # remove digits and punctuation
    cleaned_tokens = [token for token in stemmed_tokens if not token.isdigit()
                      and not token in string.punctuation
                      and not token == '’']

    # convert all tokens to lowercase
    lowercase_tokens = [token.lower() for token in cleaned_tokens]

    cleaned_poems_tokens.append(lowercase_tokens)
    cleaned_poems.append(' '.join(lowercase_tokens))

In [42]:
print(f"Token before cleaning {poems[0]}")
print(f"Token after cleaning {cleaned_poems[0]}")

Token before cleaning               In the thick brushthey spend the hottest part of the day,              soaking their hoovesin the trickle of mountain water              the ravine hoardson behalf of the oleander.           
Token after cleaning thick brushthey spend hottest part day soak hoovesin trickl mountain water ravin hoardson behalf oleand


# Bag of words

In [43]:
# implementation using sklearn
countvectorizer = CountVectorizer()

X = countvectorizer.fit_transform(cleaned_poems)
print("Tokens: ", countvectorizer.get_feature_names_out())
print("First poem: ", X[0, :].toarray())

Tokens:  ['00' '000' '10maggi' ... 'zithers' 'zoom' 'ȟe']
First poem:  [[0 0 0 ... 0 0 0]]


In [44]:
# manual implementation

# any token is added to the the set features_names
unique_words = set(word for poem in cleaned_poems for word in poem.split(' '))
word_indices = {word: idx for idx, word in enumerate(unique_words)}

N = len(cleaned_poems)

# we create a matrix with dimensions len(cleaned_poems) x len(unique_words)
n = len(unique_words)
occurence_matrix = np.zeros(shape=[N, n])

# iterate through sentences
for i, poem in enumerate(cleaned_poems):
    #iterate thorugh words
    for word in poem.split(' '):
        # get index of word -> go to the ith row and word index -> add 1 occurence
        occurence_matrix[i][word_indices.get(word)] += 1

# TF IDF

In [45]:
tfidfvectorizer = TfidfVectorizer()

X = tfidfvectorizer.fit_transform(cleaned_poems)
print("Tokens: ", tfidfvectorizer.get_feature_names_out())
print("First poem: ", X[0, :].toarray())

Tokens:  ['00' '000' '10maggi' ... 'zithers' 'zoom' 'ȟe']
First poem:  [[0. 0. 0. ... 0. 0. 0.]]


In [46]:
N = len(cleaned_poems)

unique_words = set(x for string in cleaned_poems for x in string.split(' '))
word_indices = {word: idx for idx,word in enumerate(unique_words)}

n = len(unique_words)
tfidf = np.zeros(shape =(N, n))

idf_dict = {}

for i, doc in enumerate(cleaned_poems):
    for word in doc.split(' '):
        tf = sum([1
                  for x in doc.split(' ')
                  if x==word ]) / len(doc.split(' '))
        if word not in idf_dict.keys():
            idf = np.log(N / (sum([1
                                   for doc in cleaned_poems
                                   if word in doc]) ))
            idf_dict[word] = idf
        else:
            idf = idf_dict.get(word)
        tfidf[i][word_indices.get(word)] = tf * idf

# Word Embeddings

In [47]:
word2vec_model = Word2Vec(sentences=cleaned_poems_tokens, min_count=1,vector_size=256,workers=4)

In [48]:
# get vector for the word day
word2vec_model.wv['day']

array([-2.1482459e-03, -7.1942899e-04,  2.1145211e-03,  2.7396015e-03,
        8.3884336e-03,  1.0802037e-03,  3.8298909e-04, -1.2815304e-03,
        1.8983154e-03,  5.8326148e-03,  1.5114982e-03,  1.0367922e-03,
        1.2336056e-03, -5.3333249e-03, -1.4277099e-03,  2.4967007e-03,
        5.3085055e-04,  5.0373393e-04, -7.4451328e-03,  8.6177271e-03,
       -1.2572737e-03, -3.3987197e-03, -2.8812063e-03, -5.1330547e-03,
       -7.8817615e-03,  1.6567918e-03, -2.5350952e-03, -6.8371417e-04,
       -3.7743626e-03,  1.4064010e-04,  4.7581038e-03,  1.5525005e-03,
       -2.4222052e-03, -6.8619923e-04,  2.7890790e-03,  5.4956409e-03,
       -2.5214109e-04, -4.0099216e-03,  1.5505055e-03, -9.7555853e-04,
       -6.3386597e-03,  4.5747431e-03, -3.6611338e-03, -4.6302588e-03,
        6.8666809e-03, -3.4133403e-03,  2.1120028e-03,  1.5128995e-03,
        1.0669096e-03,  4.2499760e-03,  2.0155229e-04,  4.4883075e-03,
        3.2189898e-03, -1.0341991e-03, -6.4875763e-03,  4.1197415e-04,
      

In [49]:
word2vec_model.wv.most_similar('day')

[('like', 0.6322932839393616),
 ("'s", 0.6037131547927856),
 ('one', 0.6028529405593872),
 ('thing', 0.5863050222396851),
 ('us', 0.5651578903198242),
 ('leav', 0.5567163228988647),
 ('air', 0.5560057163238525),
 ('water', 0.5543666481971741),
 ('dark', 0.5540421009063721),
 ('tree', 0.5534077286720276)]