In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# 1. Import data

In [2]:
# Training split: read the CSV and show the resulting frame.

poem_train = pd.read_csv(
    filepath_or_buffer='../input/poem-classification-nlp/Poem_classification - train_data.csv'
)
poem_train

# 841 rows × 2 columns

Unnamed: 0,Genre,Poem
0,Music,
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...
...,...,...
836,Environment,Why make so much of fragmentary blue In here a...
837,Environment,"Woman, I wish I didn't know your name. What co..."
838,Environment,"Yonder to the kiosk, beside the creek, Paddle ..."
839,Environment,You come to fetch me from my work to-night Whe...


In [3]:
# Count the missing values in each column of the training frame.
poem_train.isna().sum()

Genre    0
Poem     4
dtype: int64

In [4]:
# Drop every row that contains a missing value, then confirm no nulls remain.

poem_train = poem_train.dropna(axis='index', how='any')
poem_train.isna().sum()

Genre    0
Poem     0
dtype: int64

In [5]:
# Show the training frame again now that rows with missing values are gone.
poem_train

# 837 rows × 2 columns

Unnamed: 0,Genre,Poem
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...
5,Music,"for Bob Marley, Bavaria, November 1980 Here i..."
...,...,...
836,Environment,Why make so much of fragmentary blue In here a...
837,Environment,"Woman, I wish I didn't know your name. What co..."
838,Environment,"Yonder to the kiosk, beside the creek, Paddle ..."
839,Environment,You come to fetch me from my work to-night Whe...


In [6]:
# Test split: read the CSV and show the resulting frame.

poem_test = pd.read_csv(
    filepath_or_buffer='../input/poem-classification-nlp/Poem_classification - test_data.csv'
)
poem_test

Unnamed: 0,Genre,Poem
0,Music,A woman walks by the bench I’m sitting onwith ...
1,Music,"Because I am a boy, the untouchability of beau..."
2,Music,"Because today we did not leave this world,We n..."
3,Music,"Big Bend has been here, been here. Shouldn’t i..."
4,Music,"I put shells there, along the lip of the road...."
...,...,...
145,Environment,"To pick a tulip from the garden, the red one. ..."
146,Environment,We are as clouds that veil the midnight moon; ...
147,Environment,"When pulled, the spider web took another form...."
148,Environment,Whose woods these are I think I know. His hous...


In [7]:
# Count the missing values in each column of the test frame.
poem_test.isna().sum()

Genre    0
Poem     0
dtype: int64

# 2. Split data into X and y

In [8]:
# Number of training poems per genre.
poem_train['Genre'].value_counts()

Music          238
Death          231
Environment    227
Affection      141
Name: Genre, dtype: int64

In [9]:
# Number of test poems per genre.
poem_test['Genre'].value_counts()

Affection      100
Environment     25
Death           13
Music           12
Name: Genre, dtype: int64

In [10]:
# Training inputs: the poem text column.
X_train = poem_train.Poem
X_train

1                    In the thick brushthey spend the...
2         Storms are generous.                       ...
3       —After Ana Mendieta Did you carry around the ...
4       for Aja Sherrard at 20The portent may itself ...
5       for Bob Marley, Bavaria, November 1980 Here i...
                             ...                        
836    Why make so much of fragmentary blue In here a...
837    Woman, I wish I didn't know your name. What co...
838    Yonder to the kiosk, beside the creek, Paddle ...
839    You come to fetch me from my work to-night Whe...
840    You see them through water and glass, (both li...
Name: Poem, Length: 837, dtype: object

In [11]:
# Training targets: the genre label column.
y_train = poem_train.Genre
y_train

1            Music
2            Music
3            Music
4            Music
5            Music
          ...     
836    Environment
837    Environment
838    Environment
839    Environment
840    Environment
Name: Genre, Length: 837, dtype: object

In [12]:
# Test inputs: the poem text column.
X_test = poem_test['Poem']
X_test

0      A woman walks by the bench I’m sitting onwith ...
1      Because I am a boy, the untouchability of beau...
2      Because today we did not leave this world,We n...
3      Big Bend has been here, been here. Shouldn’t i...
4      I put shells there, along the lip of the road....
                             ...                        
145    To pick a tulip from the garden, the red one. ...
146    We are as clouds that veil the midnight moon; ...
147    When pulled, the spider web took another form....
148    Whose woods these are I think I know. His hous...
149    you can make the maples blazejust by stopping ...
Name: Poem, Length: 150, dtype: object

In [13]:
# Test targets: the genre label column.
y_test = poem_test['Genre']
y_test

0            Music
1            Music
2            Music
3            Music
4            Music
          ...     
145    Environment
146    Environment
147    Environment
148    Environment
149    Environment
Name: Genre, Length: 150, dtype: object

# 3. Feature vectorization with CountVectorizer

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: English stop words are removed, and any term found in
# more than 100 training documents is dropped (an integer max_df is an absolute
# document count).
# Remaining settings keep their defaults: lowercase=True, analyzer='word', ngram_range=(1, 1)
count_vector = CountVectorizer(max_df=100, stop_words='english')

# Learn the vocabulary from the training poems only, then reuse it for the test poems.
X_train_count_vector = count_vector.fit_transform(X_train)
X_test_count_vector = count_vector.transform(X_test)

X_train_count_vector.shape, X_test_count_vector.shape

((837, 8169), (150, 8169))

In [15]:
# Display the repr of the training count-feature matrix.
X_train_count_vector

<837x8169 sparse matrix of type '<class 'numpy.int64'>'
	with 18814 stored elements in Compressed Sparse Row format>

In [16]:
# Display the repr of the test count-feature matrix.
X_test_count_vector

<150x8169 sparse matrix of type '<class 'numpy.int64'>'
	with 2522 stored elements in Compressed Sparse Row format>

# 4. LogisticRegression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression on the count features, with the iteration cap
# raised to 5000.
lr_clf = LogisticRegression(max_iter=5000).fit(X_train_count_vector, y_train)
pred = lr_clf.predict(X_test_count_vector)

accuracy_score(y_true=y_test, y_pred=pred)

0.34

In [18]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

In [19]:
# Confusion matrix: rows are the true genres, columns the predicted ones.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[25 23 21 31]
 [ 2  3  5  3]
 [ 1  6 14  4]
 [ 0  0  3  9]]


In [20]:
# Per-genre precision, recall and F1 for the current predictions.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

   Affection       0.89      0.25      0.39       100
       Death       0.09      0.23      0.13        13
 Environment       0.33      0.56      0.41        25
       Music       0.19      0.75      0.31        12

    accuracy                           0.34       150
   macro avg       0.38      0.45      0.31       150
weighted avg       0.67      0.34      0.37       150



# 5. Feature vectorization with TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted features with the same filtering as the count features:
# English stop words removed, terms in more than 100 training documents dropped.
# Remaining settings keep their defaults: lowercase=True, analyzer='word', ngram_range=(1, 1)
tfidf_vector = TfidfVectorizer(max_df=100, stop_words='english')

# Fit the vocabulary and IDF weights on the training poems only, then apply
# them unchanged to the test poems.
X_train_tfidf_vector = tfidf_vector.fit_transform(X_train)
X_test_tfidf_vector = tfidf_vector.transform(X_test)

X_train_tfidf_vector.shape, X_test_tfidf_vector.shape

((837, 8169), (150, 8169))

In [22]:
# Same logistic regression setup, this time trained on the TF-IDF features.
lr_clf_2 = LogisticRegression(max_iter=5000).fit(X_train_tfidf_vector, y_train)
pred = lr_clf_2.predict(X_test_tfidf_vector)

accuracy_score(y_true=y_test, y_pred=pred)

0.28

# 6. Logistic regression tuning with GridSearchCV

In [23]:
from sklearn.model_selection import GridSearchCV

# Candidate values for C, the inverse regularisation strength of the
# logistic regression, scored by 10-fold cross-validated accuracy.
# NOTE(review): the count features were fitted on the full training set before
# this search, so every validation fold already influenced the vocabulary —
# consider searching over a Pipeline that includes the vectoriser.
params = {'C': [0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(
    estimator=lr_clf,
    param_grid=params,
    scoring='accuracy',
    cv=10,
    verbose=1,
)
grid_cv_lr.fit(X_train_count_vector, y_train)
grid_cv_lr.best_params_

Fitting 10 folds for each of 5 candidates, totalling 50 fits


{'C': 0.1}

In [24]:
# Score the grid search's refitted best model on the held-out test features.
pred = grid_cv_lr.predict(X_test_count_vector)
accuracy_score(y_true=y_test, y_pred=pred)

0.3333333333333333

# 7. Pipeline

In [25]:
from sklearn.pipeline import Pipeline

# Vectoriser + classifier bundled into a single estimator, so raw poem text can
# be passed straight to fit/predict.
# max_iter=5000 matches the standalone LogisticRegression models fitted earlier
# in this notebook; the library default (100 iterations) may stop before the
# solver has converged.
pipeline = Pipeline([('count_vector', CountVectorizer(stop_words='english', max_df=100)),
                    ('lr_clf', LogisticRegression(max_iter=5000))])

In [26]:
# Fit on the raw training poems and score on the raw test poems.
pred = pipeline.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=pred)

0.34

# 8. SVC

In [27]:
from sklearn.svm import SVC

# Build the linear-kernel SVM once and hand that same instance to the pipeline,
# so `linearsvc` is the estimator that actually gets fitted and can be
# inspected after training.
linearsvc = SVC(kernel='linear')

pipeline = Pipeline([('count_vector', CountVectorizer(stop_words='english', max_df=100)),
                    ('linearsvc', linearsvc)])

# Train on the raw poems and evaluate accuracy on the test split.
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

accuracy_score(y_test, pred)

0.3333333333333333

# 9. Multinomial Naive Bayes

In [28]:
from sklearn.naive_bayes import MultinomialNB

# Create the Naive Bayes classifier once and reuse that instance as the
# pipeline step, so `mnb` refers to the model that is actually trained.
mnb = MultinomialNB()

pipeline = Pipeline([('count_vector', CountVectorizer(stop_words='english', max_df=100)),
                    ('mnb', mnb)])

# Train on the raw poems and evaluate accuracy on the test split.
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

accuracy_score(y_test, pred)


0.2866666666666667

# 10. MLP

In [29]:
from sklearn.neural_network import MLPClassifier

# Three hidden layers of 10 units each; random_state is fixed so the weight
# initialisation is repeatable. The hyperparameters are stated once and the
# same instance is used as the pipeline step, so they cannot drift apart.
mlp = MLPClassifier(hidden_layer_sizes=(10,10,10), max_iter=5000, random_state=2211)

pipeline = Pipeline([('count_vector', CountVectorizer(stop_words='english', max_df=100)),
                    ('mlp', mlp)])

# Train on the raw poems and evaluate accuracy on the test split.
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

accuracy_score(y_test, pred)

0.31333333333333335

> This notebook will continue to be updated. Any comments would be appreciated.