In [122]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import en_core_web_sm
nlp = en_core_web_sm.load()
from sklearn.preprocessing import MinMaxScaler

In [123]:
df = pd.read_csv('Poem_classification - train_data.csv', sep = ',')
df = df.dropna()
df.head()

Unnamed: 0,Genre,Poem
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...
5,Music,"for Bob Marley, Bavaria, November 1980 Here i..."


'Music', 'Death', 'Affection', 'Environment'

In [124]:
df['POS_Tagged_Poem'] = df['Poem'].apply(lambda x: nlp(x))
df['POS_Tagged_Poem'] = df['POS_Tagged_Poem'].apply(lambda x: [token.tag_ for token in x])

In [125]:
parts_of_speech = ['_SP', 'IN', 'DT', 'JJ', 'PRP', 'VBP', 'JJS', 'NN', 'TO', 'VB', 'NNS', 'RB', 'VBD', 'RP', ',', 'VBZ', 'MD', 'CD', 'CC', 'VBG', 'NNP', 'HYPH', 'WDT', 'WRB', 'PRP$', 'POS', 'EX', 'RBR', 'PDT', 'VBN', 'WP', 'JJR', 'UH', 'RBS', 'SYM', 'FW', 'LS']
for i in parts_of_speech:
    df[i] = df['POS_Tagged_Poem'].apply(lambda x: x.count(i))
df.head()

Unnamed: 0,Genre,Poem,POS_Tagged_Poem,_SP,IN,DT,JJ,PRP,VBP,JJS,...,RBR,PDT,VBN,WP,JJR,UH,RBS,SYM,FW,LS
1,Music,In the thick brushthey spend the...,"[_SP, IN, DT, JJ, PRP, VBP, DT, JJS, NN, IN, D...",4,4,6,2,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,Music,Storms are generous. ...,"[_SP, NNS, VBP, JJ, ., _SP, NN, RB, JJ, TO, VB...",4,4,2,3,2,2,0,...,0,0,0,0,0,0,0,0,0,0
3,Music,—After Ana Mendieta Did you carry around the ...,"[_SP, :, IN, NNP, NNP, VBD, PRP, VB, RP, DT, N...",1,5,4,1,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Music,for Aja Sherrard at 20The portent may itself ...,"[_SP, IN, NNP, NNP, IN, NNP, NN, MD, PRP, VB, ...",1,7,2,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Music,"for Bob Marley, Bavaria, November 1980 Here i...","[_SP, IN, NNP, NNP, ,, NNP, ,, NNP, CD, RB, VB...",1,5,6,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [126]:
parts_set = set(parts_of_speech)

def calculate_pos_ratios(tags_list):
    total_count = len(tags_list)
    pos_counts = {pos: tags_list.count(pos) / total_count for pos in parts_set}
    return pos_counts

for pos in parts_of_speech:
    df[pos] = df['POS_Tagged_Poem'].apply(lambda x: calculate_pos_ratios(x).get(pos, 0))
    

# Нормализация 
scaler = MinMaxScaler()
df[parts_of_speech] = scaler.fit_transform(df[parts_of_speech])
df.head()

Unnamed: 0,Genre,Poem,POS_Tagged_Poem,_SP,IN,DT,JJ,PRP,VBP,JJS,...,RBR,PDT,VBN,WP,JJR,UH,RBS,SYM,FW,LS
1,Music,In the thick brushthey spend the...,"[_SP, IN, DT, JJ, PRP, VBP, DT, JJS, NN, IN, D...",0.847222,0.5,0.6875,0.21875,0.125,0.177083,0.28125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Music,Storms are generous. ...,"[_SP, NNS, VBP, JJ, ., _SP, NN, RB, JJ, TO, VB...",0.797386,0.470588,0.215686,0.308824,0.235294,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Music,—After Ana Mendieta Did you carry around the ...,"[_SP, :, IN, NNP, NNP, VBD, PRP, VB, RP, DT, N...",0.125514,0.37037,0.271605,0.064815,0.296296,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Music,for Aja Sherrard at 20The portent may itself ...,"[_SP, IN, NNP, NNP, IN, NNP, NN, MD, PRP, VB, ...",0.157623,0.651163,0.170543,0.081395,0.186047,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Music,"for Bob Marley, Bavaria, November 1980 Here i...","[_SP, IN, NNP, NNP, ,, NNP, ,, NNP, CD, RB, VB...",0.15404,0.454545,0.5,0.079545,0.090909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [127]:
for speech_part in parts_of_speech:
    df[f'{speech_part}_Mean_Position'] = df['POS_Tagged_Poem'].apply(lambda x: np.mean([i for i, val in enumerate(x) if val == speech_part])).fillna(0)

df = df.drop(['POS_Tagged_Poem', 'Poem'], axis=1)
df['Genre'] = df['Genre'].replace({'Music': 1, 'Death': 0, 'Affection': 0, 'Environment': 0})
df.head()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=

Unnamed: 0,Genre,_SP,IN,DT,JJ,PRP,VBP,JJS,NN,TO,...,RBR_Mean_Position,PDT_Mean_Position,VBN_Mean_Position,WP_Mean_Position,JJR_Mean_Position,UH_Mean_Position,RBS_Mean_Position,SYM_Mean_Position,FW_Mean_Position,LS_Mean_Position
1,1,0.847222,0.5,0.6875,0.21875,0.125,0.177083,0.28125,0.612723,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.797386,0.470588,0.215686,0.308824,0.235294,0.333333,0.0,0.192227,0.235294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.125514,0.37037,0.271605,0.064815,0.296296,0.0,0.0,0.443783,0.148148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.157623,0.651163,0.170543,0.081395,0.186047,0.0,0.0,0.303987,0.186047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,0.15404,0.454545,0.5,0.079545,0.090909,0.0,0.0,0.396104,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X = df.drop('Genre', axis=1)
y = df['Genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = SVC(kernel='linear')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.7321428571428571
