In [39]:
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import * 

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [40]:
# Импортируем фрейм данных
data = pd.read_csv("dataset.csv")

# Оставляем только нужных нам исполнитеолей
df1 = data[data['cantorNome'] == 'david-bowie']
df2 = data[data['cantorNome'] == 'paul-mccartney']
data = pd.concat([df1, df2])

# Оставляем только нужные нам колонки
columns = data[['cantorNome', 'letra']]
print(columns)

         cantorNome                                              letra
0       david-bowie  I, I will be king. And you, you will be queen....
1       david-bowie  Didn't know what time it was,. The lights were...
2       david-bowie  Ground control to Major Tom. Ground control to...
3       david-bowie  It's a god-awful small affair. To the girl wit...
4       david-bowie  I know when to go out. And when to stay in. Ge...
..              ...                                                ...
942  paul-mccartney  He's just a young boy looking for a way to fin...
943  paul-mccartney  How can I hope to reach your love. Help me to ...
944  paul-mccartney  I like it. Please don't take my heart away. It...
945  paul-mccartney  Yvonne is the one I´ve been counting on. She s...
946  paul-mccartney                                       Instrumental

[947 rows x 2 columns]


In [41]:
# Приводим все к нижнему регистру
lowered = columns['letra'].str.lower()
columns['lowered'] = lowered

# Токенезируем
tokenizer = RegexpTokenizer(r'\w+')
tokened = columns.apply(lambda row: tokenizer.tokenize(row['lowered']), axis=1)
columns['tokened'] = tokened

# Удаляем "стоп слова"
noise = stopwords.words('english')
withoutstop = columns['tokened'].apply(lambda x: [item for item in x if item not in noise])
without_stop = []

for a in withoutstop:    
    without_stop.append(", ".join(a))
columns['without_stop'] = without_stop

# Делаем леммитизацию
lemmatizer = WordNetLemmatizer()
lemmatized = columns['without_stop'].apply(lambda x: [lemmatizer.lemmatize(x)])
lemma = []

for a in lemmatized:    
    lemma.append(", ".join(a))
columns['lemmatized'] = lemma

# Разделяем фрейм на тренировочные и тестовые данные 80/20
x_train, x_test, y_train, y_test = train_test_split(columns.lemmatized, columns.cantorNome, train_size = 0.8)

# Векторизируем тренировочную выборку
vectorizer = CountVectorizer(ngram_range=(1, 3))
vectorized_x_train = vectorizer.fit_transform(x_train)

# Импортируем байесовский классификатор
clf = MultinomialNB()
clf.fit(vectorized_x_train, y_train)

# Векторизируем тестовую выборку
vectorized_x_test = vectorizer.transform(x_test)

# Покажем насколько хорошо модель предсказала тесты
preg = clf.predict(vectorized_x_test)
print(classification_report(y_test, preg))


                precision    recall  f1-score   support

   david-bowie       0.80      0.66      0.72       102
paul-mccartney       0.67      0.81      0.73        88

      accuracy                           0.73       190
     macro avg       0.73      0.73      0.73       190
  weighted avg       0.74      0.73      0.73       190



In [42]:
# Импортируем две песни, авторов которых мы хотим узнать
data = pd.read_csv("pesni.csv")

# Вся та же самая обработка фрейма
df1 = data[data['cantorNome'] == 'david-bowie'] 
df2 = data[data['cantorNome'] == 'paul-mccartney']
data = pd.concat([df1, df2])

columns = data[['cantorNome', 'letra']]

lowered = columns['letra'].str.lower()
columns['lowered'] = lowered

tokenizer = RegexpTokenizer(r'\w+')
tokened = columns.apply(lambda row: tokenizer.tokenize(row['lowered']), axis=1)
columns['tokened'] = tokened

withoutstop = columns['tokened'].apply(lambda x: [item for item in x if item not in noise])
without_stop = []

for a in withoutstop:    
    without_stop.append(", ".join(a))
columns['without_stop'] = without_stop

lemmatizer = WordNetLemmatizer()
lemmatized = columns['without_stop'].apply(lambda x: [lemmatizer.lemmatize(x)])
lemma = []

for a in lemmatized:    
    lemma.append(", ".join(a))

columns['lemmatized'] = lemma

# X тесты - наши обработанные тексты двух песен, а Y тесты - исполнители, 
# которых мы хотим предсказать, а потом сравнить предсказание с реальностью
x1_test = columns['lemmatized']
y1_test = columns['cantorNome']

# Векторизируем наши тестовые песни
vectorized_x1_test = vectorizer.transform(x1_test)

# Предсказываем авторов
pred = clf.predict(vectorized_x1_test)
print(*pred)   

# Покажем что все правильно
print(classification_report(y1_test, pred))



david-bowie paul-mccartney
                precision    recall  f1-score   support

   david-bowie       1.00      1.00      1.00         1
paul-mccartney       1.00      1.00      1.00         1

      accuracy                           1.00         2
     macro avg       1.00      1.00      1.00         2
  weighted avg       1.00      1.00      1.00         2

