### Text classification: predict which of two artists is more likely to use a given word/sentence
##### 9th of May 2022

In [None]:
import requests
import re
from bs4 import BeautifulSoup
from thefuzz import fuzz, process
import pandas as pd
import string
from string import digits
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.linear_model import LogisticRegression
import operator
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Get data: download the HTML of the artist overview page (Gorillaz).
# The timeout keeps the notebook from hanging on a stalled connection and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed for song links further down.
response = requests.get("https://www.lyrics.com/artist/Gorillaz/476055", timeout=30)
response.raise_for_status()
html = response.text

In [None]:
# Regex that captures the relative link of each song page: the content of
# every href attribute that starts with "/lyric" (non-greedy up to the closing quote)
pattern = r'href="(\/lyric.+?)">'

In [None]:
# Create the full URL of every song page and sort out duplicate titles
link = re.findall(pattern, string=html)
url_prefix = "https://www.lyrics.com"
# Leading part of a song link ("/lyric/<id>/Gorillaz/"); what follows is the title
artist_prefix = re.compile(r"\/lyric-?l?f?\/\d+\/Gorillaz\/")
full_url = []           # Will contain full working urls to each lyric
pure_title = []         # Will contain only the song title of each song
dubli_check = set()     # Titles already seen; a set makes the lookup O(1)
for title in link:
    prefix = artist_prefix.search(title)
    if prefix is None:
        # The href matched the generic "/lyric" pattern but is not one of this
        # artist's song links; skip it instead of failing with an IndexError.
        continue
    b = title.replace(prefix.group(0), "")
    if b in dubli_check:
        continue
    dubli_check.add(b)
    full_url.append(url_prefix + title)
    pure_title.append(b)

In [None]:
# Put every URL next to its title in one table, ordered alphabetically by
# title and re-indexed from 0 so row positions match the sorted title list
df = pd.DataFrame({"url": full_url, "title": pure_title})
df = df.sort_values(by=["title"]).reset_index(drop=True)

In [None]:
# Reduce the number of duplicates further by fuzzy-comparing each title with
# the last title that was kept.
# NOTE: the positions in the sorted pure_title list are used as df index
# labels, so this relies on df being sorted by title with a fresh 0..n-1 index.
pure_title.sort()
pure_title_red = []     # Titles that survive the fuzzy de-duplication
duplicate_rows = []     # df index labels of the titles judged to be duplicates
c = None                # Last title that was kept
for count, title in enumerate(pure_title):
    if c is not None and fuzz.token_set_ratio(c, title) == 100:
        duplicate_rows.append(count)
        continue
    c = title
    pure_title_red.append(title)
# Drop all duplicates in one call instead of copying the frame once per duplicate
df = df.drop(duplicate_rows)

In [None]:
# Renumber the rows so the index is continuous again after dropping duplicates
df = df.reset_index(drop=True)

In [None]:
### Save HTML files of every song (run only one time!)
# for count, title in enumerate(df["title"]):
# 	html_songs = requests.get(df["url"][count]).text
# 	with open (f"Gorillaz/{title}.txt", "w") as f:
# 	 	f.write(html_songs)

In [None]:
### Create corpus filled with lyrics
# Reads the previously saved HTML page of every song and extracts the text of
# the element with class "lyric-body".
gorillaz = []       # Will contain only lyrics from Gorillaz
corpus = []         # Will contain all lyrics from both artists
for title in df["title"]:
    with open(f"Gorillaz/{title}.txt", "r") as f:
        x = f.read()
    lyric_body = BeautifulSoup(x, "html.parser").find(class_="lyric-body")
    if lyric_body is None:
        # Page without a lyrics section: keep a placeholder so that the lists
        # stay aligned with df. Only this case is handled; any other error
        # (e.g. a missing file) is no longer silently swallowed.
        # NOTE(review): the placeholder ends up in the corpus as a document.
        print("No lyrics found for:", title)
        lyrics = "Nolyrics"
    else:
        lyrics = lyric_body.text
    gorillaz.append(lyrics)
    corpus.append(lyrics)

In [None]:
### Combine title, url and lyrics of every Gorillaz song in one DataFrame
dic_gor = dict(title=pure_title_red, url=df["url"], lyrics=gorillaz)
df_gor = pd.DataFrame(dic_gor)

In [None]:
#### Performing all previous steps for the second artist (EoDM)
# Get data: download the HTML of the artist overview page.
# The timeout keeps the notebook from hanging on a stalled connection and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed for song links further down.
response = requests.get("https://www.lyrics.com/artist/Eagles-of-Death-Metal/643679", timeout=30)
response.raise_for_status()
html = response.text

In [None]:
# Regex that captures the relative link of each song page: the content of
# every href attribute that starts with "/lyric" (non-greedy up to the closing quote)
pattern = r'href="(\/lyric.+?)">'

In [None]:
# Create the full URL of every song page and sort out duplicate titles
link = re.findall(pattern, string=html)
url_prefix = "https://www.lyrics.com"
# Leading part of a song link ("/lyric/<id>/Eagles+of+Death+Metal/"); what follows is the title
artist_prefix = re.compile(r"\/lyric-?l?f?\/\d+\/Eagles\+of\+Death\+Metal\/")
full_url = []           # Will contain full working urls to each lyric
pure_title = []         # Will contain only the song title of each song
dubli_check = set()     # Titles already seen; a set makes the lookup O(1)
for title in link:
    prefix = artist_prefix.search(title)
    if prefix is None:
        # The href matched the generic "/lyric" pattern but is not one of this
        # artist's song links; skip it instead of failing with an IndexError.
        continue
    b = title.replace(prefix.group(0), "")
    if b in dubli_check:
        continue
    dubli_check.add(b)
    full_url.append(url_prefix + title)
    pure_title.append(b)

In [None]:
# Put every URL next to its title in one table, ordered alphabetically by
# title and re-indexed from 0 so row positions match the sorted title list
df = pd.DataFrame({"url": full_url, "title": pure_title})
df = df.sort_values(by=["title"]).reset_index(drop=True)

In [None]:
# Reduce the number of duplicates further by fuzzy-comparing each title with
# the last title that was kept.
# NOTE: the positions in the sorted pure_title list are used as df index
# labels, so this relies on df being sorted by title with a fresh 0..n-1 index.
pure_title.sort()
pure_title_red = []     # Titles that survive the fuzzy de-duplication
duplicate_rows = []     # df index labels of the titles judged to be duplicates
c = None                # Last title that was kept
for count, title in enumerate(pure_title):
    if c is not None and fuzz.token_set_ratio(c, title) == 100:
        duplicate_rows.append(count)
        continue
    c = title
    pure_title_red.append(title)
# Drop all duplicates in one call instead of copying the frame once per duplicate
df = df.drop(duplicate_rows)

In [None]:
# Renumber the rows so the index is continuous again after dropping duplicates
df = df.reset_index(drop=True)

In [None]:
### Save HTML files of every song (run only one time!)
# for count, title in enumerate(df["title"]):
# 	html_songs = requests.get(df["url"][count]).text
# 	with open (f"Eagles_of_Death_Metal/{title}.txt", "w") as f:
# 	 	f.write(html_songs)

In [None]:
### Create corpus filled with lyrics
# Reads the previously saved HTML page of every song and extracts the text of
# the element with class "lyric-body". The lyrics are appended to the existing
# corpus list, after the entries that are already in it.
eodm = []           # Will contain only lyrics from EoDM
for title in df["title"]:
    with open(f"Eagles_of_Death_Metal/{title}.txt", "r") as f:
        x = f.read()
    lyric_body = BeautifulSoup(x, "html.parser").find(class_="lyric-body")
    if lyric_body is None:
        # Page without a lyrics section: keep a placeholder so that the lists
        # stay aligned with df. Only this case is handled; any other error
        # (e.g. a missing file) is no longer silently swallowed.
        # NOTE(review): the placeholder ends up in the corpus as a document.
        print("No lyrics found for:", title)
        lyrics = "Nolyrics"
    else:
        lyrics = lyric_body.text
    eodm.append(lyrics)
    corpus.append(lyrics)

In [None]:
### Combine title, url and lyrics of every EoDM song in one DataFrame
dic_eodm = dict(title=pure_title_red, url=df["url"], lyrics=eodm)
df_eodm = pd.DataFrame(dic_eodm)

In [None]:
# labels will be used for creating the vectorized DataFrame.
# One label per song, in corpus order (Gorillaz lyrics first, then EoDM).
# The counts are taken from the scraped lists so they stay correct when the
# number of songs on the website changes.
labels = ["Gorillaz"] * len(gorillaz) + ["Eagles_of_Death_Metal"] * len(eodm)

In [None]:
# Learn the vocabulary of the lyrics corpus (English stop words removed) and
# turn every song into a vector of word counts
vectorizer = CountVectorizer(stop_words="english").fit(corpus)
vecto_trans = vectorizer.transform(corpus)

In [None]:
# Create DataFrame with the count of every word in each song; the first
# column "band" holds the artist label of the song.
final_df = pd.DataFrame(vecto_trans.toarray(), columns=vectorizer.get_feature_names_out())
# Insert the label column directly instead of going through
# reset_index() + rename("index"): if the vocabulary itself contains the word
# "index", that detour renames the word column and leaves the labels under
# "level_0". allow_duplicates covers a vocabulary that contains "band".
final_df.insert(0, "band", labels, allow_duplicates=True)

In [None]:
# Split the table by position: first column is the target (y, artist label),
# all remaining columns are the features (X, word counts)
y = final_df.iloc[:, 0]
X = final_df.iloc[:, 1:]

In [None]:
# OneHotEncode target variable (0 = Gorillaz, 1 = EoDM)
# The encoder orders its categories alphabetically, so the first output column
# flags "Eagles_of_Death_Metal"; keeping only that column yields the 0/1 target.
y = np.asarray(y).reshape(-1, 1)
# The encoder is left at its default (sparse) output and densified explicitly:
# the keyword for dense output was renamed from `sparse` to `sparse_output`
# between scikit-learn releases, so this form works with either.
ohc = OneHotEncoder(handle_unknown="ignore")
ohc_t = ohc.fit_transform(y).toarray()
y = pd.DataFrame(ohc_t, columns=ohc.get_feature_names_out())
y = y.iloc[:, 0]

In [None]:
# Hold out part of the songs for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Use a RandomForestClassifier as a ML model
# The word counts are passed as numbers; converting them to strings first only
# forces scikit-learn to parse them back into floats.
# Precision and recall refer to the positive class 1 (EoDM).
forest = RandomForestClassifier(random_state=1)
forest.fit(X_train, y_train)
pred = forest.predict(X_test)
print("Accuracy score for Random Forest:", accuracy_score(y_test, pred))
print("Precision score for Random Forest:", precision_score(y_test, pred))
print("Recall score for Random Forest:", recall_score(y_test, pred))

In [None]:
# Use LogisticRegression as a ML model
# The word counts are passed as numbers; converting them to strings first only
# forces scikit-learn to parse them back into floats.
# Precision and recall refer to the positive class 1 (EoDM).
reg = LogisticRegression(random_state=1)
reg.fit(X_train, y_train)
pred = reg.predict(X_test)
print("Accuracy score for LogReg:", accuracy_score(y_test, pred))
print("Precision score for LogReg:", precision_score(y_test, pred))
print("Recall score for LogReg:", recall_score(y_test, pred))

In [None]:
# Use Naive Bayes as a ML model
# The word counts are passed as numbers; converting them to strings first only
# forces scikit-learn to parse them back into floats.
# Precision and recall refer to the positive class 1 (EoDM).
# (The former naive.score(...) call was dropped: its result was never used.)
naive = MultinomialNB()
naive.fit(X_train, y_train)
pred = naive.predict(X_test)
print("Accuracy score for Naive Bayes:", accuracy_score(y_test, pred))
print("Precision score for Naive Bayes:", precision_score(y_test, pred))
print("Recall score for Naive Bayes:", recall_score(y_test, pred))

In [None]:
# Return most predictive words based on LogisticRegression
# Words are ordered by their coefficient, ascending: the most negative
# coefficients point to class 0 (Gorillaz), the most positive to class 1 (EoDM).
# The ordering is computed once and reused for both ends of the ranking.
feature_names = vectorizer.get_feature_names_out()
coef_order = np.argsort(reg.coef_[0])
print("Most predictive words for EoDM are:", tuple(feature_names[coef_order[-20:]]))
print("Most predictive words for Gorillaz are:", tuple(feature_names[coef_order[:20]]))

In [None]:
# Lets the user enter a word or sentence and predicts from which artist it probably is
song_input = [input()]
# NOTE: this overwrites vecto_trans (the corpus matrix) with the vector of the
# user input, as the original cell did.
vecto_trans = vectorizer.transform(song_input)
vector_input_df = pd.DataFrame(vecto_trans.toarray(), columns=vectorizer.get_feature_names_out())
pred_input = naive.predict(vector_input_df)
proba = naive.predict_proba(vector_input_df)    # computed once; columns are class 0 / class 1
# There is exactly one input row, so compare its single prediction explicitly
if pred_input[0] == 0:
    print(f"The lyrics {song_input} are probably from the Gorillaz!")
    print("Probability:", proba[:, 0])
else:
    print(f"The lyrics {song_input} are probably from the Eagles of Death Metal!")
    print("Probability:", proba[:, 1])