# Preparation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
# NOTE(review): 'songs_and_genres.csv' is only written at the end of the "Preparation"
# section of this notebook, so on a fresh run this read succeeds only if the file
# already exists on disk from an earlier session.
songs_and_genres = pd.read_csv('songs_and_genres.csv')

Note: I had to rename the .csv file containing the pop songs from "pop_classifier.csv" to "Pop_classifier.csv" and change the genre name from lower case to upper case. A pandas DataFrame has a `.pop` method, so when I tried to refer to the column containing all pop songs via attribute access, pandas used the method instead of the column, which led me to rename the genre.

In [61]:
# Set the genre label of every row to the capitalised 'Pop' and write the file back in place.
pop = pd.read_csv('Pop_classifier.csv')
pop['genre'] = 'Pop'
pop.to_csv('Pop_classifier.csv', index=False, encoding='utf-8')

First, a file containing all songs and their accompanying genre must be created.

In [2]:
# Genre names as they appear in the per-genre file names ('<genre>_classifier.csv').
genres = [
    'alternative rock',
    'country',
    'folk',
    'hard rock',
    'heavy metal',
    'hip hop',
    'jazz',
    'Pop',
    'progressive rock',
    'rock',
    'soul',
]

In [3]:
# Read every '<genre>_classifier.csv' into its own DataFrame, in the order given by `genres`.
frames = [pd.read_csv(genre + '_classifier.csv') for genre in genres]
    

In [4]:
# Stack the per-genre frames row-wise into a single DataFrame.
songs_and_genres = pd.concat(frames)

In [5]:
# The concatenated dataframe kept each source frame's own row labels, so some index
# values occurred multiple times; replace them with a fresh index and discard the old one.
songs_and_genres = songs_and_genres.reset_index(drop=True)

In [6]:
# Replace whitespace in the genre names with underscores so the genre names
# can later be used as column names that are easy to refer to.
songs_and_genres['genre'] = songs_and_genres['genre'].replace(' ', '_', regex=True)

In [7]:
songs_and_genres

Unnamed: 0,song,genre
0,Picked girl And sailed around world Why What e...,alternative_rock
1,aint got Nothing scared aint got Nothing scare...,alternative_rock
2,far Ive really stayed touch Well knew much sur...,alternative_rock
3,Sitting SDT Waiting underground train rumble u...,alternative_rock
4,Grandma got new dentures eat crust pizza Been ...,alternative_rock
5,This ballad good times put battery leg Put roc...,alternative_rock
6,Battle Battle Battle Battle Battle Battle Some...,alternative_rock
7,The point looked Has always part brain Now min...,alternative_rock
8,Beetlebum What youve done Shes gun Now youve d...,alternative_rock
9,Bow bells say goodbye last train Over river Ou...,alternative_rock


In [8]:
# Restructure the dataframe so that every genre becomes its own binary column:
# 1 if the song belongs to that genre, 0 if it does not.
genre_list = ['alternative_rock', 'country', 'folk', 'hard_rock', 'heavy_metal', 'hip_hop', 'jazz', 'Pop', 'progressive_rock', 'rock', 'soul']
for genre in genre_list:
    # Compare the whole 'genre' column at once instead of walking the frame row by row
    # with iterrows(); referring to the column by name also removes the dependence on
    # 'genre' sitting at position 1 of each row.
    songs_and_genres[genre] = (songs_and_genres['genre'] == genre).astype('int64')

In [9]:
# The textual genre column is removed from the frame here.
songs_and_genres = songs_and_genres.drop(columns='genre')

In [11]:
songs_and_genres

Unnamed: 0,song,alternative_rock,country,folk,hard_rock,heavy_metal,hip_hop,jazz,Pop,progressive_rock,rock,soul
0,Picked girl And sailed around world Why What e...,1,0,0,0,0,0,0,0,0,0,0
1,aint got Nothing scared aint got Nothing scare...,1,0,0,0,0,0,0,0,0,0,0
2,far Ive really stayed touch Well knew much sur...,1,0,0,0,0,0,0,0,0,0,0
3,Sitting SDT Waiting underground train rumble u...,1,0,0,0,0,0,0,0,0,0,0
4,Grandma got new dentures eat crust pizza Been ...,1,0,0,0,0,0,0,0,0,0,0
5,This ballad good times put battery leg Put roc...,1,0,0,0,0,0,0,0,0,0,0
6,Battle Battle Battle Battle Battle Battle Some...,1,0,0,0,0,0,0,0,0,0,0
7,The point looked Has always part brain Now min...,1,0,0,0,0,0,0,0,0,0,0
8,Beetlebum What youve done Shes gun Now youve d...,1,0,0,0,0,0,0,0,0,0,0
9,Bow bells say goodbye last train Over river Ou...,1,0,0,0,0,0,0,0,0,0,0


In [12]:
# Persist the one-hot encoded table (without the row index) for the classification sections.
songs_and_genres.to_csv('songs_and_genres.csv', index=False, encoding='utf-8')

# Classifier Example

## Splitting into Training and Test Sets

In [61]:
# Load the song table from 'songs_and_genres.csv' (the file saved by the preparation section).
songs_and_genres = pd.read_csv('songs_and_genres.csv')

In [63]:
# Split the dataframe 90/10 into training and testing sets, stratified on the
# 'country' column so both parts keep the same share of country songs.
train_country, test_country = train_test_split(
    songs_and_genres,
    test_size=0.1,
    random_state=0,
    stratify=songs_and_genres['country'],
)
for label, subset in (("train_country", train_country), ("test_country", test_country)):
    print("\n" + label + ":\n")
    print(subset.head())
    print(subset.shape)



train_country:

                                                    song  alternative_rock  \
64     You put together break bones wish never seen h...                 1   
15491  For part never thought could set words even di...                 0   
3042   Ive fame fortune Women come knock door Ive liv...                 0   
19681  Well lights turnin gray nearly every day Come ...                 0   
18109  When youre afraid feels like youre tired fight...                 0   

       country  folk  hard_rock  heavy_metal  hip_hop  jazz  Pop  \
64           0     0          0            0        0     0    0   
15491        0     0          0            0        0     0    0   
3042         1     0          0            0        0     0    0   
19681        0     0          0            0        0     0    0   
18109        0     0          0            0        0     0    0   

       progressive_rock  rock  soul  
64                    0     0     0  
15491                 1     0

In [64]:
# check the number of occurrences of each value (0 / 1) for the test genre
train_country['country'].value_counts()

0    16899
1     2555
Name: country, dtype: int64

## Bootstrapping
Since the classes were strongly imbalanced, with many more 0s than 1s occurring for each genre, the data had to be rebalanced. Bootstrapping here means sampling with replacement: the same number of 0s and 1s is drawn for the final classifier, which downsamples the majority class (0) and upsamples the minority class (1). This means that not all songs with the value 0 are taken into account and some songs with the value 1 may appear more than once. This makes the two classes more clearly distinguishable in the long run without misleading the classifier too much.

In [70]:
# Balanced bootstrap: draw the same number of rows per class, with replacement.
# The draws are seeded (like the train/test split, which uses random_state=0) so that
# re-running the notebook yields the same samples and therefore the same results.
tr_1 = train_country[train_country['country'] == 0].sample(2000, replace=True, random_state=0)
tr_2 = train_country[train_country['country'] == 1].sample(2000, replace=True, random_state=0)
ts_1 = test_country[test_country['country'] == 0].sample(200, replace=True, random_state=0)
ts_2 = test_country[test_country['country'] == 1].sample(200, replace=True, random_state=0)
# Negatives first, then positives, for both the training and the testing sample.
training_bs = pd.concat([tr_1, tr_2])
testing_bs = pd.concat([ts_1, ts_2])

In [71]:
training_bs

Unnamed: 0,song,alternative_rock,country,folk,hard_rock,heavy_metal,hip_hop,jazz,Pop,progressive_rock,rock,soul
19072,Now darling know That really really love Altho...,0,0,0,0,0,0,0,0,0,1,0
18371,Well got daddys car And cruised hamburger stan...,0,0,0,0,0,0,0,0,0,1,0
21215,They say heaven ten zillion light years away A...,0,0,0,0,0,0,0,0,0,0,1
418,Waste time running circles Waste time going ba...,1,0,0,0,0,0,0,0,0,0,0
16721,When think Lorelei head turns around gentle bu...,0,0,0,0,0,0,0,0,1,0,0
8861,Back tomb Lucy sitting floor standing door The...,0,0,0,0,1,0,0,0,0,0,0
7699,When hearts young put hands flames Felt heat e...,0,0,0,1,0,0,0,0,0,0,0
14635,Photograph king watches Fashions may change he...,0,0,0,0,0,0,0,1,0,0,0
6018,The name gave Caroline Daughter miner Her ways...,0,0,1,0,0,0,0,0,0,0,0
14696,Silver leaves spinning round Take hand Looking...,0,0,0,0,0,0,0,1,0,0,0


## Tfidfvectorizer

In [83]:
# TfidfVectorizer for the training set: fit it on the training lyrics and transform them
tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True)
tfidf_vectorizer_vectors_train = tfidf_vectorizer.fit_transform(training_bs['song'])


In [84]:
# The test lyrics are only transformed with the vectorizer fitted on the training set.
tfidf_vectorizer_vectors_test = tfidf_vectorizer.transform(testing_bs['song'])

In [85]:
## Example vector
# Pull one document vector out of the training matrix.
# Note: index 1 selects the *second* row of the matrix, not the first.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors_train[1]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back for older versions.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Place the tf-idf values in a dataframe (one row per vocabulary term), highest weight first.
# .toarray() yields a plain ndarray instead of the np.matrix returned by .todense().
df = pd.DataFrame(first_vector_tfidfvectorizer.T.toarray(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
fun,0.687936
tbird,0.392569
daddy,0.218752
well,0.143253
took,0.127579
away,0.119735
and,0.114374
shell,0.109589
takes,0.105116
indy,0.102998


In [86]:
# Train a logistic-regression classifier on the tf-idf training vectors,
# using the binary 'country' column as the target.
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(X=tfidf_vectorizer_vectors_train, y=training_bs['country'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [87]:
# Predict the test sample and compare the predictions with the true 'country' labels.
pred = classifier.predict(tfidf_vectorizer_vectors_test)
y_true = testing_bs['country']
print('\nConfusion matrix country\n', confusion_matrix(y_true, pred))
print(classification_report(y_true, pred))


Confusion matrix country
 [[146  54]
 [ 66 134]]
              precision    recall  f1-score   support

           0       0.69      0.73      0.71       200
           1       0.71      0.67      0.69       200

    accuracy                           0.70       400
   macro avg       0.70      0.70      0.70       400
weighted avg       0.70      0.70      0.70       400



# General Classification

In [126]:
def _balanced_bootstrap(frame, label, n_per_class, random_state=None):
    '''Draw n_per_class rows with replacement from each class of `label` (0 first, then 1).'''
    drawn = [
        frame[frame[label] == value].sample(n_per_class, replace=True, random_state=random_state)
        for value in (0, 1)
    ]
    return pd.concat(drawn)


def tfidf_classification(genre, n_train=2000, n_test=200, random_state=0, data=None):
    '''Train and evaluate a binary "is this song `genre`?" classifier on tf-idf features.

    The table is split 90/10 into training and test sets (stratified on the genre column).
    From each set a balanced sample is drawn with replacement (equally many 0 and 1 rows),
    a TfidfVectorizer is fitted on the sampled training lyrics, and a logistic-regression
    classifier is trained on the resulting vectors and evaluated on the sampled test set.

    Parameters
    ----------
    genre : str
        Name of the binary (0/1) genre column to predict.
    n_train : int, default 2000
        Rows drawn per class for the training sample.
    n_test : int, default 200
        Rows drawn per class for the test sample.
    random_state : int or None, default 0
        Seed for the train/test split and for the bootstrap draws, so repeated calls
        return the same result. Pass None for unseeded (non-repeatable) sampling.
    data : pandas.DataFrame or None, default None
        Table with a 'song' text column and the `genre` column. When None, the
        notebook-level `songs_and_genres` frame is used.

    Returns
    -------
    tuple
        (confusion, classification): the output of `confusion_matrix` and of
        `classification_report` for the sampled test set.
    '''
    if data is None:
        data = songs_and_genres
    ######### Split data into training and test set #########
    train, test = train_test_split(data, test_size=0.1, random_state=random_state, stratify=data[genre])
    ######### Bootstrapping #########
    training_bs = _balanced_bootstrap(train, genre, n_train, random_state)
    testing_bs = _balanced_bootstrap(test, genre, n_test, random_state)
    ######### tfidfvectorizer #########
    # Fit on the training lyrics only; the test lyrics are transformed with the same vocabulary.
    tfidf_vectorizer = TfidfVectorizer(smooth_idf=True, use_idf=True)
    tfidf_vectorizer_vectors_train = tfidf_vectorizer.fit_transform(training_bs['song'])
    tfidf_vectorizer_vectors_test = tfidf_vectorizer.transform(testing_bs['song'])
    ######### Classification & Evaluation #########
    classifier = LogisticRegression(solver='lbfgs')
    classifier.fit(tfidf_vectorizer_vectors_train, training_bs[genre])

    pred = classifier.predict(tfidf_vectorizer_vectors_test)
    confusion = confusion_matrix(testing_bs[genre], pred)
    classification = classification_report(testing_bs[genre], pred)

    return confusion, classification

In [127]:
# Names of the binary genre columns, one classifier is evaluated per entry.
genre_list = [
    'alternative_rock',
    'country',
    'folk',
    'hard_rock',
    'heavy_metal',
    'hip_hop',
    'jazz',
    'Pop',
    'progressive_rock',
    'rock',
    'soul',
]

In [140]:
# For every genre, run the classification and print its confusion matrix and
# classification report, followed by a separator line.
separator = '-----------------------------------------------------------------------------'
for genre in genre_list:
    confusion, classification = tfidf_classification(genre)
    print(genre.capitalize(), ':\n\n', 'Confusion matrix:\n', confusion, '\n\nClassification report:\n', classification, '\n', separator)

Alternative_rock :

 Confusion matrix:
 [[156  44]
 [ 71 129]] 

Classification report:
               precision    recall  f1-score   support

           0       0.69      0.78      0.73       200
           1       0.75      0.65      0.69       200

    accuracy                           0.71       400
   macro avg       0.72      0.71      0.71       400
weighted avg       0.72      0.71      0.71       400
 
 -----------------------------------------------------------------------------
Country :

 Confusion matrix:
 [[149  51]
 [ 60 140]] 

Classification report:
               precision    recall  f1-score   support

           0       0.71      0.74      0.73       200
           1       0.73      0.70      0.72       200

    accuracy                           0.72       400
   macro avg       0.72      0.72      0.72       400
weighted avg       0.72      0.72      0.72       400
 
 -----------------------------------------------------------------------------
Folk :

 Confusio