In [2]:
# Data Manipulation
#-----------------------------
import pandas as pd


# Sci-kit learn packages
#-----------------------------
# Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# Modelling
from sklearn.linear_model import LogisticRegression

# Metrics
import sklearn.metrics
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, precision_score


# Other packages
#-----------------------------
from time import time #Time algorithms
import pickle # Save or upload models
import multiprocessing
print(f'Your number of available CPU workers is: {multiprocessing.cpu_count()}')

Your number of available CPU workers is: 4


As in the [previous notebook](./2%20-%20Logistic%20Regression%201%20-%20Model%20selection.ipynb) , we ignore all warnings for better readability (only after the code has been tested).

In [3]:
import warnings
warnings.filterwarnings("ignore")


# Import the dataframe

In [4]:
df_lyrics = pd.read_csv('./Dataset/clean_lyrics.csv', keep_default_na = False)

In [4]:
df_lyrics.head()

Unnamed: 0,SName,Lyric,Artist,Genre,multiple_letter,Trails,lyrics_clean,Rock,Pop,Hip_hop
0,World So Cold,"It starts with pain, followed by hate. Fueled ...",12 Stones,0,0.0,0,start pain follow hate fuel endless question o...,1,0,1
1,Broken,Freedom!. Alone again again alone. Patiently w...,12 Stones,0,0.0,0,freedom alon alon patient wait phone hope call...,1,1,0
2,3 Leaf Loser,"Biting the hand that feeds you, lying to the v...",12 Stones,0,0.0,0,bite hand feed lie voic insid reach beg someth...,1,1,0
3,Anthem For The Underdog,You say you know just who I am. But you can't ...,12 Stones,0,0.0,2,say know imagin wait across line thought still...,1,0,0
4,Adrenaline,My heart is beating faster can't control these...,12 Stones,0,0.007042,0,heart beat faster control feel anymor wait lon...,0,0,0


# Prepping the data for modelling

## Word Vectorizer Instantiation

In [5]:
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 1000)

## Function Declaration

In [6]:
def apr(y_pred, y_real):
    
    '''This function takes in the real and predicted values of the target.
    It returns the following performance metrics: accuracy, precision, recall and F1-score'''
    
    accuracy = metrics.accuracy_score(y_real, y_pred) ## From sklearn, get the accuracy
    precision = metrics.precision_score(y_real, y_pred, average='weighted') ## precision
    recall = metrics.recall_score(y_real, y_pred, average='weighted') ## recall
    f1 = metrics.f1_score(y_real, y_pred, average='weighted') ## F1
    
    ''' Print them into a readable format '''
    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f'F1:{f1}')
    
    return accuracy, precision, recall, f1 ## return them in case we need to save them in non-local variables

In [7]:
def Enriched_prep(features):
        
    assert(type(features)==list), "Please input your desired features for enrichment in a LIST format (even if only one\
    feature is being invoked!)"
    
    X = df_lyrics[features + ['lyrics_clean']]
    y = df_lyrics[['Genre']]
    
    X_train, X_test, y_train, y_test = train_test_split(X.copy(), y.copy(), test_size = 0.2, random_state = 42, stratify=y)
    
    
    if 'Trails' in features:
        scaler = MinMaxScaler()
        
        X_train['Trails'] = scaler.fit_transform(X_train['Trails'].values.reshape(-1, 1))
        X_test['Trails'] = scaler.transform(X_test['Trails'].values.reshape(-1, 1))
        
    lyrics_to_vec_train = vectorizer.fit_transform(X_train.lyrics_clean)
    lyrics_to_vec_test = vectorizer.transform(X_test.lyrics_clean)
    
    lyrics_to_vec_train = pd.DataFrame(lyrics_to_vec_train.toarray(), index = X_train.index)
    lyrics_to_vec_test = pd.DataFrame(lyrics_to_vec_test.toarray(),  index = X_test.index)
    
    X_train = pd.concat([lyrics_to_vec_train, X_train[features]], axis=1)
    X_test = pd.concat([lyrics_to_vec_test, X_test[features]], axis=1)
    
    return X_train, X_test, y_train, y_test

In [8]:
def Enriched_logit_reg(features):

    now = time()
    logreg = LogisticRegression(max_iter=200, solver='newton-cg',penalty='l2',
                               n_jobs = multiprocessing.cpu_count()-1, C=1)
    logreg.fit(X_train, y_train)
    print("Time elapsed in seconds: {}".format(time()-now), "\n")
    
    #print(apr(logreg.predict(X_test), y_test))
    
    return logreg

## Modelling

### Testing and training set

In [9]:
features = ['Trails', 'Rock', 'Pop', 'Hip_hop']
X_train, X_test, y_train, y_test = Enriched_prep(features)

### Logistic Regression

We perform a Logistic Regression based on the best parameters found in the [Model selection](./2%20-%20Logistic%20Regression%201%20-%20Model%20selection.ipynb) notebook, that is:
 - Data prepped using the Snowball Stemmer
 - Newton solver
 - L2 Penalty

In [None]:
model = Enriched_logit_reg(features)

We save the model

In [12]:
filename='logit_model.sav'

# Save the model
#pickle.dump(model, open(filename, 'wb'))

# Load the model
model = pickle.load(open(filename, 'rb'))

### Predictions

In [14]:
y_train['pred'] = model.predict(X_train) # Prediction for the training set
y_test['pred'] = model.predict(X_test) # Prediction for the testing set

all_pred = pd.concat([y_test['pred'], y_train['pred']], names = 'pred_logit') # Concatenate the predictions together

# Add all predictions to the main dataframe
df_lyrics = pd.merge(df_lyrics, all_pred, how='inner', left_index = True, right_index = True)

### Metrics

In [15]:
model.score(X_test,y_test['pred'])

1.0

In [16]:
model.score(X_train,y_train['pred'])

1.0

# Error Analysis

In [32]:
df_lyrics.loc[df_lyrics.pred != df_lyrics.Genre, 'Genre'].value_counts()

2    12558
0     3684
1     2945
Name: Genre, dtype: int64

Most misclassified songs are Pop songs.

In [33]:
missclassified = df_lyrics.loc[df_lyrics.pred != df_lyrics.Genre, 'Genre'].count()
missclassified_pop = df_lyrics.loc[df_lyrics.pred != df_lyrics.Genre, 'Genre'].value_counts()[2]
print(f'{missclassified/df_lyrics.shape[0]:.0%} of the songs were misclassified.')

print(f'Among the missclassified songs, {missclassified_pop/missclassified:.0%} were pop songs.')

24% of the songs were misclassified.
Among the missclassified songs, 65% were pop songs.
