In [1]:
# Data Manipulation
#-----------------------------
import pandas as pd

# scikit-learn packages
#-----------------------------
# Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

# Modelling
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, precision_score


# Other packages
#-----------------------------
import pickle


# Import the dataframe

In [5]:
# Load the cleaned lyrics table. keep_default_na=False stops pandas from
# converting strings such as '' or 'NA' into NaN while parsing.
lyrics_csv = './Dataset/clean_lyrics.csv'
df_lyrics = pd.read_csv(lyrics_csv, keep_default_na = False)
df_lyrics.head()

Unnamed: 0,SName,Lyric,Artist,Genre,multiple_letter,Trails,lyrics_clean,Rock,Pop,Hip_hop
0,World So Cold,"It starts with pain, followed by hate. Fueled ...",12 Stones,0,0.0,0,start pain follow hate fuel endless question o...,1,0,1
1,Broken,Freedom!. Alone again again alone. Patiently w...,12 Stones,0,0.0,0,freedom alon alon patient wait phone hope call...,1,1,0
2,3 Leaf Loser,"Biting the hand that feeds you, lying to the v...",12 Stones,0,0.0,0,bite hand feed lie voic insid reach beg someth...,1,1,0
3,Anthem For The Underdog,You say you know just who I am. But you can't ...,12 Stones,0,0.0,2,say know imagin wait across line thought still...,1,0,0
4,Adrenaline,My heart is beating faster can't control these...,12 Stones,0,0.007042,0,heart beat faster control feel anymor wait lon...,0,0,0


# Prepping the data for modelling

## Word Vectorizer Instantiation

In [100]:
# Bag-of-words settings: word-level tokens, no custom tokenizer/preprocessor,
# no stop-word removal, vocabulary capped at 1000 terms.
vectorizer_options = {
    "analyzer": "word",
    "tokenizer": None,
    "preprocessor": None,
    "stop_words": None,
    "max_features": 1000,
}
vectorizer = CountVectorizer(**vectorizer_options)

## Function Declaration

In [101]:
def Enriched_prep(features, model = None):
    """Build train/test design matrices from lyric word counts plus extra features.

    Reads the module-level ``df_lyrics`` dataframe and (re)fits the
    module-level ``vectorizer`` on the training lyrics only; the test lyrics
    are transformed with that fitted vocabulary.

    Parameters
    ----------
    features : list of str
        Columns of ``df_lyrics`` to append to the word-count columns. Must be
        a list, even when a single feature is requested.
    model : str, optional
        When equal to ``'logit'`` and ``'Trails'`` is among ``features``, the
        ``Trails`` column is min-max scaled (scaler fitted on the training
        split only). Any other value leaves the features unscaled.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Word-count columns (labelled with integer positions) followed by the
        requested feature columns, indexed like the corresponding split.
    y_train, y_test : pandas.Series
        The ``Genre`` column for each split.

    Raises
    ------
    AssertionError
        If ``features`` is not a list.
    """
    # Adjacent string literals are used instead of a backslash continuation so
    # the source indentation does not end up inside the message.
    assert isinstance(features, list), (
        "Please input your desired features for enrichment in a LIST format "
        "(even if only one feature is being invoked!)"
    )

    X = df_lyrics[features + ['lyrics_clean']]
    y = df_lyrics['Genre']

    # Stratified 80/20 split with a fixed seed; copies are passed so the
    # assignments below never write back into df_lyrics.
    X_train, X_test, y_train, y_test = train_test_split(
        X.copy(), y.copy(), test_size = 0.2, random_state = 42, stratify=y
    )

    if model == 'logit' and ('Trails' in features):
        # Imported locally: the notebook's import cell does not bring
        # MinMaxScaler into scope, so this branch used to raise NameError.
        from sklearn.preprocessing import MinMaxScaler

        scaler = MinMaxScaler()

        # The scaler works on (n, 1) arrays; ravel() flattens its output back
        # to a 1-D column before assignment.
        X_train['Trails'] = scaler.fit_transform(X_train['Trails'].values.reshape(-1, 1)).ravel()
        X_test['Trails'] = scaler.transform(X_test['Trails'].values.reshape(-1, 1)).ravel()

    lyrics_to_vec_train = vectorizer.fit_transform(X_train.lyrics_clean)
    lyrics_to_vec_test = vectorizer.transform(X_test.lyrics_clean)

    # Densify the sparse count matrices and keep the original row index so the
    # concat below aligns rows with the extra feature columns.
    lyrics_to_vec_train = pd.DataFrame(lyrics_to_vec_train.toarray(), index = X_train.index)
    lyrics_to_vec_test = pd.DataFrame(lyrics_to_vec_test.toarray(), index = X_test.index)

    X_train = pd.concat([lyrics_to_vec_train, X_train[features]], axis=1)
    X_test = pd.concat([lyrics_to_vec_test, X_test[features]], axis=1)

    return X_train, X_test, y_train, y_test

In [None]:
# Extra dataframe columns to append to the word-count matrix.
features = ['Trails', 'Rock', 'Pop', 'Hip_hop']
splits = Enriched_prep(features)
X_train, X_test, y_train, y_test = splits

In [67]:
# Preview the first five rows of the training design matrix.
X_train.head(n = 5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,994,995,996,997,998,999,Trails,Rock,Pop,Hip_hop
78069,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
61432,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,74,0,0,1
67672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40523,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
45104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [87]:
# The word-count columns are labelled with integers while the appended feature
# columns are labelled with strings; scikit-learn's feature-name validation
# expects uniformly string labels (recent versions raise on a mix).
# Passing ``str`` as the mapper converts every label -- the already-string
# feature names are unchanged -- so the cell no longer hard-codes how many
# feature columns were appended, and rebinding avoids in-place mutation.
X_train = X_train.rename(columns=str)

X_test = X_test.rename(columns=str)

We instantiate the random forest classifier.

In [72]:
# Base estimator for the grid search. random_state is fixed (same seed as the
# train/test split) so the search scores are reproducible across runs.
rf = RandomForestClassifier(random_state = 42)

# Modelling
## Grid Search

We perform a grid search to find the best parameters.

In [75]:
# Hyper-parameter grid for the search: number of trees x maximum tree depth
# (None leaves the depth unlimited).
rf_params = dict(
    n_estimators = [100, 200, 300],
    max_depth = [None, 3, 5, 7],
)

In [88]:
# 5-fold cross-validated search over rf_params; fit() returns the fitted
# search object, so construction and fitting are chained.
gs = GridSearchCV(estimator = rf, param_grid = rf_params, cv = 5).fit(X_train, y_train)

# Mean cross-validated score of the best parameter combination.
print(gs.best_score_)

0.7556661199506893


In [89]:
# Parameter combination that achieved the best cross-validated score.
best_params = gs.best_params_
best_params

{'max_depth': None, 'n_estimators': 200}

## Random forest

We can now fit a random forest using the best parameters found by the grid search.

In [90]:
# Object instantiation, using the best parameters. They are taken directly
# from the grid search instead of being copied by hand, so this cell cannot
# drift from the search result; random_state makes the fit reproducible.
rf_model = RandomForestClassifier(**gs.best_params_, random_state = 42)

rf_model.fit(X_train, y_train) # Model fitting

In [92]:
# Training accuracy
train_accuracy = rf_model.score(X_train, y_train)
train_accuracy

0.9967577436414001

In [93]:
# Testing accuracy
test_accuracy = rf_model.score(X_test, y_test)
test_accuracy

0.7545483160214038

The accuracy is far higher on the training set than on the testing set, which indicates that the model is overfitting.

## Save model

In [94]:
# save the model to disk
filename = 'randomForest_model.sav'
# The context manager guarantees the file handle is flushed and closed, even
# if pickling fails; a bare open() passed to pickle.dump is never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model, model_file)