In [7]:
import pandas as pd
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.metrics import make_scorer, accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from string import punctuation
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [2]:
SAMPLE = pd.read_csv('SAMPLE_out.csv')
SAMPLE.head()

Unnamed: 0.1,Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,0,December 4th,rap,JAY-Z,2003,283714,{},produced blaze intro gloria carter shawn carte...,15,en,en,en
1,1,Mr. Carter,rap,Lil Wayne,2008,542488,{JAY-Z},produced infamous drew correa intro lil wayne ...,126,en,en,en
2,2,Warning,rap,The Notorious B.I.G.,1994,617475,{},produced easy mo bee verse notorious pop fuck ...,29,en,en,en
3,3,Juicy,rap,The Notorious B.I.G.,1994,3528473,{},intro notorious fuck hoes get grip motherfucke...,43,en,en,en
4,4,D.O.A. Death of Auto-Tune,rap,JAY-Z,2009,261602,{},produced intro la da da da hey hey hey goodbye...,44,en,en,en


In [14]:
class Classifier:
    def __init__(self, X, y, model_name):
        self.model_name = model_name
        self.model = None
        self.X = X
        self.y = y
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2)
        vectorizer = CountVectorizer()
        vectorizer.fit(self.x_train)
        #below are the vectorized word tokens used for training
        self.X_train = vectorizer.transform(self.x_train)
        self.X_test = vectorizer.transform(self.x_test)

    def fit(self, tune = False):

        '''
        Returns the properly fit model depending on the type of classifier we initialize

        Includes option to tune models to evaluate if they improve performance

        returns none

        '''
        if self.model_name == 'rf':
            if tune:
                param_grid = {'n_estimators': [50, 100, 150],
                'max_depth': [None, 5, 10],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]}

                Grid = GridSearchCV(RandomForestClassifier(random_state=42), 
                            param_grid = param_grid,
                            scoring='accuracy',
                            cv=5,
                            n_jobs=-1,
                           )
                self.model= Grid.fit(self.X_train, self.y_train)
            else:
                self.model = RandomForestClassifier(random_state=42).fit(self.X_train, self.y_train)
            

        if self.model_name == 'bnb':
            if tune:
                param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
                              'fit_prior': [True, False]}

                grid_search = GridSearchCV(BernoulliNB(),
                    param_grid=param_grid,
                    scoring = 'accuracy',
                    n_jobs = -1,
                    cv = 5
                )

                self.model = grid_search.fit(self.X_train, self.y_train)
            else:
                self.model = BernoulliNB().fit(self.X_train, self.y_train)

        if self.model_name == 'gnb':
            if tune:
                param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

                grid_search = GridSearchCV(GaussianNB(),
                    param_grid=param_grid,
                    scoring = 'accuracy',
                    n_jobs = -1,
                    cv = 5
                )

                self.model = grid_search.fit(self.X_train, self.y_train)
            else:
                self.model = BernoulliNB().fit(self.X_train, self.y_train)
    
    def predict(self):
        '''
            Returns set of predictions for the test set
        '''
        return self.model.predict(self.X_test)
    
    def score(self):
        '''
            Returns accuracy score of the model and the classification report
        '''
        y_pred = self.predict()
        acc_score = accuracy_score(self.y_test, y_pred)
        class_report = classification_report(self.y_test, y_pred)
        return acc_score, class_report
    
    def plot(self):
            '''
                Returns feature importances and plots, also visual of classification score for all 5 genres aka tags
            '''
        




In [17]:
X = SAMPLE['lyrics']
y = SAMPLE['tag']

RF = Classifier(X,y,'rf')
RF.fit(tune = False)

accuracy_rf = RF.score()[0]
classification_report_rf = RF.score()[1]

print("The accuracy achieved by random forest model:", accuracy_rf)
print('#'*60)
print("The classification report of random forest model: \n", classification_report_rf)


The accuracy achieved by random forest model: 0.6158333333333333
############################################################
The classification report of random forest model: 
               precision    recall  f1-score   support

     country       0.51      0.66      0.58       208
        misc       0.84      0.92      0.87       217
         pop       0.47      0.36      0.41       200
         rap       0.85      0.91      0.88       207
          rb       0.49      0.42      0.45       195
        rock       0.40      0.35      0.37       173

    accuracy                           0.62      1200
   macro avg       0.59      0.60      0.59      1200
weighted avg       0.60      0.62      0.60      1200



In [18]:
BNB = Classifier(X,y,'bnb')
BNB.fit(tune = False)

accuracy_bnb = BNB.score()[0]
classification_report_bnb = BNB.score()[1]

print("The accuracy achieved by random forest model:", accuracy_bnb)
print('#'*60)
print("The classification report of random forest model: \n", classification_report_bnb)

The accuracy achieved by random forest model: 0.5533333333333333
############################################################
The classification report of random forest model: 
               precision    recall  f1-score   support

     country       0.51      0.86      0.64       223
        misc       0.95      0.31      0.46       186
         pop       0.59      0.22      0.32       199
         rap       0.86      0.84      0.85       190
          rb       0.47      0.62      0.54       195
        rock       0.36      0.44      0.40       207

    accuracy                           0.55      1200
   macro avg       0.63      0.55      0.54      1200
weighted avg       0.62      0.55      0.54      1200



In [19]:
GNB = Classifier(X,y,'bnb')
GNB.fit(tune = False)

accuracy_gnb = GNB.score()[0]
classification_report_gnb = GNB.score()[1]

print("The accuracy achieved by random forest model:", accuracy_gnb)
print('#'*60)
print("The classification report of random forest model: \n", classification_report_gnb)

The accuracy achieved by random forest model: 0.5141666666666667
############################################################
The classification report of random forest model: 
               precision    recall  f1-score   support

     country       0.34      0.92      0.49       189
        misc       1.00      0.36      0.53       214
         pop       0.48      0.22      0.30       189
         rap       0.93      0.82      0.87       200
          rb       0.54      0.51      0.52       210
        rock       0.36      0.27      0.31       198

    accuracy                           0.51      1200
   macro avg       0.61      0.52      0.50      1200
weighted avg       0.62      0.51      0.51      1200



In [20]:
#lets do tuned models
RF_tune = Classifier(X,y,'rf')
RF_tune.fit(tune = True)

accuracy_rf = RF_tune.score()[0]
classification_report_rf = RF_tune.score()[1]

print("The accuracy achieved by random forest model:", accuracy_rf)
print('#'*60)
print("The classification report of random forest model: \n", classification_report_rf)

BNB_tune = Classifier(X,y,'bnb')
BNB_tune.fit(tune = True)

accuracy_bnb = BNB_tune.score()[0]
classification_report_bnb = BNB_tune.score()[1]

print("The accuracy achieved by random forest model:", accuracy_bnb)
print('#'*60)
print("The classification report of random forest model: \n", classification_report_bnb)

GNB_tune = Classifier(X,y,'bnb')
GNB_tune.fit(tune = True)

accuracy_gnb = GNB_tune.score()[0]
classification_report_gnb = GNB_tune.score()[1]

print("The accuracy achieved by random forest model:", accuracy_gnb)
print('#'*60)
print("The classification report of random forest model: \n", classification_report_gnb)


The accuracy achieved by random forest model: 0.6033333333333334
############################################################
The classification report of random forest model: 
               precision    recall  f1-score   support

     country       0.51      0.74      0.61       196
        misc       0.78      0.93      0.84       188
         pop       0.46      0.36      0.41       195
         rap       0.75      0.93      0.83       174
          rb       0.49      0.50      0.50       208
        rock       0.62      0.28      0.39       239

    accuracy                           0.60      1200
   macro avg       0.60      0.62      0.60      1200
weighted avg       0.60      0.60      0.58      1200

The accuracy achieved by random forest model: 0.5533333333333333
############################################################
The classification report of random forest model: 
               precision    recall  f1-score   support

     country       0.46      0.81      0.58   

In [3]:
x_train, x_test, y_train, y_test = train_test_split(SAMPLE['lyrics'], SAMPLE['tag'], test_size=0.2)
print("Split Created")
vectorizer = CountVectorizer()
vectorizer.fit(x_train)
print("Vectorized")
X_training = vectorizer.transform(x_train)
X_testing = vectorizer.transform(x_test)
print("Transformed")
RF = RandomForestClassifier()
RF.fit(X_training, y_train)
print("Fitted")
y_pred = RF.predict(X_testing)

accuracy_test = accuracy_score(y_test, y_pred)

Split Created
Vectorized
Transformed
Fitted
