## Dependencies

In [175]:
#Dependencies
import pandas as pd
import numpy as np
import pickle

## Reading data

In [176]:
#assigning paths
lyrics = "lyrics.csv"
tracks = "tracks.csv"

In [112]:
#turning csvs into dataframes
lyrics_df = pd.read_csv(lyrics)
tracks_df = pd.read_csv(tracks)

## Cleansing data

In [113]:
#Dropping unwanted columns
lyrics_df.drop(labels = ['lyrics_id', 'track_id', 'lyrics_language', 'updated_time'], axis = 1, inplace = True)
tracks_df.drop(labels = ['lyrics_id', 'track_id', 'first_release_date', 'album_id', 'artist_id', 'explicit'], axis = 1, inplace = True)

In [114]:
#merge tracks & lyrics together
lyrics_tracks = pd.merge(tracks_df, lyrics_df, on = "id")

In [115]:
#keeping only tracks that have lyrics and that are english
lyrics_tracks = lyrics_tracks[(lyrics_tracks['has_lyrics'] == 1) & (lyrics_tracks['lyrics_language_description'] == "English")]

In [116]:
#dropping unwanted columns
lyrics_tracks.drop(labels = ['id', 'has_lyrics', 'lyrics_language_description'], axis = 1, inplace = True)

In [117]:
#combining all text into one column
lyrics_tracks['combined_text'] = lyrics_tracks['artist_name'] + " " + lyrics_tracks['track_name'] + ' ' + lyrics_tracks['lyrics_body']

In [118]:
#dropping unwanted columns
lyrics_tracks.drop(labels = ['artist_name', 'track_name', 'lyrics_body'], axis = 1, inplace = True)

In [119]:
#dropping all null values
lyrics_tracks.dropna(inplace = True)

In [120]:
#checking the cleaned data frame
lyrics_tracks.head()

Unnamed: 0,explicit,combined_text
0,1,"Drake In My Feelings Trap, TrapMoneyBenny\nThi..."
1,1,"DJ Khaled feat. Justin Bieber, Quavo & Chance ..."
2,1,Maroon 5 feat. Cardi B Girls Like You Spent tw...
4,1,6ix9ine feat. Nicki Minaj & Murda Beatz FEFE I...
5,0,Kenny Chesney Get Along Met a man wearin' a t-...


## Vectorize the data

In [121]:
#dependencies
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, PorterStemmer
import nltk
import string
import re
from sklearn.feature_extraction.text import CountVectorizer

In [122]:
# defining the stopwords
stopword = stopwords.words('english')

#instantiating lemmatization and stemming objects
wn = WordNetLemmatizer()
ps = PorterStemmer()

In [123]:
def clean_text(text):
    '''
    Function accepts a text input and does three things:
    1. Removes punctuation
    2. Splits into tokens
    3. Removes tokens that are stopwords, conducts stemming, and joins together into a single string
    '''
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = " ".join([ps.stem(word) for word in tokens if word not in stopword])
    return text

In [124]:
#cleans text using clean_text() function
lyrics_tracks['body_text_clean'] = lyrics_tracks['combined_text'].apply(lambda x: clean_text(x))

In [127]:
#check the dataframe
lyrics_tracks.head()

Unnamed: 0,explicit,combined_text,body_text_clean
0,1,"Drake In My Feelings Trap, TrapMoneyBenny\nThi...",drake feel trap trapmoneybenni shit got feel g...
1,1,"DJ Khaled feat. Justin Bieber, Quavo & Chance ...",dj khale feat justin bieber quavo chanc rapper...
2,1,Maroon 5 feat. Cardi B Girls Like You Spent tw...,maroon 5 feat cardi b girl like spent twentyfo...
4,1,6ix9ine feat. Nicki Minaj & Murda Beatz FEFE I...,6ix9in feat nicki minaj murda beatz fefe fucki...
5,0,Kenny Chesney Get Along Met a man wearin' a t-...,kenni chesney get along met man wearin tshirt ...


In [132]:
#Instantiate a countvectorizer object
ngram_vect = CountVectorizer(ngram_range = (1,2))

In [133]:
#fit and transform the model on the cleansed column
X_counts = ngram_vect.fit_transform(lyrics_tracks['body_text_clean'])

In [177]:
#check the model shape and features
print(X_counts.shape)
print(ngram_vect.get_feature_names())

(1235, 56421)
['000', '000 deep', '000 kick', '03', '03 stand', '04', '04 v6', '10', '10 caus', '10 feet', '10 look', '10 oclock', '10 pm', '10 rich', '10 whiskey', '10 your', '100', '100 billion', '100 im', '100 proof', '100 real', '100 shade', '1000', '1000 feet', '1035', '1035 hey', '109', '109 come', '11', '11 12', '11 aint', '11 time', '1115', '1115 alright', '12', '12 03', '12 get', '12 step', '1218', '1218 miss', '1230', '1230 flight', '1234', '1234 fight', '1236', '1236 wish', '12pack', '12pack tank', '12th', '12th floor', '13', '13 hello', '14', '14 go', '1409617854061', '14th', '14th februari', '14th octob', '15', '15 aint', '15 hang', '15 hundr', '15 minut', '150', '150 dont', '16', '16 16', '16 drive', '16 go', '16 home', '16 wonka', '16 yeah', '16 year', '18', '18 girl', '182', '182 youd', '1966', '1966 might', '1970', '1970 handmedown', '1975', '1975 accabonac', '1975 love', '1986', '1986 might', '1991', '1991 outta', '1992', '1992 morn', '1997', '1997 queen', '1da', '1da

In [138]:
#Put back into a dataframe
X_counts_df = pd.DataFrame(X_counts.toarray())
X_counts_df.columns = ngram_vect.get_feature_names()

In [150]:
#Check the X array
X_counts_df.head()

Unnamed: 0,000,000 deep,000 kick,03,03 stand,04,04 v6,10,10 caus,10 feet,...,zombieiei lyric,zone,zone cant,zone got,zone im,zone look,zone talk,zone tell,zooey,zooey deschanel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [139]:
#Assign the target to y variable
y = lyrics_tracks['explicit']

In [201]:
vectorized_df = X_counts_df.copy()
vectorized_df['target'] = y
vectorized_df.to_csv('vectorized.csv')

## Machine learning

In [140]:
#dependencies
from sklearn.model_selection import train_test_split

#Train / test split
X_train, X_test, y_train, y_test = train_test_split(X_counts_df, y, stratify=y)

### Logistic Regression

In [155]:
#Dependencies
from sklearn.linear_model import LogisticRegression

#instantiate a logistic regression model
log_classifier = LogisticRegression()

In [157]:
#fit the classifier to the train dataset
log_classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [158]:
#check the scores of teh model
print(f"Training Data Score: {log_classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {log_classifier.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8058252427184466


In [160]:
#create predictions from the model
log_predictions = log_classifier.predict(X_test)

In [161]:
#print a classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, log_predictions))

             precision    recall  f1-score   support

          0       0.81      0.95      0.87       215
          1       0.80      0.48      0.60        94

avg / total       0.81      0.81      0.79       309



In [191]:
#Pickle the model
joblib.dump(log_classifier, 'log_classifier.pkl')

['log_classifier.pkl']

### Random Forest

In [170]:
# Create a Random forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50)

In [171]:
# Fit the classifier to the data
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [172]:
#Check the score of the model
rf.score(X_test, y_test)

0.79935275080906154

In [173]:
#create predictions from the model
rf_predictions = rf.predict(X_test)

In [174]:
#print a classification report
print(classification_report(y_test, rf_predictions))

             precision    recall  f1-score   support

          0       0.78      1.00      0.87       215
          1       0.97      0.35      0.52        94

avg / total       0.84      0.80      0.76       309



### SVM

In [178]:
# Create a support vector machine linear classifer and fit it to the training data
from sklearn.svm import SVC

In [179]:
svm = SVC(kernel = "sigmoid", probability = True)
svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [180]:
svm.score(X_test, y_test)

0.69579288025889963

In [181]:
# Print the model score using the test data
svm_predictions = svm.predict(X_test)

In [182]:
#print a classification report
print(classification_report(y_test, svm_predictions))

             precision    recall  f1-score   support

          0       0.70      1.00      0.82       215
          1       0.00      0.00      0.00        94

avg / total       0.48      0.70      0.57       309



  'precision', 'predicted', average, warn_for)


In [None]:
#Look at the probability prediction
pd.DataFrame(model.predict_proba(X_test), columns = ["Category 0", "Category 1"])

### Grid Search

In [183]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [1, 5, 10],
    'gamma': [0.0001, 0.001, 0.01]
}
grid = GridSearchCV(svm, param_grid, verbose = 3)

In [184]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.6957928802588996, total= 2.0min
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.6957928802588996, total= 1.9min
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.5min remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.6948051948051948, total= 1.9min
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.7378640776699029, total= 1.9min
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.7378640776699029, total= 1.9min
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.7402597402597403, total= 1.9min
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8090614886731392, total= 1.5min
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.7540453074433657, total= 1.4min
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.7532467532467533, total= 1.3min
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 53.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [185]:
# List the best parameters for this dataset
print(grid.best_params_)

{'C': 10, 'gamma': 0.001}


In [186]:
# Make predictions with the hypertuned model
grid_predictions = grid.predict(X_test)

In [187]:
print(classification_report(y_test, grid_predictions))

             precision    recall  f1-score   support

          0       0.80      0.91      0.85       215
          1       0.71      0.49      0.58        94

avg / total       0.77      0.78      0.77       309



In [192]:
from sklearn.externals import joblib
joblib.dump(grid, 'grid.pkl') 

['grid.pkl']

In [190]:
grid = joblib.load('grid.pkl')

### KNN

In [195]:
from sklearn.neighbors import KNeighborsClassifier

In [196]:
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    
    
plt.plot(range(1, 20, 2), train_scores, marker='o')
plt.plot(range(1, 20, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

k: 1, Train/Test Score: 1.000/0.731
k: 3, Train/Test Score: 0.802/0.718
k: 5, Train/Test Score: 0.739/0.718
k: 7, Train/Test Score: 0.728/0.722
k: 9, Train/Test Score: 0.721/0.715
k: 11, Train/Test Score: 0.717/0.702
k: 13, Train/Test Score: 0.712/0.696
k: 15, Train/Test Score: 0.708/0.699
k: 17, Train/Test Score: 0.720/0.706
k: 19, Train/Test Score: 0.716/0.702


NameError: name 'plt' is not defined