In [1]:
import json
import time

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack

from gensim import corpora, models, similarities

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
with open("lyrics_rock.json",'r') as load_f:
    rock = json.load(load_f)
    
with open("lyrics_folk.json",'r') as load_f:
    folk = json.load(load_f)

In [3]:
rock_corpus = []

for key in rock:
    rock_corpus.append(rock[key].strip().split(' '))
    
folk_corpus = []

for key in folk:
    folk_corpus.append(folk[key].strip().split(' '))
    
test_corpus = folk_corpus[-1]
folk_corpus = folk_corpus[:-1]

In [4]:
dictionary = corpora.Dictionary(rock_corpus + folk_corpus)

In [5]:
corpus = [dictionary.doc2bow(doc) for doc in rock_corpus+folk_corpus]
test_corpus = dictionary.doc2bow(test_corpus)

In [6]:
tfidf = models.TfidfModel(corpus)

In [7]:
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))
sim = index[tfidf[test_corpus]]

In [8]:
np.array(sim[834:]).mean()

0.01970354

In [9]:
np.array(sim[:834]).mean()

0.012341033

In [10]:
rock_test = [1 for i in range(834)]
folk_test = [0 for i in range(67)]
labels = rock_test + folk_test

In [11]:
text = []

for x in rock_corpus:
    text.append(' '.join(x))
    
for x in folk_corpus:
    text.append(' '.join(x))

In [12]:
vectorizer = CountVectorizer()
cif = vectorizer.fit_transform(text)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(cif)

In [13]:
x_train,x_test,y_train,y_test = train_test_split(tfidf.toarray(),labels)

In [14]:
knn_classifier=KNeighborsClassifier(6)
knn_classifier.fit(x_train,y_train)
y_predict=knn_classifier.predict(x_test)
knn_classifier.score(x_test,y_test)

0.9203539823008849

In [15]:
bayes_classifier=MultinomialNB()
bayes_classifier.fit(x_train,y_train)
y_predict=bayes_classifier.predict(x_test)
bayes_classifier.score(x_test,y_test)

0.9203539823008849

In [16]:
svm_classifier=SVC()
svm_classifier.fit(x_train,y_train)
y_predict=svm_classifier.predict(x_test)
svm_classifier.score(x_test,y_test)



0.9203539823008849

In [17]:
length = []

for x in rock_corpus:
    length.append(len(x))
    
for x in folk_corpus:
    length.append(len(x))

In [18]:
new_X = hstack([tfidf.toarray(), csr_matrix(length).T], 'csr')
x_train,x_test,y_train,y_test = train_test_split(tfidf.toarray(),labels)

In [19]:
knn_classifier=KNeighborsClassifier(6)
knn_classifier.fit(x_train,y_train)
y_predict=knn_classifier.predict(x_test)
knn_classifier.score(x_test,y_test)

0.9690265486725663

In [20]:
bayes_classifier=MultinomialNB()
bayes_classifier.fit(x_train,y_train)
y_predict=bayes_classifier.predict(x_test)
bayes_classifier.score(x_test,y_test)

0.9690265486725663

In [21]:
svm_classifier=SVC()
svm_classifier.fit(x_train,y_train)
y_predict=svm_classifier.predict(x_test)
svm_classifier.score(x_test,y_test)



0.9690265486725663