In [1]:
# Import the libraries needed for the analysis
import requests
import numpy
import re
import binascii
import itertools
from time import time
import random
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
import seaborn as sns

##### Create classes to organize the methods used in the process

In [9]:
class BdaTools:
    """Helpers for downloading and parsing the musiXmatch bag-of-words files."""

    def download_remote_file(self, url):
        """
        Download ``url`` and store it in the current working directory.

        The local file name is the last '/'-separated segment of the URL.

        Returns:
            The local file name the content was written to.

        Raises:
            ValueError: if the URL ends with '/' and so yields no file name.
            requests.HTTPError: if the server answers with an error status.
        """
        local_filename = url.split('/')[-1]
        if not local_filename:
            raise ValueError(f'cannot derive a file name from url: {url!r}')

        # Stream the body so large files are never held in memory at once.
        with requests.get(url, stream=True) as req:
            # Fail loudly instead of saving an HTML error page as the dataset.
            req.raise_for_status()
            with open(local_filename, 'wb') as file:
                for content in req.iter_content(chunk_size=8192):
                    file.write(content)

        return local_filename

    @staticmethod
    def _parse_counts(pairs):
        """Convert ['1:6', '2:4'] style tokens into the dict {1: 6, 2: 4}."""
        counts = {}
        for pair in pairs:
            if not pair.strip():
                continue  # tolerate a trailing comma
            index, _, count = pair.partition(':')
            counts[int(index)] = int(count)
        return counts

    def read_lyrics(self, filename):
        """
        Parse a bag-of-words text file.

        Every line before the first line starting with '%' is ignored. The
        '%' line holds the comma-separated vocabulary. Each following
        non-blank line is expected to look like
        ``<track id>,<musiXmatch id>,<word index>:<count>,...``.

        Returns:
            A tuple ``(words, content, track_id, mxm_tid)`` where ``words`` is
            the vocabulary list, ``content`` is a list with one
            ``{word index: count}`` dict per track, and ``track_id`` /
            ``mxm_tid`` are the two ID columns as lists of strings.

        Raises:
            ValueError: if a data line lacks the two ID fields or holds a
                token that is not ``<int>:<int>``.
        """
        words = []
        content = []   # word index : word count for each track
        track_id = []  # first ID column of every data line
        mxm_tid = []   # second ID column of every data line
        header_found = False

        # The context manager closes the file even if parsing raises.
        with open(filename, "r", encoding="utf-8") as read_file:
            for line in read_file:
                if not header_found:
                    if line.startswith('%'):
                        # Drop the leading '%' and split the vocabulary.
                        words = [word.strip() for word in line[1:].split(',')]
                        header_found = True
                    continue

                line = line.strip()
                if not line:
                    continue  # skip blank lines between/after records

                fields = line.split(',')
                if len(fields) < 2:
                    raise ValueError(f'malformed track line: {line!r}')

                track_id.append(fields[0])
                # Only the second field: slicing up to the second comma would
                # also drag the track id along.
                mxm_tid.append(fields[1])
                # The file comes from a remote server, so the counts are parsed
                # explicitly rather than passed to eval().
                content.append(self._parse_counts(fields[2:]))

        return (words, content, track_id, mxm_tid)

    @staticmethod
    def create_vectors(list_dict, num_words):
        """
        Build one presence vector per track.

        A staticmethod, so it can be called both as
        ``BdaTools.create_vectors(...)`` and on an instance.

        Args:
            list_dict: iterable of ``{word index: count}`` dicts whose word
                indices start at 1.
            num_words: length of every vector (the vocabulary size).

        Returns:
            A list of float64 NumPy vectors of length ``num_words``; position
            ``i - 1`` is 1 when word index ``i`` occurs in the track, else 0.

        Raises:
            IndexError: if a word index lies outside ``1..num_words``.
        """
        x = []
        for d in list_dict:
            temp = numpy.zeros(num_words, dtype=numpy.float64)
            for key in d:
                # Reject out-of-range keys; key 0 would otherwise wrap around
                # to the last slot through negative indexing.
                if not 1 <= key <= num_words:
                    raise IndexError(
                        f'word index {key} is outside 1..{num_words}')
                temp[key - 1] = 1  # indexing in the data starts at 1
            x.append(temp)

        return x
    
# organize methods for similarity analysis
class SimilarityTools:
    """Shingling, Jaccard similarity and min-hash helpers."""

    maxShingleID = 2**32-1  # record the maximum shingle ID that we assigned
    nextPrime = 4294967311  # next prime number after maxShingleID
    # hash tables (multihash)
    max_hash1 = 5 * 1000000 -673
    max_hash2 = 5 * 1000000 +673

    def generate_prime(self, divisor=1, nsig=None, bands=None, rows=None):
        """
        Draw random integer coefficients in ``[0, nextPrime / divisor)``.

        Despite the name, the values are uniformly random integers, not
        primes.

        The shape of the result is chosen as follows:
          * ``(nsig,)``       when neither ``bands`` nor ``rows`` is given,
          * ``(bands,)``      when only ``bands`` is given,
          * ``(bands, rows)`` otherwise.

        Returns:
            A NumPy ``int64`` array. The dtype is pinned in every branch
            because the upper bound exceeds 2**32 and would not fit a 32-bit
            default integer.
        """
        if not bands and not rows:
            size = (nsig,)
        elif not rows:
            size = (bands,)
        else:
            size = (bands, rows)

        # An integer bound avoids handing a float to randint.
        upper = int(self.nextPrime / divisor)
        return numpy.random.randint(0, upper, size=size, dtype=numpy.int64)

    def get_shingles(self, lyrics, k=5):
        """
        Return the set of CRC32 codes of all ``k``-character shingles.

        A text shorter than ``k`` characters yields an empty set.
        """
        shingles = set()
        for i in range(len(lyrics) - k + 1):
            shingle = lyrics[i:i+k]
            shingles.add(binascii.crc32(shingle.encode('utf-8')))
        return shingles

    def get_shingles_vectors(self, lyricss, sample=100, k=5):
        """Return one list of shingle codes for each of the first ``sample`` texts."""
        return [list(self.get_shingles(lyrics, k=k))
                for lyrics in lyricss[:sample]]

    def jaccard_similarity_score(self, x, y):
        """
        Jaccard Similarity J (A,B) = | Intersection (A,B) | /
                                        | Union (A,B) |

        Both arguments are converted to sets first. Two empty inputs are
        treated as identical and score 1.0; this avoids a ZeroDivisionError
        and matches what ``get_minhash_candidates`` estimates for two texts
        without shingles (their untouched signatures are equal).
        """
        set_x, set_y = set(x), set(y)
        union_cardinality = len(set_x | set_y)
        if union_cardinality == 0:
            return 1.0
        return len(set_x & set_y) / float(union_cardinality)

    def minhash_vectorized(self, shingles, A, B, nextPrime, maxShingleID, nsig):
        """
        Compute a min-hash signature of length ``nsig``.

        For every shingle ID all hash functions
        ``((A * id + B) % nextPrime) % maxShingleID`` are evaluated at once
        and the element-wise minimum is kept. With no shingles the signature
        stays at its start value ``maxShingleID + 1``.

        NOTE(review): if ``A`` is an int64 array with values close to 2**32
        (as ``generate_prime`` can return) the product ``A * id`` may wrap
        around for large shingle IDs; confirm whether ``divisor`` is meant to
        prevent that.
        """
        # Start above every possible hash code so the first minimum always wins.
        signature = numpy.ones((nsig,)) * (maxShingleID + 1)

        for ShingleID in shingles:
            hashCodes = ((A*ShingleID + B) % nextPrime) % maxShingleID
            numpy.minimum(signature, hashCodes, out=signature)

        return signature

    def get_minhash_candidates(self, domain, A, B, nextPrime, maxShingleID, k=3, s=0.5, nsig=50):
        """
        Return index pairs ``(i, j)``, ``i < j``, of texts that look similar.

        Each text in ``domain`` is shingled with length ``k`` and min-hashed;
        a pair is a candidate when the fraction of equal signature entries
        (an estimate of the Jaccard similarity) is at least ``s``.
        """
        signatures = []  # signatures for all files
        for lyrics in domain:
            shingles = self.get_shingles(lyrics, k=k)
            signatures.append(
                self.minhash_vectorized(shingles, A, B, nextPrime, maxShingleID, nsig))

        Nfiles = len(signatures)
        candidates = []
        for i in range(Nfiles):
            for j in range(i+1, Nfiles):
                # share of agreeing signature entries ~ Jaccard similarity
                Jsim = numpy.mean(signatures[i] == signatures[j])
                if Jsim >= s:
                    candidates.append((i, j))
        return candidates
  

In [10]:
# Create the BdaTools helper instance; the cells below call its
# download_remote_file and read_lyrics methods through `tools`.
tools = BdaTools()

In [5]:
# Remote locations of the musiXmatch test and train splits.
musiXmatch_test_data_url = "https://people.arcada.fi/~fentawaw/mxm_dataset_test.txt"
musiXmatch_train_data_url = "https://people.arcada.fi/~fentawaw/mxm_dataset_train.txt"

# Fetch both files (test first, then train) and keep their local names.
test_file_name, train_file_name = (
    tools.download_remote_file(remote_url)
    for remote_url in (musiXmatch_test_data_url, musiXmatch_train_data_url)
)

# Report where each download was stored.
print('Test file name is , {}'.format(test_file_name))
print('Trian file name is , {}'.format(train_file_name))

Test file name is , mxm_dataset_test.txt
Trian file name is , mxm_dataset_train.txt


In [17]:
# Parse the test split into vocabulary, per-track word counts and ID lists.
words, content, track_id, mxm_tid = tools.read_lyrics(test_file_name)

# Print a small slice of every structure as a sanity check.
print('sample from test')
print('sample words : {}'.format(words[:50]))
print('sample content : {}'.format(content[:2]))
print('sample track_id : {}'.format(track_id[:2]))
print('sample mxm_tid : {}'.format(mxm_tid[:2]))


sample from test
sample words : ['i', 'the', 'you', 'to', 'and', 'a', 'me', 'it', 'not', 'in', 'my', 'is', 'of', 'your', 'that', 'do', 'on', 'are', 'we', 'am', 'will', 'all', 'for', 'no', 'be', 'have', 'love', 'so', 'know', 'this', 'but', 'with', 'what', 'just', 'when', 'like', 'now', 'que', 'time', 'can', 'come', 'de', 'there', 'go', 'up', 'oh', 'la', 'one', 'they', 'out']
sample content : [{2: 19, 4: 7, 5: 6, 10: 1, 12: 13, 13: 6, 17: 4, 18: 6, 22: 1, 23: 1, 30: 11, 32: 4, 33: 6, 46: 8, 60: 1, 73: 1, 82: 1, 89: 1, 103: 5, 116: 1, 118: 5, 134: 1, 162: 1, 184: 1, 201: 3, 212: 5, 234: 5, 260: 3, 268: 4, 274: 4, 275: 1, 279: 4, 297: 1, 351: 6, 404: 9, 449: 4, 462: 1, 484: 4, 517: 6, 521: 5, 730: 5, 814: 1, 878: 1, 1003: 10, 1133: 5, 1649: 7, 2090: 5, 2258: 1, 2358: 1, 2740: 4, 3016: 1, 3024: 1, 3270: 7, 3741: 9, 4435: 4}, {1: 79, 2: 66, 3: 15, 4: 7, 5: 8, 6: 9, 7: 5, 8: 5, 9: 4, 10: 57, 11: 5, 12: 4, 13: 2, 14: 3, 15: 2, 17: 1, 18: 6, 19: 1, 20: 56, 21: 4, 22: 3, 23: 2, 24: 1, 25: 5, 28:

In [18]:
# Parse the train split; the underscore-prefixed names keep it apart from
# the test-split variables.
_words, _content, _track_id, _mxm_tid = tools.read_lyrics(train_file_name)

# Print a small slice of every structure as a sanity check.
print('sample from train')
print('sample words : {}'.format(_words[:50]))
print('sample content : {}'.format(_content[:2]))
print('sample track_id : {}'.format(_track_id[:2]))
print('sample mxm_tid : {}'.format(_mxm_tid[:2]))


sample from train
sample words : ['i', 'the', 'you', 'to', 'and', 'a', 'me', 'it', 'not', 'in', 'my', 'is', 'of', 'your', 'that', 'do', 'on', 'are', 'we', 'am', 'will', 'all', 'for', 'no', 'be', 'have', 'love', 'so', 'know', 'this', 'but', 'with', 'what', 'just', 'when', 'like', 'now', 'que', 'time', 'can', 'come', 'de', 'there', 'go', 'up', 'oh', 'la', 'one', 'they', 'out']
sample content : [{1: 6, 2: 4, 3: 2, 4: 2, 5: 5, 6: 3, 7: 1, 8: 1, 11: 1, 12: 2, 13: 3, 14: 1, 15: 1, 18: 2, 19: 2, 20: 2, 21: 2, 23: 4, 25: 1, 26: 2, 28: 1, 30: 1, 36: 2, 42: 1, 45: 1, 54: 2, 56: 1, 57: 1, 68: 1, 99: 1, 192: 2, 249: 1, 264: 1, 356: 1, 389: 1, 561: 1, 639: 1, 656: 1, 687: 1, 761: 1, 773: 1, 804: 1, 869: 2, 914: 1, 1035: 1, 1156: 1, 1221: 1, 1287: 1, 1364: 1, 1407: 1, 1533: 2, 1857: 1, 2096: 1, 2117: 1, 2482: 2, 2548: 1, 2705: 1, 2723: 1, 2868: 2, 2992: 2, 3455: 1, 3717: 1, 3851: 1, 4322: 1, 4382: 1, 4613: 1, 4713: 1, 4906: 1}, {1: 10, 3: 17, 4: 8, 5: 2, 6: 2, 7: 1, 8: 3, 9: 2, 10: 3, 11: 4, 12: 3, 

In [None]:
# bag