In [1]:
# Toy dataset: one inner list of interest names per user. A user's position
# in the outer list serves as that user's index in later cells.
users_interests = [ 
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"], 
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"], 
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"], 
    ["R", "Python", "statistics", "regression", "probability"], 
    ["machine learning", "regression", "decision trees", "libsvm"], 
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"], 
    ["statistics", "probability", "mathematics", "theory"], 
    ["machine learning", "scikit-learn", "Mahout", "neural networks"], 
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"], 
    ["Hadoop", "Java", "MapReduce", "Big Data"], 
    ["statistics", "R", "statsmodels"], 
    ["C++", "deep learning", "artificial intelligence", "probability"], 
    ["pandas", "R", "Python"], 
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"], 
    ["libsvm", "regression", "support vector machines"] 
]

In [2]:
from collections import Counter

# Flatten every user's interest list into one sequence, then tally how many
# users list each interest.
all_interests = [interest
                 for user_interests in users_interests
                 for interest in user_interests]
popular_interests = Counter(all_interests)

display(popular_interests)

Counter({'Hadoop': 2,
         'Big Data': 3,
         'HBase': 3,
         'Java': 3,
         'Spark': 1,
         'Storm': 1,
         'Cassandra': 2,
         'NoSQL': 1,
         'MongoDB': 2,
         'Postgres': 2,
         'Python': 4,
         'scikit-learn': 2,
         'scipy': 1,
         'numpy': 1,
         'statsmodels': 2,
         'pandas': 2,
         'R': 4,
         'statistics': 3,
         'regression': 3,
         'probability': 3,
         'machine learning': 2,
         'decision trees': 1,
         'libsvm': 2,
         'C++': 2,
         'Haskell': 1,
         'programming languages': 1,
         'mathematics': 1,
         'theory': 1,
         'Mahout': 1,
         'neural networks': 2,
         'deep learning': 2,
         'artificial intelligence': 2,
         'MapReduce': 1,
         'databases': 1,
         'MySQL': 1,
         'support vector machines': 1})

In [3]:
from typing import List, Optional, Tuple

def most_popular_new_interests(user_interests: List[str],
                               max_results: int = 5
                               ) -> List[Tuple[str, int]]:
    """
    Suggest the globally most popular interests the user does not already have.

    Walks popular_interests from most to least common, drops every interest
    that appears in user_interests, and returns the first max_results
    (interest, frequency) pairs that remain.
    """
    ranked = popular_interests.most_common()
    unseen = [pair for pair in ranked if pair[0] not in user_interests]
    return unseen[:max_results]

In [4]:
# "jabbascript" is not an interest any user lists, so it never matches a
# candidate and excludes nothing from the suggestions.
most_popular_new_interests(["Python", "libsvm", "scikit-learn", "jabbascript"])


[('R', 4), ('Big Data', 3), ('HBase', 3), ('Java', 3), ('statistics', 3)]

## Collaborative filtering

### TODO: Se om man kan jämföra bara user[0]:s interests med alla andra users
Detta i stället för att, som i exemplet, jämföra alla users interests med alla andras, vilket har ungefär O(n^2) complexity.

In [5]:
import math

# Every distinct interest across all users, in sorted order. The position of
# an interest in this list is its index in the per-user interest vectors.
unique_interests = sorted(set(
    interest
    for user_interests in users_interests
    for interest in user_interests
))

# Sanity check on the ordering: string comparison puts uppercase letters
# before lowercase ones, so 'HBase' sorts ahead of 'Hadoop'.
assert unique_interests[:6] == [
    'Big Data', 'C++', 'Cassandra',
    'HBase', 'Hadoop', 'Haskell',
]

def make_user_interest_vector(user_interests: List[str]) -> List[int]:
    """
    Encode a user's interests as a 0/1 vector aligned with unique_interests.

    Element i of the result is 1 when unique_interests[i] appears in
    user_interests and 0 when it does not.
    """
    return [int(interest in user_interests) for interest in unique_interests]

# One 0/1 interest vector per user, in the same order as users_interests.
user_interest_vectors = list(map(make_user_interest_vector, users_interests))

### Calculate every user's similarity to every other user

In [6]:
Vector = List[float]

def dot(v: Vector, w: Vector) -> float:
    """
    Computes v_1 * w_1 + ... + v_n * w_n

    Raises AssertionError when the two vectors differ in length.
    """
    assert len(v) == len(w), "vectors must be same length"

    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def cosine_similarity(v1: Vector, v2: Vector) -> float:
    """
    Cosine of the angle between v1 and v2: dot(v1, v2) / (|v1| * |v2|).

    Returns 0.0 when either vector has zero magnitude (for example, a vector
    with no 1-entries), where the ratio is undefined and the plain formula
    would raise ZeroDivisionError.

    Raises AssertionError (from dot) when the vectors differ in length.
    """
    # Computed first so that a length mismatch is reported even when one of
    # the vectors is all zeros.
    numerator = dot(v1, v2)
    denominator = math.sqrt(dot(v1, v1) * dot(v2, v2))
    if denominator == 0:
        return 0.0
    return numerator / denominator

# Full pairwise matrix: user_similarities[i][j] is the cosine similarity
# between the interest vectors of user i and user j.
user_similarities = [
    [cosine_similarity(row_vector, column_vector)
     for column_vector in user_interest_vectors]
    for row_vector in user_interest_vectors
]

# Three shared interests (Hadoop, Java, Big Data) between users 0 and 9.
assert 0.56 < user_similarities[0][9] < 0.58, "several shared interests"

# A single shared interest (Big Data) between users 0 and 8.
assert 0.18 < user_similarities[0][8] < 0.20, "only one shared interest"

user_similarities[0][9]

0.5669467095138409

### Calculate only user 0's similarity to every other user

In [7]:
Vector = List[float]
# Reference vector for this cell: the interest vector of user 0.
pelles_interests = user_interest_vectors[0]

# NOTE(review): dot and cosine_similarity are also defined in an earlier cell;
# this cell re-defines them. Consider keeping a single definition.
def dot(v: Vector, w: Vector) -> float:
    """
    Computes v_1 * w_1 + ... + v_n * w_n

    Raises AssertionError when the two vectors differ in length.
    """
    assert len(v) == len(w), "vectors must be same length"

    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def cosine_similarity(v1: Vector, v2: Vector) -> float:
    """
    Cosine of the angle between v1 and v2: dot(v1, v2) / (|v1| * |v2|).

    Returns 0.0 when either vector has zero magnitude, where the ratio is
    undefined and the plain formula would raise ZeroDivisionError.

    Raises AssertionError (from dot) when the vectors differ in length.
    """
    # Computed first so that a length mismatch is reported even when one of
    # the vectors is all zeros.
    numerator = dot(v1, v2)
    denominator = math.sqrt(dot(v1, v1) * dot(v2, v2))
    if denominator == 0:
        return 0.0
    return numerator / denominator

# Similarity of user 0 to every user (itself included): a single row of the
# pairwise comparison rather than all n^2 entries.
pelles_similarities = [cosine_similarity(pelles_interests, interest_vector_i)
                       for interest_vector_i in user_interest_vectors]

# Users 0 and 9 share interests in Hadoop, Java, and Big Data
assert 0.56 < pelles_similarities[9] < 0.58, "several shared interests"

# Users 0 and 8 share only one interest: Big Data
assert 0.18 < pelles_similarities[8] < 0.20, "only one shared interest"

pelles_similarities[9]

0.5669467095138409

# Spotify MPD Collaborative Filtering
- Idé: Rekommendera ett namn för den givna spellistan genom att ge den samma namn som den närmaste spellistan.
- Skapa en funktion som konverterar spellistorna så att de bara innehåller artisten och låtnamnet.

In [8]:
import json
import os

playlists = []
data_path = "mpd/data_samples"

# List the directory once, keep only the JSON files, and sort them:
# os.listdir() returns entries in arbitrary order, so without sorting the
# index of a given playlist (e.g. playlists[0]) could differ between runs, and
# any non-JSON entry in the folder (e.g. a checkpoint directory) would make
# open()/json.load() fail.
json_filenames = sorted(name for name in os.listdir(data_path)
                        if name.lower().endswith(".json"))
num_files = len(json_filenames)

for filename in json_filenames:
    with open(os.path.join(data_path, filename), "rt", encoding="utf-8") as f:
        # Each file holds its playlists under the top-level "playlists" key.
        playlists.extend(json.load(f)["playlists"])


In [9]:
# Vocabulary of distinct "artist - track" labels over all loaded playlists,
# sorted; a label's position here is its index in the playlist vectors.
unique_songs = sorted(set(
    f"{song['artist_name']} - {song['track_name']}"
    for playlist in playlists
    for song in playlist["tracks"]
))
print("Number of jsons: ", num_files)
print("Unique songs:", len(unique_songs))

Number of jsons:  1
Unique songs: 33026


In [10]:
# Peek at one playlist rendered as "artist - track" labels.
[f"{s['artist_name']} - {s['track_name']}" for s in playlists[3]["tracks"]]

['Jagged Edge - Where the Party At',
 'Rihanna - Pour It Up',
 'Jeremih - oui',
 'Diddy - I Need A Girl (Part One) (feat. Usher & Loon)',
 'Fat Joe - All The Way Up',
 'Adele - Send My Love (To Your New Lover)',
 'Shawn Mendes - Treat You Better',
 'Drake - Too Good',
 'FRENSHIP - Capsize',
 'Flume - Say It',
 'DRAM - Broccoli (feat. Lil Yachty)',
 'Kanye West - Champions',
 'DJ Drama - Wishing (feat. Chris Brown, Skeme & Lyquin)',
 'Denzel Curry - Ultimate',
 'Chance The Rapper - Angels (feat. Saba)',
 'Chance The Rapper - Summer Friends (feat. Jeremih & Francis & The Lights)',
 'Nick Jonas - Bacon',
 'Nick Jonas - Champagne Problems',
 'Twenty One Pilots - Heathens',
 'Twenty One Pilots - Ride',
 'Twenty One Pilots - Heavydirtysoul',
 'Moguai - Hold On - Radio Edit']

In [11]:
def make_playlist_vector(playlist: dict,
                         vocabulary: Optional[List[str]] = None) -> List[int]:
    """
    Encode a playlist as a 0/1 vector over a vocabulary of song labels.

    Args:
        playlist: dict with a "tracks" list; each track is a dict with
            "artist_name" and "track_name" keys.
        vocabulary: the "artist - track" labels to encode against. Defaults
            to the module-level unique_songs; pass a smaller list (for
            example unique_songs[:100]) for quick experiments.

    Returns:
        A list with one element per vocabulary entry: 1 if that label is one
        of the playlist's tracks, 0 otherwise.
    """
    if vocabulary is None:
        vocabulary = unique_songs

    # Build the playlist's labels once, as a set. Rebuilding the list for
    # every vocabulary entry and scanning it linearly made each vector cost
    # len(vocabulary) * len(tracks) work.
    playlist_songs = {f"{track['artist_name']} - {track['track_name']}"
                      for track in playlist["tracks"]}

    return [1 if song in playlist_songs else 0 for song in vocabulary]

import time

# Time how long it takes to encode every playlist as a 0/1 song vector.
t0 = time.time()

playlists_vectors = list(map(make_playlist_vector, playlists))
t1 = time.time()
print(f"Time taken: {round(t1-t0, 2)} s")
print("vectors length:", len(playlists_vectors))

import numpy as np
p = np.array(playlists_vectors)
# Total number of 1-entries over all playlist vectors. ndarray.sum() does the
# reduction inside numpy; the builtin sum() would iterate over every element
# of the flattened array in Python.
p.sum()

vectors length: 1000


62909

In [12]:
import numpy as np
p = np.array(playlists_vectors)
# Total number of 1-entries over all playlist vectors. ndarray.sum() does the
# reduction inside numpy; the builtin sum() would iterate over every element
# of the flattened array in Python.
p.sum()

62909

In [14]:
Vector = List[float]
# Reference vector for this cell: the song vector of playlists[0]. The
# variable name is carried over from the user-interest example.
pelles_interests = playlists_vectors[0]

# NOTE(review): dot and cosine_similarity are also defined in earlier cells;
# this cell re-defines them. Consider keeping a single definition.
def dot(v: Vector, w: Vector) -> float:
    """
    Computes v_1 * w_1 + ... + v_n * w_n

    Raises AssertionError when the two vectors differ in length.
    """
    assert len(v) == len(w), "vectors must be same length"

    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def cosine_similarity(v1: Vector, v2: Vector) -> float:
    """
    Cosine of the angle between v1 and v2: dot(v1, v2) / (|v1| * |v2|).

    Returns 0.0 when either vector has zero magnitude (for example, the
    vector of a playlist with no tracks), where the ratio is undefined and
    the plain formula would raise ZeroDivisionError.

    Raises AssertionError (from dot) when the vectors differ in length.
    """
    # Computed first so that a length mismatch is reported even when one of
    # the vectors is all zeros.
    numerator = dot(v1, v2)
    denominator = math.sqrt(dot(v1, v1) * dot(v2, v2))
    if denominator == 0:
        return 0.0
    return numerator / denominator

# Cosine similarity between the reference playlist vector (pelles_interests)
# and every playlist vector, the reference playlist itself included.
pelles_similarities = [cosine_similarity(pelles_interests, playlist_vector_i)
                     for playlist_vector_i in playlists_vectors]

# NOTE(review): the commented-out asserts that used to sit here checked
# user_similarities from the user-interest example and do not apply to
# playlist data.

# Similarity between the reference playlist and playlists[9].
pelles_similarities[9]

0.0

In [18]:
# Report only the playlists whose similarity to the reference playlist is
# above the 0.08 cut-off.
for i, sim in enumerate(pelles_similarities):
    if not sim > 0.08:
        continue
    print(f"Similarity to playlists[{i}]: ", sim)

Similarity to playlists[0]:  1.0
Similarity to playlists[16]:  0.09672388203287414
Similarity to playlists[333]:  0.0808158687660548
