In [1]:
import pandas as pd
from scipy.spatial.distance import squareform, pdist

In [2]:
import logging
import json
import boto3
from botocore.exceptions import ClientError

# Configure root logging for the notebook at INFO level. Each record shows the
# timestamp, level, process id, logger name, calling function and message.
logging.basicConfig(format='%(asctime)s %(levelname)s %(process)d --- %(name)s %(funcName)20s() : %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)


def download_object(bucket: str, key: str):
    """Fetch an S3 object and parse its body as UTF-8 encoded JSON.

    Args:
        bucket: name of the S3 bucket that holds the object.
        key: key of the object inside the bucket.

    Returns:
        The decoded JSON document, or None when the S3 request raised a
        ClientError (the error is logged rather than propagated).
    """
    client = boto3.client('s3')
    try:
        response = client.get_object(Bucket=bucket, Key=key)
        raw_bytes = response['Body'].read()
        return json.loads(raw_bytes.decode('utf-8'))
    except ClientError as err:
        # Best-effort download: report the failure and hand back None.
        logging.error(err)
        return None


In [3]:
# Load the parsed JSON stored at raw/mens.json in the snowboard-finder bucket.
# NOTE: download_object logs a ClientError and returns None instead of raising,
# so `l` is None when the S3 request fails.
l = download_object('snowboard-finder', 'raw/mens.json')

18-Jun-21 16:48:45 INFO 90542 --- botocore.credentials                 load() : Found credentials in shared credentials file: ~/.aws/credentials


In [10]:
import logging
import pandas as pd
from scipy.spatial.distance import squareform, pdist

# Same logging configuration as the earlier download cell.
# NOTE(review): logging.basicConfig does nothing once the root logger already
# has handlers, so this repeat presumably has no effect when the earlier cell
# has already run in the same kernel; confirm before relying on it.
logging.basicConfig(format='%(asctime)s %(levelname)s %(process)d --- %(name)s %(funcName)20s() : %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)


class Similarity:
    """Find, for every board in a JSON document, the boards closest to it.

    The rating columns listed in ``sel_cols`` are one-hot encoded and compared
    pairwise with a scipy distance metric (Jaccard by default). Smaller
    distances mean more similar boards.
    """

    logger = logging.getLogger('Similarity')

    def __init__(self):
        # Flattened (json_normalize) rating columns used as similarity features.
        self.sel_cols = ['ratings.Riding Style',
                         'ratings.Riding Level', 'ratings.Shape', 'ratings.Camber Profile',
                         'ratings.Stance', 'ratings.Approx. Weight', 'ratings.Powder',
                         'ratings.Turning', 'ratings.Carving', 'ratings.Speed', 'ratings.Uneven',
                         'ratings.Switch', 'ratings.Jumps', 'ratings.Jibbing', 'ratings.Pipe',
                         'ratings.On', 'ratings.Turn', 'ratings.Skidded', 'ratings.Flex',
                         'ratings.Buttering', 'ratings.Edge']

        # Descriptive columns; each name is listed once so that selecting
        # them never yields duplicate columns.
        self.meta_cols = ['id',
                          'meta_data.name',
                          'meta_data.gender',
                          'meta_data.url',
                          'meta_data.price',
                          'meta_data.image_url'
                          ]

    def calculate_similarity(self, file, topn=10, similarity_algo='jaccard'):
        """Attach the ``topn`` nearest boards to every board.

        Args:
            file: parsed JSON (a dict or a list of dicts) accepted by
                ``pd.json_normalize``. Each record must provide ``id`` and
                every column named in ``self.sel_cols``.
            topn: maximum number of neighbours to keep per board.
            similarity_algo: metric name forwarded to
                ``scipy.spatial.distance.pdist``.

        Returns:
            The normalized DataFrame with an extra ``similar_boards`` column.
            Each cell is a dict mapping a neighbour's ``id`` to its distance,
            ordered from closest to farthest.

        Raises:
            KeyError: if ``id`` or one of ``self.sel_cols`` is missing.
        """
        df_input = self.__prepare_data(file)

        if len(df_input.index) == 0:
            # No records to compare; still return the expected column.
            df_input['similar_boards'] = pd.Series(dtype=object)
            return df_input

        df_train = pd.get_dummies(df_input[self.sel_cols], columns=self.sel_cols)

        df_distance = self.__distance_function(df_train, similarity_algo)

        board_ids = df_input['id']
        sim_list = []
        for pos in range(len(df_distance)):
            # Exclude the board itself by position rather than by
            # "distance != 0": boards with identical ratings also have
            # distance 0 and are the best matches, so they must be kept.
            nearest = df_distance.iloc[pos].drop(pos).nsmallest(topn)
            neighbour_ids = board_ids.iloc[nearest.index.tolist()].tolist()
            sim_list.append(dict(zip(neighbour_ids, nearest.tolist())))

        df_input['similar_boards'] = sim_list

        return df_input

    def __prepare_data(self, file):
        """Flatten the raw JSON into a DataFrame.

        Args:
            file: parsed JSON content (dict or list of dicts), e.g. the
                document downloaded from the S3 raw bucket.

        Returns:
            DataFrame produced by ``pd.json_normalize``; nested keys become
            dot-separated column names.
        """
        return pd.json_normalize(file)

    def __distance_function(self, df, similarity_algo):
        """Compute the square pairwise distance matrix between rows of ``df``.

        Args:
            df: numeric/boolean feature frame, one row per board.
            similarity_algo: metric name understood by ``pdist``.

        Returns:
            DataFrame whose rows and columns are both labelled 0..n-1 and
            whose cell (i, j) holds the distance between rows i and j.
        """
        dists = pdist(df, similarity_algo)
        return pd.DataFrame(squareform(dists))


In [11]:
# Build the similarity helper; its constructor only sets the feature and metadata column lists.
sim = Similarity()

In [13]:
# Compute neighbours for the downloaded JSON with the defaults (topn=10, Jaccard distance).
# The result is the normalized frame plus a 'similar_boards' column.
df = sim.calculate_similarity(l)