In [3]:
# %load convert_wsc_data.py
import collections
import fnmatch
import hashlib
import json
from os import walk, path

from pandas import DataFrame, Series


def recursively_find_all_files_in_a_dir_given_pattern(inDir, pattern):
    #Get all the files in the appropriate directory having same suffix as 'pattern'
    outfilenames = [] #Will store sensor file names
    for root, dirnames, filenames in walk(inDir):
        for filename in fnmatch.filter(filenames, pattern):
            outfilenames.append(path.join(root, filename))
    return(outfilenames)


def digitize_ids(s):
    return int(hashlib.sha1(s).hexdigest(), 16) % (10 ** 8)


def flatten_new(d, parent_key='', sep='.'):
    """
    This function flattens the JSON format string in variable d
    :param d:
    :param parent_key:
    :param sep:
    :return:
    """
    items = []
    if isinstance(d, collections.MutableMapping):
        for k, v in d.items():
            # print k
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, collections.MutableMapping):
                items.extend(flatten_new(v, new_key, sep=sep).items())
            elif isinstance(v, list):
                if v:
                    items.extend(flatten_new(v, new_key, sep=sep).items())
                else:
                    continue
            else:
                items.append((new_key, v))
    else:
        if len(d) == 1:
            new_key = parent_key
            items.extend(flatten_new(d[0], new_key, sep=sep).items())
        else:
            j = 1
            for myitem in d:
                new_key = parent_key + sep + str(j)
                items.extend(flatten_new(myitem, new_key, sep=sep).items())
                j = j+1
    return dict(items)


def transform(file_name):
    """
    Takes one json file, parses line by line and converts the json content into a pandas data frmae.
    :param file_name: json file name
    :return: pandas data frame
    """
    with open(file_name) as json_file:
        data = json.load(json_file)

    rows = []
    for line in data:
        my_flattened_dict = flatten_new(line)
        rows.append(Series(my_flattened_dict))

    data_frame = DataFrame(rows)
    return data_frame

def transform_json_data(data):
    rows = []
    for line in data:
        my_flattened_dict = flatten_new(line)
        rows.append(Series(my_flattened_dict))

    data_frame = DataFrame(rows)
    return data_frame

In [158]:
import requests
import traceback
import pandas as pd
import numpy as np
import matplotlib.pyplot
from sklearn.metrics import euclidean_distances
from sklearn.preprocessing import LabelEncoder

In [5]:
wsc_feed_url = 'https://wscgateway.clipro.tv/api/videos?token=8yjodvide1ya81t'
print(wsc_feed_url)
feed_response = requests.get(wsc_feed_url)

https://wscgateway.clipro.tv/api/videos?token=8yjodvide1ya81t


In [54]:
if feed_response.status_code == 200:
    df = transform_json_data(feed_response.json())
    #     wsc_data = parse_wsc_feed(feed_response)

    #save_wsc_data(wsc_data)
else:
    print('WSC feed URL is not responding.')
        

In [55]:
df.shape

(100, 994)

In [56]:
df.columns

Index(['contentType', 'creationSettings.creationTimePeriod',
       'creationSettings.ruleName', 'creationSettings.videoType',
       'description', 'duration', 'events.1.actionType',
       'events.1.eventClips.clipSccUrl', 'events.1.eventClips.clipSrtUrl',
       'events.1.eventClips.clipThumbnailUrl',
       ...
       'resolution.height', 'resolution.width', 'title', 'videoSccUrl',
       'videoSrtUrl', 'videoThumbnailUrl', 'videoUrl', 'videoUrls.bitRate',
       'videoUrls.url', 'videoVttUrl'],
      dtype='object', length=994)

In [57]:
df.head(3)

Unnamed: 0,contentType,creationSettings.creationTimePeriod,creationSettings.ruleName,creationSettings.videoType,description,duration,events.1.actionType,events.1.eventClips.clipSccUrl,events.1.eventClips.clipSrtUrl,events.1.eventClips.clipThumbnailUrl,...,resolution.height,resolution.width,title,videoSccUrl,videoSrtUrl,videoThumbnailUrl,videoUrl,videoUrls.bitRate,videoUrls.url,videoVttUrl
0,VIDEO,POST-GAME,Top Plays of the Day,Top Plays,"Top Plays of the Day, 04/02/2018",230564.0,2-pointer,http://wsczoominwestus.azureedge.net/videos/38...,http://wsczoominwestus.azureedge.net/videos/38...,http://wsczoominwestus.azureedge.net/thumbnail...,...,720,1280,"Top Plays of the Day, 04/02/2018",http://wsczoominwestus.azureedge.net/publish/0...,http://wsczoominwestus.azureedge.net/publish/0...,http://wsczoominwestus.azureedge.net/publish/c...,http://wsczoominwestus.azureedge.net/publish/0...,6000,http://wsczoominwestus.azureedge.net/publish/0...,http://wsczoominwestus.azureedge.net/publish/0...
1,VIDEO,POST-GAME,Every Game - Game Highlights,Highlight,Watch the Game Highlights from Villanova Wildc...,114381.0,Block,http://wsczoominwestus.azureedge.net/videos/86...,http://wsczoominwestus.azureedge.net/videos/86...,http://wsczoominwestus.azureedge.net/thumbnail...,...,720,1280,Villanova Wildcats vs. Michigan Wolverines: Ga...,http://wsczoominwestus.azureedge.net/publish/a...,http://wsczoominwestus.azureedge.net/publish/a...,http://wsczoominwestus.azureedge.net/publish/8...,http://wsczoominwestus.azureedge.net/publish/a...,6000,http://wsczoominwestus.azureedge.net/publish/a...,http://wsczoominwestus.azureedge.net/publish/a...
2,VIDEO,POST-GAME,Top Plays of the Day,Top Plays,"Top Plays of the Day, 03/31/2018",234768.0,3-pointer,http://wsczoominwestus.azureedge.net/videos/F8...,http://wsczoominwestus.azureedge.net/videos/F8...,http://wsczoominwestus.azureedge.net/thumbnail...,...,720,1280,"Top Plays of the Day, 03/31/2018",http://wsczoominwestus.azureedge.net/publish/c...,http://wsczoominwestus.azureedge.net/publish/c...,http://wsczoominwestus.azureedge.net/publish/e...,http://wsczoominwestus.azureedge.net/publish/c...,6000,http://wsczoominwestus.azureedge.net/publish/c...,http://wsczoominwestus.azureedge.net/publish/c...


In [251]:
game_df = df[df['creationSettings.ruleName']=='Every Game - Game Highlights']

In [285]:
game_df['id']

1     FC065B5FA833D6FA3D7F21CE30BF2588C0026A12
3     DA474AAF0158FEF9CCCA3D0156DCA438D6326C14
4     3DC6B17A20AFF3A53B09F7BCE926BEB91781CD4E
6     14C5688F162024F218502BF1789232AD6B033B16
7     F8AC7E4C7CD347E221D2DD44D477C01987A83036
9     F40D062EE2D0F4F7B0DBAD61539C5FD4BEA7A800
10    FC7DFB12F36F17042EEB24CE55538AA84BC8D308
12    1FE475A90CD0CF5383B1BED92B5D1BD47706A3E9
13    B2A57E26D5196A9A0D86E9B4892240AA307D3B01
14    32C9BA85CC0888076C4E79D818F23C6E8190D80F
15    A2D946C327BFB2CD692F39F4E6825C85E6BD8A3C
17    EBF4BC1FDDACA25BDE6DB5F46FA85A174BB3A236
18    D2F2A893F33BB2AEE118A270D9883A91DBBD73D9
19    EC5802CAFAFD5CE36C91B0E3DB8DA51175A991AC
20    57C8780836CC6E90A26D7CC578DE2416C6B76A07
22    E74AA2BD7020CC6BD130BDA0DDAFA4390D8053F6
23    F0EB3ACB4EB2AE58CA7A7CED6696340CC1243725
24    517A5F4436A5BE6304E3EB542B38A3D9026DD836
25    2CB03D8893CE40E145A95DA450A39053B3A855A2
26    E66704386D288A42C61FD91D601936DE46ED28BC
27    1A25F2F1485B112744D89C86F3D9630DC83B68E8
28    FD6EF23

In [281]:
video_watched = df[df.id=='FC065B5FA833D6FA3D7F21CE30BF2588C0026A12']
other_video = df[df.id=='FC065B5FA833D6FA3D7F21CE30BF2588C0026A12']

In [282]:
filled_df = game_df.fillna('None')
le = LabelEncoder()
encoded_df = pd.DataFrame()
# filled_df[filled_df['events.1.actionType'].str.contains('<')]
for col in filled_df.columns:
    series = filled_df[col].map({'<': 'lessThan'})
    encoded_df[col] = le.fit_transform(series)

encoded_df.head(2)

Unnamed: 0,contentType,creationSettings.creationTimePeriod,creationSettings.ruleName,creationSettings.videoType,description,duration,events.1.actionType,events.1.eventClips.clipSccUrl,events.1.eventClips.clipSrtUrl,events.1.eventClips.clipThumbnailUrl,...,resolution.height,resolution.width,title,videoSccUrl,videoSrtUrl,videoThumbnailUrl,videoUrl,videoUrls.bitRate,videoUrls.url,videoVttUrl
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,56,56,56,56,56,56,56,56,56,56,...,56,56,56,56,56,56,56,56,56,56


In [283]:
id_encoder = le.fit(filled_df['id'])
# video_watched_encoded = id_encoder.transform(video_watched)
video_ids = id_encoder.transform(['FC065B5FA833D6FA3D7F21CE30BF2588C0026A12', 'FC065B5FA833D6FA3D7F21CE30BF2588C0026A12'])
print(video_ids[0])
video_watched = encoded_df[encoded_df['id']==video_ids[0]]
other_video = encoded_df[encoded_df['id']==video_ids[1]]
euclidean_distances(video_watched, other_video)

77


array([[ 0.]])

In [273]:
# filled_df[filled_df['videoUrls.url']=="http://wsczoominwestus.azureedge.net/publish/533aa7cd-8812-4100-8da4-3fc7ee9c70fd.mp4"]['id']
filled_df[filled_df['videoUrls.url']=="http://wsczoominwestus.azureedge.net/publish/6fa5f750-868f-4d1a-9f42-b7f4dfb74d24.mp4"]['id']

7    F8AC7E4C7CD347E221D2DD44D477C01987A83036
Name: id, dtype: object

In [274]:
frame1= pd.DataFrame()
pd.DataFrame(np.array([(1, 80, 23)], dtype=[('id', np.int32),('url', np.int32),('distance', np.int32)]))

Unnamed: 0,id,url,distance
0,1,80,23


In [302]:
# video_url = "http://wsczoominwestus.azureedge.net/publish/d7d9c317-57e7-41e5-be06-0b1e90c887cd.mp4" #Don't delete
video_url = "http://wsczoominwestus.azureedge.net/publish/1409d5b3-ed67-4085-b8d5-8d7538dbb67d.mp4"
video_id = filled_df[filled_df['videoUrls.url']==video_url]['id']
id_encoder = le.fit(filled_df['id'])
encoded_video_id = id_encoder.transform(video_id.tolist())
video_watched = encoded_df[encoded_df['id']==encoded_video_id[0]]
distances_df = pd.DataFrame()
for row in encoded_df.iterrows():
    if not row[1]['id'] == encoded_video_id[0]:        
        other_video = encoded_df[encoded_df['id']==row[1]['id']]
        distance = euclidean_distances(video_watched, other_video)
        id1 = row[1]['id']
        url1 = row[1]['videoUrls.url']            
        array = np.array([(id1, url1, distance)], dtype=[('video_id', np.int32),('url', np.int32),('distance', np.float32)])
        distances_df = distances_df.append(pd.DataFrame(array))
distances_df.sort_values(by='distance', inplace=True)
top = distances_df.head(5)
print(top)
url_encoder = le.fit(df['videoUrls.url'])
print('Most relevant top five videos:')
for row in top.iterrows():
    url2 = distances_df[distances_df['video_id']==row[1]['url']]['url'].tolist()[0]
    print(url_encoder.inverse_transform(url2))

   video_id  url   distance
0        19   19  31.527765
0        21   21  31.527765
0        22   22  63.055531
0        18   18  63.055531
0        23   23  94.583298
Most relevant top five videos:
http://wsczoominwestus.azureedge.net/publish/3d3b663f-8fe0-48e7-ae3d-6ca304c94aed.mp4
http://wsczoominwestus.azureedge.net/publish/473f7e13-1655-40bf-a91f-351e622a27f3.mp4
http://wsczoominwestus.azureedge.net/publish/4b6bd638-cfcd-4cb9-af00-5f30bf996346.mp4
http://wsczoominwestus.azureedge.net/publish/3b5fb60b-ce0e-4c93-922a-946dbe2a4f7b.mp4
http://wsczoominwestus.azureedge.net/publish/4c0e1d9a-67c6-4115-9566-b788a1fed918.mp4


In [268]:
url_encoder = le.fit(df['videoUrls.url'])
url2 = distances_df[distances_df['video_id']==19]['url'].tolist()[0]
print(url_encoder.inverse_transform(url2))
url3 = distances_df[distances_df['video_id']==8]['url'].tolist()[0]
print(url_encoder.inverse_transform(url3))

http://wsczoominwestus.azureedge.net/publish/0d5c09dc-de06-4f72-a1ed-1419e6720f08.mp4
http://wsczoominwestus.azureedge.net/publish/16d07090-da26-4cfb-bd86-1f4f32afcb47.mp4
