In [1]:
from random import random,randint
import math

def wineprice(rating,age):
  peak_age=rating-50
  
  # Calculate price based on rating
  price = rating/2
  if age > peak_age:
    # Past its peak, goes bad in 10 years
    price = price*(5-(age-peak_age)/2)
  else:
    # Increases to 5x original value as it
    # approaches its peak
    price = price*(5*((age+1)/peak_age))
  if price < 0: price=0

  return price

In [2]:
wineprice(95.0,3.0)
wineprice(99.0,1.0)

10.102040816326529

In [3]:
def wineset1():
  rows=[]
  for i in range(300):
    # Create a random age and rating
    rating=random()*50+50
    age=random()*50

    # Get reference price
    price=wineprice(rating,age)
    
    # Add some noise
    price*=(random()*0.2+0.9)

    # Add to the dataset
    rows.append({'input':(rating,age),
                 'result':price})
  return rows


In [4]:
data = wineset1( )
data[0]

{'input': (95.77418463778027, 23.656186219249747),
 'result': 120.23641505160157}

In [5]:
data[1]

{'input': (77.95763791275188, 22.14948269927329), 'result': 160.22069306892328}

In [6]:
def euclidean(v1,v2):
  d=0.0
  for i in range(len(v1)):
    d+=(v1[i]-v2[i])**2
  return math.sqrt(d)


In [7]:
data[0]['input']

(95.77418463778027, 23.656186219249747)

In [8]:
data[1]['input']

(77.95763791275188, 22.14948269927329)

In [9]:
euclidean(data[0]['input'],data[1]['input'])

17.880142412806148

In [10]:
import math

In [11]:
def manhattan(v1, v2):
    d = 0.0
    for x1, x2 in zip(v1, v2):
        difference = x2 - x1
        absolute_difference = abs(difference)
        d += absolute_difference

    return d

In [12]:
manhattan([1,1],[1,0])

1.0

In [13]:
manhattan([0,1],[1,0])

2.0

In [14]:

import numpy as np

In [15]:
def cosSim(v1,v2):
    dot=0.0
    for i in range(len(v1)):
        dot+=(v1[i]*v2[i])
        len1=0.0
        len2=0.0
    for i in range(len(v1)):
        len1+=(v1[i]*v1[i])
        len2+=(v2[i]*v2[i])
        cos = dot/(math.sqrt(len1)*math.sqrt(len2))
    return cos

In [16]:
cosval = cosSim([1,1],[2,2])
print(cosval)

0.9999999999999998


In [17]:
cosval = cosSim([-1,0],[1,0])
print(cosval)

-1.0


In [18]:
def getdistances(data,vec1):
  distancelist=[]
  
  # Loop over every item in the dataset
  for i in range(len(data)):
    vec2=data[i]['input']
    
    # Add the distance and the index
    distancelist.append((euclidean(vec1,vec2),i))
  
  # Sort by distance
  distancelist.sort()
  return distancelist


In [19]:

def knnestimate(data,vec1,k=5):
  # Get sorted distances
  dlist=getdistances(data,vec1)
  avg=0.0
  
  # Take the average of the top k results
  for i in range(k):
    idx=dlist[i][1]
    avg+=data[idx]['result']
  avg=avg/k
  return avg

#

In [20]:
knnestimate(data,(95.0,3.0))

23.02820281121638

In [21]:
def inverseweight(dist,num=1.0,const=0.1):
  return num/(dist+const)

In [22]:
def subtractweight(dist,const=1.0):
  if dist>const: 
    return 0
  else: 
    return const-dist

In [23]:
def gaussian(dist,sigma=5.0):
  return math.e**(-dist**2/(2*sigma**2))

In [24]:
inverseweight(0.1)

5.0

In [25]:
subtractweight(0.1)

0.9

In [26]:
gaussian(0.1)

0.9998000199986667

In [27]:
inverseweight(1.0)

0.9090909090909091

In [28]:
subtractweight(1.0)

0.0

In [29]:
gaussian(1.0)

0.9801986733067553

In [30]:
def applyWeigthMethod(weightMethod,inputs):
    weights=[]
    for i in range(len(inputs)):
        weights.append(weightMethod(inputs[i]))
    return weights


In [31]:
distances = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
applyWeigthMethod(inverseweight,distances)

[5.0,
 3.333333333333333,
 2.5,
 2.0,
 1.6666666666666667,
 1.4285714285714286,
 1.25,
 1.1111111111111112,
 1.0,
 0.9090909090909091]

In [32]:
applyWeigthMethod(subtractweight,distances)

[0.9,
 0.8,
 0.7,
 0.6,
 0.5,
 0.4,
 0.30000000000000004,
 0.19999999999999996,
 0.09999999999999998,
 0.0]

In [33]:
applyWeigthMethod(gaussian,distances)

[0.9998000199986667,
 0.9992003199146837,
 0.9982016190284373,
 0.9968051145430329,
 0.9950124791926823,
 0.9928258579038134,
 0.9902478635182347,
 0.9872815715902905,
 0.9839305142725083,
 0.9801986733067553]

In [34]:

def weightedknn(data,vec1,k=5,weightf=gaussian):
  # Get distances
  dlist=getdistances(data,vec1)
  avg=0.0
  totalweight=0.0
  
  # Get weighted average
  for i in range(k):
    dist=dlist[i][0]
    idx=dlist[i][1]
    weight=weightf(dist)
    avg+=weight*data[idx]['result']
    totalweight+=weight
  if totalweight==0: return 0
  avg=avg/totalweight
  return avg

In [35]:
weightedknn(data,(99.0,5.0))

39.86537148728295

In [36]:
def dividedata(data,test=0.05):
  trainset=[]
  testset=[]
  for row in data:
    if random()<test:
      testset.append(row)
    else:
      trainset.append(row)
  return trainset,testset
  

In [37]:
def testalgorithm(algf,trainset,testset):
  error=0.0
  for row in testset:
    guess=algf(trainset,row['input'])
    error+=(row['result']-guess)**2
    #print row['result'],guess
    
  #print error/len(testset)
  return error/len(testset)


In [38]:
def crossvalidate(algf,data,trials=100,test=0.1):
  error=0.0
  for i in range(trials):
    trainset,testset=dividedata(data,test)
    error+=testalgorithm(algf,trainset,testset)
  return error/trials

In [39]:
crossvalidate(knnestimate,data)

286.22050079836947

In [40]:
def knn3(d,v): return knnestimate(d,v,k=3)
crossvalidate(knn3,data)

269.9394450708061

In [41]:
def knn1(d,v): return knnestimate(d,v,k=1)
crossvalidate(knn1,data)

268.16735131487565

In [42]:
crossvalidate(weightedknn,data)

249.59661139759854

In [43]:
def knninverse(d,v):
    return weightedknn(d,v,weightf=inverseweight)
crossvalidate(knninverse,data)

214.8237376084945

In [44]:
def wineset2():
  rows=[]
  for i in range(300):
    rating=random()*50+50
    age=random()*50
    aisle=float(randint(1,20)) #
    bottlesize=[375.0,750.0,1500.0][randint(0,2)] #
    price=wineprice(rating,age)
    price*=(bottlesize/750) #
    price*=(random()*0.2+0.9)
    rows.append({'input':(rating,age,aisle,bottlesize),
                 'result':price})
  return rows

In [45]:
data1 = wineset2()

In [46]:
crossvalidate(knn3,data1)

1455.993665947539

In [47]:
crossvalidate(weightedknn,data)

263.49264223862343

In [48]:
def rescale(data,scale):
  scaleddata=[]
  for row in data:
    scaled=[scale[i]*row['input'][i] for i in range(len(scale))]
    scaleddata.append({'input':scaled,'result':row['result']})
  return scaleddata

In [49]:
sdata=rescale(data1,[10,10,0,0.5])
crossvalidate(knn3,sdata)

898.5113273226365

In [50]:
crossvalidate(weightedknn,sdata)

890.54466679497

In [55]:
def rescale(data,scale):
  scaleddata=[]
  for row in data:
    scaled=[scale[i]*row['input'][i] for i in range(len(scale))]
    scaleddata.append({'input':scaled,'result':row['result']})
  return scaleddata

In [56]:
sdata=rescale(data1,[10,10,0,0.5])
crossvalidate(knn3,sdata)

862.4621933919443

In [57]:
crossvalidate(weightedknn,sdata)

917.4831652151703

In [51]:
training_data = [ 
    [100,3.6,85,107],
    [102,2.9,99,129],
    [107,2.2,75,118],
    [141,1.2,150,256],
    [154,1.7,177,263],
    [165,0.8,183,244],
    [211,6,22,308],
    [229,5.7,32,333],
    [233,4.8,62,316]
]

In [52]:
query = [244,0.75,0.285714286, 0]

In [53]:
def weightedknn(data,vec1,k=5,weightf=gaussian):
  # Get distances
  dlist=getdistances(data,vec1)
  avg=0.0
  totalweight=0.0
  
  # Get weighted average
  for i in range(k):
    dist=dlist[i][0]
    idx=dlist[i][1]
    weight=weightf(dist)
    avg+=weight*data[idx]['result']
    totalweight+=weight
  if totalweight==0: return 0
  avg=avg/totalweight
  return avg

In [54]:
crossvalidate(knn3,training_data)

ZeroDivisionError: float division by zero

In [None]:
def getdistances(data,vec1):
  distancelist=[]
  
  # Loop over every item in the dataset
  for i in range(len(data)):
    vec2=data[i]['input']
    
    # Add the distance and the index
    distancelist.append((euclidean(vec1,vec2),i))
  
  # Sort by distance
  distancelist.sort()
  return distancelist

#
# The kNN function uses the list of distances and
# averages the top k results
#

def knnestimate(data,vec1,k=5):
  # Get sorted distances
  dlist=getdistances(data,vec1)
  avg=0.0
  
  # Take the average of the top k results
  for i in range(k):
    idx=dlist[i][1]
    avg+=data[idx]['result']
  avg=avg/k
  return avg