In [376]:
import math
import random
from io import StringIO
from pprint import pprint

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

In [None]:
'''Permutations giving a starting point'''
portnames = ["PAN", "AMS", "CAS", "NYC", "HEL"]
 
def permutations(route, ports):
    """Print every route that starts with `route` and then visits all of `ports`.

    route -- list of port indices (into `portnames`) already on the route
    ports -- list of port indices that still have to be visited
    """
    if len(ports) == 0:
        # nothing left to visit: the route is complete, print it by name
        names = [portnames[stop] for stop in route]
        print(' '.join(names))
        return
    # try every remaining port as the next stop and recurse on the rest
    for pos, next_port in enumerate(ports):
        remaining = ports[:pos] + ports[pos+1:]
        permutations(route + [next_port], remaining)

# this will start the recursion with 0 as the first stop
permutations([0], list(range(1, len(portnames))))

In [None]:
'''Distance calculation using brute force approaches'''
# port codes; a route is a list of indices into this list
portnames = ["PAN", "AMS", "CAS", "NYC", "HEL"]

# https://sea-distances.org/
# nautical miles converted to km

# D[i][j] is the distance between portnames[i] and portnames[j];
# the matrix is symmetric with a zero diagonal
D = [
        [0,8943,8019,3652,10545],
        [8943,0,2619,6317,2078],
        [8019,2619,0,5836,4939],
        [3652,6317,5836,0,7825],
        [10545,2078,4939,7825,0]
    ]

# https://timeforchange.org/co2-emissions-shipping-goods
# assume 20g per km per metric ton (of pineapples)

# emission factor multiplied with a route's total distance;
# 0.020 presumably expresses the 20 g above in kg (results are printed as kg)
co2 = 0.020

# DATA BLOCK ENDS

# these variables are initialised to nonsensical values
# your program should determine the correct values for them
# (permutations() overwrites both whenever it finds a cheaper complete route)
smallest = 1000000
bestroute = [0, 0, 0, 0, 0]

def permutations(route, ports):
    """Explore every ordering of `ports` appended to `route`, tracking the cheapest.

    route -- list of port indices visited so far
    ports -- list of port indices still to be visited

    Whenever a complete route has lower emissions than the best one seen so
    far, the module-level `smallest` and `bestroute` are updated.
    """
    global smallest, bestroute
    if len(ports) <= 0:
        # complete route: add up the distance of each consecutive leg
        legs = [D[src][dst] for src, dst in zip(route, route[1:])]
        total_emission = sum(legs) * co2
        if total_emission < smallest:
            smallest = total_emission
            bestroute = route
    # with no ports left this loop simply does not run
    for pos, next_port in enumerate(ports):
        permutations(route + [next_port], ports[:pos] + ports[pos+1:])

def main():
    """Search all routes starting from port 0 and print the best one with its emissions."""
    # every port except the first one still has to be visited
    to_visit = list(range(1, len(portnames)))
    permutations([0], to_visit)

    # report the winning route by port name, followed by its emissions
    route_names = [portnames[i] for i in bestroute]
    print(' '.join(route_names) + " %.1f kg" % smallest)

In [None]:
'''Hill climbing'''                              	 
# generate random mountains                                                                               	 

# amplitudes of the three sine components: the first is fixed,
# the other two are drawn at random from [0, 1/3)
w = [.05, random.random()/3, random.random()/3]
# 100 heights: a baseline of 1 plus three sine waves of increasing wavelength
h = [1.+math.sin(1+x/.6)*w[0]+math.sin(-.3+x/9.)*w[1]+math.sin(-.2+x/30.)*w[2] for x in range(100)]

def climb(x, h):
    """Greedy hill climbing on the height profile `h`, starting at index `x`.

    In every round the climber looks up to five steps to the right and then
    up to five steps to the left, moving to any position that is higher than
    the current one. The search stops when a full round finds nothing higher.

    x -- starting index, must be a valid index into `h`
    h -- sequence of heights

    Returns the index of the summit that was reached.

    Positions outside `h` are never inspected: looking past the right end
    would raise IndexError, and a negative index on the left would silently
    wrap around to the far end of the profile.
    """
    n = len(h)
    summit = False

    while not summit:
        summit = True         # stop unless there's a way up
        for i in range(1, 6):
            # go right, but only while the candidate is still on the profile
            if x + i < n and h[x + i] > h[x]:
                x = x + i
                summit = False
        for i in range(1, 6):
            # go left, never below index 0 (no wrap-around)
            if x - i >= 0 and h[x - i] > h[x]:
                x = x - i
                summit = False
    return x


def main(h):
    """Climb from a random starting index and return (start, summit)."""
    # pick the starting position at random from 1..98
    start = random.randint(1, 98)
    return start, climb(start, h)

# v is the (start index, summit index) pair returned by main
v = main(h)

# draw the height profile, the starting point as a square and the reached summit as a triangle
plt.plot(np.arange(100),h,v[0],h[v[0]],"s",v[1],h[v[1]],"^")
plt.show()

In [None]:
def accept_prob(S_old, S_new, T):
    """Simulated-annealing acceptance probability for moving from S_old to S_new.

    A better solution (higher score) is always accepted; otherwise the
    probability decays exponentially with the score loss divided by the
    temperature T.
    """
    improved = S_new > S_old
    if not improved:
        loss = S_old - S_new
        return np.exp(-loss / T)
    return 1


# the above function will be used as follows. this is shown just for
# your information; you don't have to change anything here
def accept(S_old, S_new, T):
    """Draw a uniform random number and print whether the new solution is accepted."""
    accepted = random.random() < accept_prob(S_old, S_new, T)
    print(True if accepted else False)


In [None]:
'''Simulated annealing implementation'''
N = 100     # size of the problem is N x N                                      
steps = 3000    # total number of iterations                                        
tracks = 50

# generate a landscape with multiple local optima                                          
def generator(x, y, x0=0.0, y0=0.0):
    """Height of the landscape at grid position (x, y), shifted by (x0, y0).

    Two broad sine bumps give one global shape; the small cosine ripples on
    top of them create the local optima.
    """
    # grid coordinates scaled by the problem size N and shifted
    u = x/N-x0
    v = y/N-y0
    return np.sin(u*np.pi)+np.sin(v*np.pi)+\
        .07*np.cos(12*u*np.pi)+.07*np.cos(12*v*np.pi)

# random shift of the landscape in each direction, drawn from [-0.5, 0.5)
x0 = np.random.random() - 0.5
y0 = np.random.random() - 0.5
# N x N array of heights: generator evaluated at every integer grid index
h = np.fromfunction(np.vectorize(generator), (N, N), x0=x0, y0=y0, dtype=int)
# grid coordinates of the highest point of the landscape
peak_x, peak_y = np.unravel_index(np.argmax(h), h.shape)

# starting points: one random grid position in [0, N) per search track
x = np.random.randint(0, N, tracks)
y = np.random.randint(0, N, tracks)

def main():
    """Run simulated annealing on every search track and print how many reached the peak.

    The module-level arrays `x` and `y` hold the current position of each
    track and are updated in place.
    """
    global x
    global y

    for step in range(steps):
        # temperature schedule: cubic decay of the remaining fraction, floored at .0001
        remaining = (steps - step)/steps
        T = max(.0001, remaining**3-.005)
        # update solutions on each search track
        for i in range(tracks):
            # candidate within two cells of the current position, clipped to the grid
            x_lo, x_hi = max(0, x[i]-2), min(N, x[i]+2+1)
            x_new = np.random.randint(x_lo, x_hi)
            y_lo, y_hi = max(0, y[i]-2), min(N, y[i]+2+1)
            y_new = np.random.randint(y_lo, y_hi)
            S_old = h[x[i], y[i]]
            S_new = h[x_new, y_new]

            # always move uphill; move downhill with the annealing probability
            # (the random number is only drawn when the candidate is not better)
            if S_new > S_old or random.random() < np.exp(-(S_old - S_new) / T):
                x[i], y[i] = x_new, y_new

    # number of tracks that ended exactly on the peak
    on_peak = sum(x[j] == peak_x and y[j] == peak_y for j in range(tracks))
    print(on_peak)
main()


In [None]:
def generate(p1,size=10000):
    """Return `size` random zeros and ones where each value is one with probability p1."""
    outcomes = [0, 1]
    weights = [1-p1, p1]
    return np.random.choice(outcomes, size=size, p=weights)

def count(seq):
    """Count the positions in `seq` that end a run of at least five consecutive ones.

    Overlapping windows are counted separately, so six ones in a row
    contribute two.
    """
    hits = 0
    run = 0          # length of the current run of ones
    for bit in seq:
        if bit == 1:
            run += 1
            if run >= 5:
                hits += 1
        else:
            run = 0
    return hits

def main(p1):
    """Generate a random 0/1 sequence with P(1) = p1 and return its five-ones count."""
    return count(generate(p1))

print(main(2/3))

In [None]:
'''probability of choosing a fisher given a sex'''
countries = ['Denmark', 'Finland', 'Iceland', 'Norway', 'Sweden']
populations = [5615000, 5439000, 324000, 5080000, 9609000]
male_fishers = [1822, 2575, 3400, 11291, 1731]
female_fishers = [69, 77, 400, 320, 26] 

def guess(winner_gender):
    """Guess the most likely home country of a fisher of the given gender.

    Returns (country, probability in percent), where the probability is that
    country's share of all fishers of that gender.
    """
    # anything other than 'female' is treated as male
    fishers = female_fishers if winner_gender == 'female' else male_fishers
    # position of the country with the most fishers (first one on ties)
    top = fishers.index(max(fishers))
    share = fishers[top] / sum(fishers)
    return (countries[top], share * 100)

def main():
    """Print the best country guess and its probability for a male and a female winner."""
    male_country, male_pct = guess("male")
    print("if the winner is male, my guess is he's from %s; probability %.2f%%" % (male_country, male_pct))
    female_country, female_pct = guess("female")
    print("if the winner is female, my guess is she's from %s; probability %.2f%%" % (female_country, female_pct))

main()


In [None]:
'''the bayes rule applied'''
def bot8(pbot, p8_bot, p8_human):
    """Print P(bot | username has 8 digits) using Bayes' rule.

    pbot     -- prior probability that an account is a bot
    p8_bot   -- probability of an 8-digit username given a bot
    p8_human -- probability of an 8-digit username given a human
    """
    # P(8 digits and bot)
    joint_bot = p8_bot * pbot
    # P(8 digits), summed over bots and humans
    evidence = joint_bot + p8_human * (1-pbot)
    # P(A|B) = [P(B|A) * P(A)] / P(B)
    print(joint_bot / evidence)

# you can change these values to test your program with different values
pbot = 0.1
p8_bot = 0.8
p8_human = 0.05

bot8(pbot, p8_bot, p8_human)

In [None]:
'''rolling a dice and determining the roll using bayes rule'''
p1 = [1/6, 1/6, 1/6, 1/6, 1/6, 1/6]   # normal
p2 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.5]   # loaded

def roll(loaded):
    """Roll a die ten times, printing every outcome, and return the rolls (values 1-6).

    loaded -- when true the loaded distribution p2 is used, otherwise the fair p1
    """
    message, p = ("rolling a loaded die", p2) if loaded else ("rolling a normal die", p1)
    print(message)

    # np.random.choice yields 0..5, so shift by one to get faces 1..6
    sequence = np.random.choice(6, size=10, p=p) + 1
    for outcome in sequence:
        print("rolled %d" % outcome)

    return sequence

def bayes(sequence):
    """Return True when the rolls in `sequence` are more likely from the loaded die.

    Starting from prior odds of 1:1, the odds are multiplied by the
    likelihood ratio loaded/normal of every observed roll.
    """
    odds = 1.0
    for outcome in sequence:
        face = outcome - 1      # rolls are 1..6, probability lists are 0-based
        odds *= p2[face] / p1[face]
    return True if odds > 1 else False

sequence = roll(True)
if bayes(sequence):
    print("I think loaded")
else:
    print("I think normal")


In [None]:
'''using linear regression for predicting the price of multiple cottages'''
X = [[66, 5, 15, 2, 500], 
     [21, 3, 50, 1, 100], 
     [120, 15, 5, 2, 1200]]
c = [3000, 200, -50, 5000, 100]    # coefficient values

def predict(X, c):
    """Print the predicted price of every cabin in X.

    X -- list of feature rows, one per cabin
    c -- coefficient for each feature; the price is the sum of feature * coefficient
    """
    for cabin in X:
        print(sum(cabin[i] * c[i] for i in range(len(cabin))))

predict(X, c)

In [19]:
'''Calculating error for deciding the best set of coefficients'''
# data
X = np.array([[66, 5, 15, 2, 500], 
              [21, 3, 50, 1, 100], 
              [120, 15, 5, 2, 1200]])
y = np.array([250000, 60000, 525000])

# alternative sets of coefficient values
c = np.array([[3000, 200 , -50, 5000, 100], 
              [2000, -250, -100, 150, 250], 
              [3000, -100, -150, 0, 150]])   

def find_best(X, y, c):
    """Print the index of the coefficient set with the smallest squared error.

    X -- feature matrix, one row per cabin
    y -- true prices, one per cabin
    c -- candidate coefficient sets, one row per set

    For every set the predictions X @ coefficients are compared with y and
    the sum of squared errors decides the winner (the first one on ties).
    If `c` is empty, -1 is printed.
    """
    features = np.asarray(X)
    targets = np.asarray(y)

    # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0
    smallest_error = np.inf
    best_index = -1
    for index, coeff in enumerate(c):
        predictions = features @ np.asarray(coeff)
        error = ((targets - predictions) ** 2).sum()
        if error < smallest_error:
            smallest_error = error
            best_index = index
    print("the best set is set %d" % best_index)


find_best(X, y, c)


the best set is set 1


In [29]:
'''linear regression on a matrix of values'''

input_string = '''
25 2 50 1 500 127900
39 3 10 1 1000 222100
13 2 13 1 1000 143750
82 5 20 2 120 268000
130 6 10 2 600 460700
115 6 10 1 550 407000
'''

np.set_printoptions(precision=1)    # this just changes the output settings for easier reading
 
def fit_model(input_file):
    """Fit a least-squares linear model to whitespace-separated data and print it.

    Each row of `input_file` holds five feature values followed by the price.
    The fitted coefficients are printed first, then the model's predictions
    for the same rows.
    """
    data = np.genfromtxt(input_file)
    features, prices = data[:, :5], data[:, 5]
    # lstsq returns (solution, residuals, rank, singular values); only the solution is needed
    coefficients = np.linalg.lstsq(features, prices, rcond=None)[0]

    print(coefficients)
    print(features @ coefficients)

# simulate reading a file
input_file = StringIO(input_string)
fit_model(input_file)


[2989.6  800.6  -44.8 3890.8   99.8]
[127907.6 222269.8 143604.5 268017.6 460686.6 406959.9]


In [36]:
'''using linear regression in numpy using train and test data'''
train_string = '''
25 2 50 1 500 127900
39 3 10 1 1000 222100
13 2 13 1 1000 143750
82 5 20 2 120 268000
130 6 10 2 600 460700
115 6 10 1 550 407000
'''

test_string = '''
36 3 15 1 850 196000
75 5 18 2 540 290000
'''

def main():
    """Fit a linear model on the training cabins and print predictions for the test cabins."""
    np.set_printoptions(precision=1)    # shorter numbers in the printed arrays

    # training data: the first five columns are features, the sixth is the price
    train = np.genfromtxt(StringIO(train_string))
    x_train = train[:, :5]
    y_train = train[:, 5]
    # least-squares fit; the first item of the result holds the coefficients
    c = np.linalg.lstsq(x_train, y_train, rcond=None)[0]

    # test data: only the feature columns are used
    test = np.genfromtxt(StringIO(test_string))
    x_test = test[:, :5]

    # the linear regression coefficients
    print(c)

    # the predicted prices for the two new cabins in the test data set
    print(x_test @ c)


main()


[2989.6  800.6  -44.8 3890.8   99.8]
[198102.4 289108.3]


In [24]:
'''nearest neighbor using Euclidean distance between two points'''
x_train = np.random.rand(10, 3)   # generate 10 random vectors of dimension 3
x_test = np.random.rand(3)        # generate one more random vector of the same dimension

def dist(a, b):
    """Euclidean distance between the points `a` and `b`."""
    squared_gaps = ((ai - bi)**2 for ai, bi in zip(a, b))
    return np.sqrt(sum(squared_gaps))
    
def nearest(x_train, x_test):
    """Print the index of the row of `x_train` closest to `x_test` (Euclidean distance).

    x_train -- 2-D collection of training vectors, one per row
    x_test  -- a single vector of the same dimension

    On ties the first closest row wins. With no training rows, -1 is printed.
    """
    train = np.asarray(x_train, dtype=float)
    if train.shape[0] == 0:
        print(-1)
        return
    # distance from every training row to the test vector in one step
    gaps = train - np.asarray(x_test, dtype=float)
    distances = np.sqrt((gaps ** 2).sum(axis=1))
    # argmin returns the first minimum; int() keeps the printed form a plain number
    print(int(np.argmin(distances)))

nearest(x_train, x_test)

8


In [132]:
# create random data with two classes
X, Y = make_blobs(n_samples=16, n_features=2, centers=2, center_box=(-2, 2))

# scale the data so that all values are between 0.0 and 1.0
X = MinMaxScaler().fit_transform(X)

# split two data points from the data as test data and
# use the remaining n-2 points as the training data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=2)

# place-holder for the predicted classes
y_predict = np.empty(len(y_test), dtype=np.int64)

# produce line segments that connect the test data points
# to the nearest neighbors for drawing the chart
lines = []

# distance function
def dist(a, b):
    """Euclidean distance between the points `a` and `b`."""
    squared_gaps = ((ai - bi)**2 for ai, bi in zip(a, b))
    return np.sqrt(sum(squared_gaps))


def main(X_train, X_test, y_train, y_test):
    """Classify every test point by majority vote among its 3 nearest training points.

    X_train, y_train -- training points and their classes (0 or 1)
    X_test           -- points to classify
    y_test           -- not used inside this function

    The predictions are written into the module-level `y_predict` and then
    printed. For every test point, one line segment per neighbour is appended
    to the module-level `lines` for drawing the chart.
    """
    global y_predict
    global lines

    k = 3    # classify our test items based on the classes of 3 nearest neighbors

    # process each of the test data points
    for i, test_item in enumerate(X_test):
        # calculate the distances to all training points
        distances = [dist(train_item, test_item) for train_item in X_train]

        # indices of the k smallest distances, nearest first; the stable sort
        # keeps training points with equal distances in their original order,
        # so tied points are no longer skipped and no filler index is used
        nearest = np.argsort(distances, kind="stable")[:k]

        # create a line connecting the points for the chart
        for elm in nearest:
            lines.append(np.stack((test_item, X_train[elm])))

        # votes per class as class:count pairs; class 1 wins a tie because it comes first
        neighbour_classes = [y_train[j] for j in nearest]
        d = {label: neighbour_classes.count(label) for label in (1, 0)}
        y_predict[i] = max(d, key=lambda label: d[label])

    print(y_predict)
main(X_train, X_test, y_train, y_test)


[0 0]


In [192]:
'''using Manhattan distance to get the difference of a bag of words'''
import numpy as np

data = [[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1],
        [1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1],
        [1, 1, 1, 0, 1, 3, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1]]

def find_nearest_pair(data):
    """Print the index pair (i, j) of the two rows of `data` with the smallest Manhattan distance.

    data -- list of equally long numeric vectors (bag-of-words counts)

    The distance of a row to itself is set to infinity so that a row is never
    paired with itself when another row exists. On ties the first pair in
    row-major order is reported.
    """
    rows = [np.asarray(row, dtype=float) for row in data]
    N = len(rows)
    # start from infinity everywhere, which leaves the diagonal excluded
    # (builtin float / np.inf: the np.float and np.Inf aliases were removed from NumPy)
    dist = np.full((N, N), np.inf)
    for first in range(N):
        for second in range(first + 1, N):
            # Manhattan distance is symmetric, so fill both halves at once
            gap = np.abs(rows[first] - rows[second]).sum()
            dist[first, second] = dist[second, first] = gap
    pair = np.unravel_index(np.argmin(dist), dist.shape)
    # plain ints keep the printed form "(i, j)" on every NumPy version
    print(tuple(int(index) for index in pair))

find_nearest_pair(data)

(2, 3)


In [370]:
text = '''Humpty Dumpty sat on a wall
Humpty Dumpty had a great fall
all the king's horses and all the king's men
couldn't put Humpty together again'''
import re
def main(text):
    """Print the indices of the two lines of `text` whose TF-IDF vectors are closest.

    Every line of `text` is one document. For each word in a document:

        tf     = occurrences of the word in the document / words in the document
        df     = documents containing the word / number of documents
        tf-idf = tf * log10(1 / df)

    The Euclidean distance between the tf-idf vectors of every pair of lines
    is computed and the closest pair is printed as "(i, j)".
    """
    # first part - one lower-cased word list per line, plus the vocabulary
    docs = [line.lower().split() for line in text.split('\n')]
    # dict keys drop duplicates while keeping first-seen order
    unique_words = list(dict.fromkeys(word for doc in docs for word in doc))
    column = {word: col for col, word in enumerate(unique_words)}

    # second part - number of documents each word occurs in
    docs_with = {word: sum(1 for doc in docs if word in doc) for word in unique_words}

    # third part - tf-idf of every word of every line in a 2-D array
    vector = np.zeros((len(docs), len(unique_words)))
    for row, doc in enumerate(docs):
        for word in set(doc):
            tf = doc.count(word) / len(doc)
            idf = np.log10(len(docs) / docs_with[word])
            vector[row, column[word]] = tf * idf

    # fourth part - Euclidean distance between all pairs of lines;
    # the diagonal stays at infinity so a line is never matched with itself
    n_docs = len(docs)
    line_dist = np.full((n_docs, n_docs), np.inf)
    for first in range(n_docs):
        for second in range(first + 1, n_docs):
            gap = np.sqrt(((vector[first] - vector[second]) ** 2).sum())
            line_dist[first, second] = line_dist[second, first] = gap

    closest = np.unravel_index(np.argmin(line_dist), line_dist.shape)
    # plain ints keep the printed form "(i, j)" on every NumPy version
    print(tuple(int(index) for index in closest))

main(text)

(0, 1)


In [375]:
'''sigmoid function'''

x = np.array([4, 3, 0])
c1 = np.array([-.5, .1, .08])
c2 = np.array([-.2, .2, .31])
c3 = np.array([.5, -.1, 2.53])

def distance(a,b):
    """Sum of the element-wise products of `a` and `b` (their dot product)."""
    return sum(left*right for left, right in zip(a, b))


def sigmoid(z):
    """Logistic sigmoid 1 / (1 + e^-z), returning a value in [0, 1].

    For very negative z, e^-z exceeds the float range and math.exp raises
    OverflowError; the sigmoid is then 0 to within float precision, so 0.0
    is returned instead of letting the error propagate.
    """
    try:
        return 1/(1+math.exp(-z))
    except OverflowError:
        return 0.0

def main():
    """Print the sigmoid output of input `x` under each of the three coefficient vectors."""
    for label, coefficients in (('c1', c1), ('c2', c2), ('c3', c3)):
        print(label, sigmoid(distance(x, coefficients)))

main()

c1 0.1544652650835347
c2 0.45016600268752216
c3 0.8455347349164652
