In [14]:
#importing required modules
import numpy as np
from random import shuffle
import pandas as pd

## Q1:GCM

In [15]:
#function to calculate similarity score
def cal_sim(stored,stimulus,alpha,beta):
    d=0
    for i in range(len(stimulus)):
        d+=alpha[i]*(stored[i]-stimulus[i])
    #if the distance measure is negative ,taking the absolute value
    if d<0:
        d=abs(d)
    #calculating similarity score from distance
    sim_score = np.exp(-beta*d)
    #returning score
    return sim_score

In [16]:
#function to make category prediction when given test sample
def prediction(stimulus,labels_unique,stimuli_unique,alpha,beta,gamma,count):
    #initializing array to store label probabilities
    label_probs=np.zeros(len(labels_unique))
    #for each unique label
    for i in range(len(labels_unique)):
        #for each unique stimuli
        for j in range(len(stimuli_unique)):
            #multiplying the count(stimuli,category)*sim(unique_stimuli,test_stimuli)
            label_probs[i]+=count[(int(stimuli_unique[j][0]),int(stimuli_unique[j][1]),i+1)]*cal_sim(stimuli_unique[j],stimulus,alpha,beta)
        #multiplying categorys by their respective gamma values
        label_probs[i]=label_probs[i]*gamma[i]
    #getting the category probabilities
    label_probs=label_probs/sum(label_probs)
    #returning the category with highest probability
    return np.argmax(label_probs)+1

In [17]:
def load_data():
    #loading total train data
    total_train=pd.read_csv('X.csv',header=None)
    #loading test data
    test=pd.read_csv('y.csv',header=None)
    #conversion to array
    total_train=np.array(total_train)
    #shuffling total train data to split it into train and validation data
    np.random.shuffle(total_train)
    #splitting into train and validation data
    validate=total_train[60:]
    train=total_train[:60]
    #conversion to array
    test=np.array(test)
    return total_train,train,validate,test

In [18]:
def to_return_count(data):
    #getting unique stimulus values in train
    stimuli_unique = np.unique(data[:,:-1],axis=0)
    #getting unique label values in train
    labels_unique = np.unique(data[:,-1],axis=0)
    #to store the frequencies of (stimulus,category)
    count={}
    #initializing all possible combinations to zeros
    for i in stimuli_unique:
        for j in labels_unique:
            count[(int(i[0]),int(i[1]),int(j))]=0
    #getting the counts
    for i in data:
        count[(int(i[0]),int(i[1]),int(i[2]))]+=1
    return count,labels_unique,stimuli_unique

In [19]:
#function to return optimal parameters
def optimal_params(validate,labels_unique,stimuli_unique,count):
    #lists of possible alpha,possible gamma values for tuning
    possible_alphas=[[2,1],[4,1],[1.5,1]]
    possible_gammas=[[1,1.5,0.5],[1,2,0.5],[1,1.5,0.7]]
    beta=1
    #finding the optimal alpha and gamma pair that gives the max accuracy on the validation data
    max_acc=0
    for alpha in possible_alphas:
         for gamma in possible_gammas:
            curr_acc=0
            for i in range(np.shape(validate)[0]-1):
                pred=prediction(validate[i][:-1],labels_unique,stimuli_unique,alpha,beta,gamma,count)
                if pred==validate[i][-1]:
                    curr_acc+=1
            if curr_acc>max_acc:
                max_acc=curr_acc
                optimal_alpha=alpha
                optimal_gamma=gamma
    #storing optimal values
    alpha=optimal_alpha
    gamma=optimal_gamma
    return alpha,beta,gamma

In [20]:
def q1_prediction():
    #loading data
    total_train,train,validate,test=load_data()
    #getting count dict on train data
    count,labels_unique,stimuli_unique=to_return_count(train)
    #getting optimal parameters
    alpha,beta,gamma=optimal_params(validate,labels_unique,stimuli_unique,count)
    #getting count dict on whole train data
    count,labels_unique,stimuli_unique=to_return_count(train)
    #to store q1 predictions
    q1_preds=[]
    for i in range(len(test)):
        q1_preds.append(prediction(test[i],labels_unique,stimuli_unique,alpha,beta,gamma,count))
    return q1_preds,alpha,beta,gamma

In [21]:
q1_preds,alpha,beta,gamma=q1_prediction()
print('alpa:',alpha)
print('gamma:',gamma)
print('beta:',beta)
test = pd.read_csv('y.csv',header=None)
test = np.array(test)
for i in range(len(test)):
    print('test stimulus:',test[i])
    print('predicted category:',q1_preds[i])

alpa: [2, 1]
gamma: [1, 1.5, 0.5]
beta: 1
test stimulus: [74 67]
predicted category: 2
test stimulus: [69 63]
predicted category: 2
test stimulus: [92 81]
predicted category: 3
test stimulus: [64 61]
predicted category: 2
test stimulus: [66 84]
predicted category: 2
test stimulus: [76 68]
predicted category: 3
test stimulus: [61 58]
predicted category: 2
test stimulus: [64 76]
predicted category: 2
test stimulus: [68 66]
predicted category: 2
test stimulus: [34 61]
predicted category: 1


## Q2:RCM

In [22]:
class dLocalMAP:
    """
    See Anderson (1990, 1991)
    'Categories' renamed 'clusters' to avoid confusion.
    Discrete version.
    
    Stimulus format is a list of integers from 0 to n-1 where n is the number
    of possible features (e.g. [1,0,1])
    
    args: c, alphas
    """
    def __init__(self, args):
        self.partition = [[]]
        self.c, self.alpha = args
        self.alpha0 = sum(self.alpha.T)
        self.N = 0
        
    def probClustVal(self, k, i, val):
        """Find P(j|k)"""
        cj = len([x for x in self.partition[k] if x[i]==val])
        nk = len(self.partition[k])
        return (cj + self.alpha[i][val])/(nk + self.alpha0[i])
    
    def condclusterprob(self, stim, k):
        """Find P(F|k)"""
        pjks = []
        for i in range(len(stim)):
            cj = len([x for x in self.partition[k] if x[i]==stim[i]])
            nk = len(self.partition[k])
            pjks.append( (cj + self.alpha[i][stim[i]])/(nk + self.alpha0[i]) )
        return np.product( pjks )
    
    def posterior(self, stim):
        """Find P(k|F) for each cluster"""
        pk = np.zeros( len(self.partition) )
        pFk = np.zeros( len(self.partition) )
        # existing clusters:
        for k in range(len(self.partition)):
            pk[k] = self.c * len(self.partition[k])/ ((1-self.c) + self.c * self.N)
            if len(self.partition[k])==0: # case of new cluster
                pk[k] = (1-self.c) / (( 1-self.c ) + self.c * self.N)
            pFk[k] = self.condclusterprob( stim, k)
        # put it together
        pkF = (pk*pFk)#/ sum( pk*pFk )
        return pkF
    
    def stimulate(self, stim):
        """Argmax of P(k|F) + P(0|F)"""
        winner = np.argmax( self.posterior(stim) )
        
        #limiting the number of clusters to only three
        if len(self.partition[winner]) == 0 and winner<3:
            self.partition.append( [] )
        #if the winner is >=3 then assigning as the cluster number among 0 to 2 that has highest posterior
        else:
            winner=np.argmax(self.posterior(stim)[:3])
        self.partition[winner].append(stim)
        
        self.N += 1
    
    def query(self, stimulus):
        """Queried value should be -1."""
        qdim = -1
        for i in range(len(stimulus)):
            if stimulus[i] < 0:
                if qdim != -1:
                    raise Exception( "ERROR: Multiple dimensions queried.")
                qdim = i
        
        self.N = sum([len(x) for x in self.partition])
        
        pkF = self.posterior(stimulus)
        pkF = pkF[:-1] / sum(pkF[:-1]) # eliminate `new cluster' prob
        pjF = np.array( [sum( [ pkF[k] * self.probClustVal(k, qdim, j) \
                for k in range(len(self.partition)-1)] ) 
                for j in range(len( self.alpha[qdim] ))] )
        
        return pjF / sum(pjF)

In [23]:
def testlocalmapD():
    #loading train data
    train = pd.read_csv('X.csv',header=None)
    #conversion to list
    stims = np.array(train)  
    for _ in range(1):
        #'c' value is the probability that two objects belong to that same category,
        #since there are 3 different categories in our data
        #c=3/9=0.333
        
        #continuous variable to discrete variable conversion
        #i have converted weights into 3 discrete values 0,1,2 by subtracting 30(hypothetical min weight value) and then dividing by 22 to get 3 bins
        #i have converted heights into 3 discrete values 0,1,2 by subtracting 55(hypothetical min height value) and then dividing by 10 to get 3 bins
        model = dLocalMAP([0.33333,np.ones((3,3))])
        #shuffling train data
        np.random.shuffle(stims)
        #for each train data point
        for s in stims:
            #converting into discrete values
            s[0]=(s[0]-30)//22
            s[1]=(s[1]-55)//10
            #subtracting 1 from category label
            s[-1]=s[-1]-1
            #conversion to list
            s=list(s)
            #adding to model
            model.stimulate(s)
        #reading test data
        test = pd.read_csv('y.csv',header=None)
        #conversion to array
        test = np.array(test)
        #for each test point
        predictions=[]
        for i in test:
            #converting into discrete values
            i[0]=(i[0]-30)//22
            i[1]=(i[1]-55)//10
            #adding category as -1
            i=list(i)+[-1]
            #printing the category prediction
            predictions.append(np.argmax(model.query(i))+1)
            #print('predicted category:',predictions[-1])
        return predictions

In [24]:
q2_preds=testlocalmapD()
test = pd.read_csv('y.csv',header=None)
test = np.array(test)
for i in range(len(test)):
    print('test stimulus:',test[i])
    print('predicted category:',q2_preds[i])

test stimulus: [74 67]
predicted category: 3
test stimulus: [69 63]
predicted category: 2
test stimulus: [92 81]
predicted category: 3
test stimulus: [64 61]
predicted category: 2
test stimulus: [66 84]
predicted category: 2
test stimulus: [76 68]
predicted category: 3
test stimulus: [61 58]
predicted category: 2
test stimulus: [64 76]
predicted category: 2
test stimulus: [68 66]
predicted category: 2
test stimulus: [34 61]
predicted category: 1


## Q3:showing  exchangeability of data does not matter

In [25]:
#there is shuffle function in the load data function itself,so each time we run the q1_prediction the data is exchanged
for i in range(5):
    curr_preds,alpha,beta,gamma=q1_prediction()
    print(curr_preds)

[2, 2, 3, 2, 2, 3, 2, 2, 2, 1]
[2, 2, 3, 2, 2, 3, 2, 2, 2, 1]
[2, 2, 3, 2, 2, 3, 2, 2, 2, 1]
[2, 2, 3, 2, 3, 3, 2, 2, 2, 1]
[2, 2, 3, 2, 2, 3, 2, 2, 2, 1]


In [26]:
#there is shuffle function in the testlocalmap function itself,so each time we run the q1_prediction the data is exchanged
for i in range(5):
    curr_preds=testlocalmapD()
    print(curr_preds)

[3, 2, 3, 2, 2, 3, 2, 2, 2, 1]
[3, 2, 3, 2, 2, 3, 2, 2, 2, 1]
[3, 2, 3, 2, 2, 3, 2, 2, 2, 1]
[3, 1, 3, 1, 2, 3, 1, 2, 2, 1]
[3, 2, 3, 2, 2, 3, 2, 2, 2, 1]


### please run the third part once more if there are any problem