# Homework 3

## Tasks(Cook/Make prediction)

Let’s split the training data (‘trainInteractions.csv.gz’) as follows:

(1) Reviews 1-400,000 for training \
(2) Reviews 400,000-500,000 for validation

1. Evaluate the performance (accuracy) of the baseline model on the validation set you have built 

In [1]:
import gzip
import random
import csv
from collections import defaultdict
from sklearn import linear_model

In [2]:
path="assignment1/trainInteractions.csv.gz"

In [3]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [4]:
def build_validate_set(dataset):
    validate_set=[]
    random.seed(50)
    for d in dataset:
        positive_entry=[d[0],d[1],1]
        negative_entry_item_set=itemSet.difference(itemsPerUser[d[0]])
        random_item=random.choice(list(negative_entry_item_set))
        negative_entry=[d[0],random_item,0]
        validate_set.append(positive_entry)
        validate_set.append(negative_entry)
    return validate_set

In [5]:
def build_train_set(dataset):
    train_set=[]
    for d in dataset:
        positive_entry=[d[0],d[1]]
        train_set.append(positive_entry)
    return train_set

In [6]:
dataset = list(readCSV(path))

In [7]:
dataset[0]

('88348277',
 '03969194',
 {'user_id': '88348277',
  'recipe_id': '03969194',
  'date': '2004-12-23',
  'rating': '5'})

In [8]:
itemsPerUser = defaultdict(set)
usersPerItem=defaultdict(set)
itemSet=set([d[1] for d in dataset])

In [9]:
for d in dataset:
    user,item = d[0], d[1]
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

In [10]:
train_set=build_train_set(dataset[:400000])
validate_set=build_validate_set(dataset[400000:500000])

In [11]:
train_set[:10]

[['88348277', '03969194'],
 ['86699739', '27096427'],
 ['03425965', '44197323'],
 ['73973193', '24971400'],
 ['15215209', '60170202'],
 ['75799794', '39662395'],
 ['77745222', '88709727'],
 ['80598779', '09359141'],
 ['35769308', '83909791'],
 ['31763244', '20530585']]

In [12]:
validate_set[:10]

[['90764166', '01768679', 1],
 ['90764166', '10493396', 0],
 ['68112239', '24923981', 1],
 ['68112239', '02378326', 0],
 ['32173358', '57597698', 1],
 ['32173358', '27114352', 0],
 ['30893740', '16266088', 1],
 ['30893740', '35819278', 0],
 ['69780905', '62953151', 1],
 ['69780905', '74035464', 0]]

Base line model

In [13]:
recipeCount = defaultdict(int)
totalCooked = 0

# for user,recipe,_ in readCSV("assignment1/trainInteractions.csv.gz"):
#   recipeCount[recipe] += 1
#   totalCooked += 1
for d in train_set:
  recipeCount[d[1]] += 1
  totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
  count += ic
  return1.add(i)
  if count > totalCooked/2: break

In [14]:
total_size=len(validate_set)

In [15]:
correct_size=0
for i in range(total_size):
    sample=validate_set[i]
    item=sample[1]
    predict=0
    if item in return1:
        predict=1
    if predict==sample[2]:
        correct_size+=1

In [16]:
accuracy=correct_size/total_size
accuracy

0.67036

2. See if you can find a better threshold and report its performance on your validation set 

In [17]:
def baseline_model_accuracy(threshhold):
    return1 = set()
    count = 0
    for ic, i in mostPopular:
      count += ic
      return1.add(i)
      if count > totalCooked*threshhold: break
    correct_size=0
    for i in range(total_size):
        sample=validate_set[i]
        item=sample[1]
        predict=0
        if item in return1:
            predict=1
        if predict==sample[2]:
            correct_size+=1
    accuracy=correct_size/total_size
    print([accuracy,threshhold])

We go through 1/10 to 9/10 to check the accuracy

In [18]:
for i in range(10):
    baseline_model_accuracy((i+1)*0.1)

[0.54719, 0.1]
[0.590585, 0.2]
[0.627285, 0.30000000000000004]
[0.653845, 0.4]
[0.67036, 0.5]
[0.6761, 0.6000000000000001]
[0.66722, 0.7000000000000001]
[0.63698, 0.8]
[0.557975, 0.9]
[0.464665, 1.0]


We see that when threshhold is 3*totalCooked/5, the accuracy improve to 0.6761, so 3*totalCooked/5 is a better threshhold

3. A stronger baseline than the one provided might make use of the Jaccard similarity (or another similarity
metric). Given a pair (u,g) in the validation set, consider all training items g′ that user u has cooked. For each, compute the Jaccard similarity between g and g′, i.e., users (in the training set) who have made ′
g and users who have made g . Predict as ‘made’ if the maximum of these Jaccard similarities exceeds a threshold (you may choose the threshold that works best). Report the performance on your validation set (1 mark).

In [24]:
itemsPerUser = defaultdict(set)
usersPerItem=defaultdict(set)
for d in train_set:
    user,item = d[0], d[1]
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

In [25]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [26]:
def predictUsingJaccard(item,user,t):
    predict=0
    maxSim=0
    for d in itemsPerUser[user]:
        sim=Jaccard(usersPerItem[d],usersPerItem[item])
        maxSim=max(maxSim,sim)
    if maxSim>t:
        predict=1
    return predict

In [27]:
def jaccard_model_accuracy(t):
    total_size=len(validate_set)
    correct_size=0
    for i in range(total_size):
        sample=validate_set[i]
        item=sample[1]
        user=sample[0]
        predict=predictUsingJaccard(item,user,t)
        if predict==sample[2]:
            correct_size+=1
    accuracy=correct_size/total_size
    print([accuracy,t])

We go through t from 1/10 to 10/10 to check the accuracy

In [29]:
for i in range(10):
    jaccard_model_accuracy((i+1)*0.1)

[0.500045, 0.1]
[0.49333, 0.2]
[0.49139, 0.30000000000000004]
[0.494715, 0.4]
[0.500095, 0.5]
[0.50007, 0.6000000000000001]
[0.5, 0.7000000000000001]
[0.5, 0.8]
[0.5, 0.9]
[0.5, 1.0]


Then we go through t from 1/100 to 1/10 to check the accuracy

In [31]:
for i in range(10):
    jaccard_model_accuracy((i+1)*0.01)

[0.594055, 0.01]
[0.584695, 0.02]
[0.5612, 0.03]
[0.53559, 0.04]
[0.518795, 0.05]
[0.510095, 0.06]
[0.50591, 0.07]
[0.502995, 0.08]
[0.50182, 0.09]
[0.500045, 0.1]


Then we go through t from 1/1000 to 1/100 to check the accuracy

In [33]:
for i in range(10):
    jaccard_model_accuracy((i+1)*0.001)

[0.58898, 0.001]
[0.589675, 0.002]
[0.591, 0.003]
[0.59235, 0.004]
[0.59303, 0.005]
[0.593125, 0.006]
[0.59335, 0.007]
[0.594025, 0.008]
[0.59398, 0.009000000000000001]
[0.594055, 0.01]


In this case, we use 0.01 as our threshhold, and the accuracy on the validation set is 0.594055.

4. Improve the above predictor by incorporating both a Jaccard-based threshold and a popularity based threshold. Report the performance on your validation set.

In [44]:
def jaccardPopularityModel(train_set, test_set, jt=0.01, pt=0.6):
    itemsPerUser = defaultdict(set)
    usersPerItem=defaultdict(set)
    for d in train_set:
        user,item = d[0], d[1]
        itemsPerUser[user].add(item)
        usersPerItem[item].add(user)
    
    itemSet=set([d[1] for d in train_set])
    userSet=set([d[0] for d in train_set])
    
    # calculate average number of recipes made in the train_set
    averageNum=len(train_set)/len(userSet)

    # calculate most popular set in train_set
    recipeCount = defaultdict(int)
    totalCooked = 0
    for d in train_set:
      recipeCount[d[1]] += 1
      totalCooked += 1

    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
      count += ic
      return1.add(i)
      if count > totalCooked*pt: break

    # evalute on test_set
    total_size=len(test_set)
    correct_size=0
    for i in range(total_size):
        sample=test_set[i]
        item=sample[1]
        user=sample[0]
        predict=0
        
        # if we have not met this user before, just recommend the most popular
        if user not in userSet:
            if item in return1:
                predict=1
        else:
            # if we have not met this recipe before, if the user made many recipes before, then recommend
            if item not in itemSet:
                if len(itemsPerUser[user])>averageNum:
                    predict=1
            else:     
                maxSim=0
                for d in itemsPerUser[user]:
                    sim=Jaccard(usersPerItem[d],usersPerItem[item])
                    maxSim=max(maxSim,sim)
                if maxSim>jt and item in return1:
                    predict=1
                
        if predict==sample[2]:
            correct_size+=1
    accuracy=correct_size/total_size
    print([accuracy,jt,pt])

In [45]:
jaccardPopularityModel(train_set,validate_set,0.01,0.6)

[0.691455, 0.01, 0.6]


Therefore the model incorporating Jaccard and popularity on validation set's accuracy is 0.691455.

5. To run our model on the test set, we’ll have to use the files ‘stub Made.txt’ to find the user id/recipe id pairs about which we have to make predictions. Using that data, run the above model and upload your solution to Kaggle. 

We can change the popularity threshhold to see the outcome:

In [51]:
for i in range(20):
    jaccardPopularityModel(train_set,validate_set,0.01,0.50+i*0.01)

[0.684615, 0.01, 0.5]
[0.685685, 0.01, 0.51]
[0.686745, 0.01, 0.52]
[0.68763, 0.01, 0.53]
[0.688175, 0.01, 0.54]
[0.688875, 0.01, 0.55]
[0.689255, 0.01, 0.56]
[0.689655, 0.01, 0.5700000000000001]
[0.690655, 0.01, 0.58]
[0.691285, 0.01, 0.59]
[0.691455, 0.01, 0.6]
[0.69139, 0.01, 0.61]
[0.691215, 0.01, 0.62]
[0.690815, 0.01, 0.63]
[0.690635, 0.01, 0.64]
[0.69077, 0.01, 0.65]
[0.69071, 0.01, 0.66]
[0.69065, 0.01, 0.67]
[0.690305, 0.01, 0.6799999999999999]
[0.690285, 0.01, 0.69]


Therefore we still use 0.6 as our popularity threshhold.

We can see that the popularity threshhold is more significant than similarity threshhold, so we modified our model.

In [77]:
def jaccardPopularityModel(train_set, test_set, jt=0.01, pt=0.6):
    itemsPerUser = defaultdict(set)
    usersPerItem=defaultdict(set)
    for d in train_set:
        user,item = d[0], d[1]
        itemsPerUser[user].add(item)
        usersPerItem[item].add(user)
    
    itemSet=set([d[1] for d in train_set])
    userSet=set([d[0] for d in train_set])
    
    # calculate average number of recipes made in the train_set
    averageNum=len(train_set)/len(userSet)

    # calculate most popular set in train_set
    recipeCount = defaultdict(int)
    totalCooked = 0
    for d in train_set:
        recipeCount[d[1]] += 1
        totalCooked += 1

    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked*pt: break

    # evalute on test_set
    total_size=len(test_set)
    correct_size=0
    for i in range(total_size):
        sample=test_set[i]
        item=sample[1]
        user=sample[0]
        predict=0
        
        # if we have not met this user before, just recommend the most popular
        if user not in userSet:
            if item in return1:
                predict=1
        else:
            # if we have not met this recipe before, if the user made many recipes before, then recommend
            if item not in itemSet:
                if len(itemsPerUser[user])>4/3*averageNum:
                    predict=1
            else:
                if item in return1:
                    predict=1
                else:
                    maxSim=0
                    for d in itemsPerUser[user]:
                        sim=Jaccard(usersPerItem[d],usersPerItem[item])
                        maxSim=max(maxSim,sim)
                    if maxSim>jt:
                        predict=1
                
        if predict==sample[2]:
            correct_size+=1
    accuracy=correct_size/total_size
    print([accuracy,jt,pt])

In [78]:
jaccardPopularityModel(train_set,validate_set,0.01,0.6)

KeyboardInterrupt: 

We modify it to upload the result to kaggle

In [79]:
def jaccardPopularityModel(train_set, jt=0.01, pt=0.6):
    print("predicting....")
    itemsPerUser = defaultdict(set)
    usersPerItem=defaultdict(set)
    for d in train_set:
        user,item = d[0], d[1]
        itemsPerUser[user].add(item)
        usersPerItem[item].add(user)
    
    itemSet=set([d[1] for d in train_set])
    userSet=set([d[0] for d in train_set])
    
    # calculate average number of recipes made in the train_set
    averageNum=len(train_set)/len(userSet)

    # calculate most popular set in train_set
    recipeCount = defaultdict(int)
    totalCooked = 0
    for d in train_set:
        recipeCount[d[1]] += 1
        totalCooked += 1

    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked*pt: break

    predictions = open("predictions_Made.txt", 'w')
    for l in open("stub_Made.txt"):
        if l.startswith("user_id"):
            predictions.write(l)
            continue
        user,item = l.strip().split('-')
        predict=0
        
        # if we have not met this user before, just recommend the most popular
        if user not in userSet:
            if item in return1:
                predict=1
        else:
            # if we have not met this recipe before, if the user made many recipes before, then recommend
            if item not in itemSet:
                if len(itemsPerUser[user])>4*averageNum/3:
                    predict=1
            else:
                if item in return1:
                    predict=1
                else:
                    maxSim=0
                    for d in itemsPerUser[user]:
                        sim=Jaccard(usersPerItem[d],usersPerItem[item])
                        maxSim=max(maxSim,sim)
                    if maxSim>jt:
                        predict=1
        if predict==1:
            predictions.write(user + '-' + item + ",1\n")
        else:
            predictions.write(user + '-' + item + ",0\n")
    predictions.close()
    print("predicting finished!")

In [80]:
train_set=build_train_set(dataset)

In [81]:
jaccardPopularityModel(train_set,0.001,0.6)

predicting....
predicting finished!
