In [30]:
import gzip
import random
import csv
from collections import defaultdict
from sklearn import linear_model

In [31]:
# Gzip-compressed CSV of user/recipe interactions; passed to readCSV below.
path="trainInteractions.csv.gz"

In [32]:
def readCSV(path):
    """Stream interaction records from a gzip-compressed CSV file.

    The first row is treated as the header; every following row is turned
    into a dict keyed by the header names.

    Args:
        path: location of the gzip-compressed CSV file.

    Yields:
        ``(user_id, recipe_id, record)`` where ``record`` is the full row as a
        dict of strings.

    Raises:
        KeyError: if the header lacks a ``user_id`` or ``recipe_id`` column.
    """
    # The context manager closes the file when the generator is exhausted or
    # closed; newline='' is what the csv module expects from its input file.
    with gzip.open(path, 'rt', newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            # Empty file: end cleanly instead of letting StopIteration escape
            # from the generator (which Python reports as RuntimeError).
            return
        for row in reader:
            d = dict(zip(header, row))
            yield d['user_id'], d['recipe_id'], d

In [33]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity of two sets.

    The result is the size of the intersection divided by the size of the
    union; when the union is empty the similarity is defined as 0.
    """
    shared = s1.intersection(s2)
    combined = s1.union(s2)
    return len(shared) / len(combined) if combined else 0

In [34]:
# Materialise every (user_id, recipe_id, record) tuple so the data can be sliced and re-iterated.
dataset = list(readCSV(path))

In [35]:
# Display the first record to check the parsed layout.
dataset[0]

('88348277',
 '03969194',
 {'user_id': '88348277',
  'recipe_id': '03969194',
  'date': '2004-12-23',
  'rating': '5'})

In [36]:
# Two empty lookup tables (user -> set of items, item -> set of users).
itemsPerUser = defaultdict(set)
usersPerItem = defaultdict(set)
# Every distinct item id, taken from the second field of each record.
itemSet = {record[1] for record in dataset}

In [37]:
# Record every interaction in both directions: which items each user has,
# and which users each item has.
for d in dataset:
    user,item = d[0], d[1]
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

1. Popularity and Jaccard similarity

In [38]:
def build_validate_set1(dataset, items=None, consumed=None):
    """Build a balanced validation set: one positive and one sampled negative per record.

    For every record the real (user, item) pair is emitted with label 1,
    followed by the same user paired with a randomly chosen item that the
    user has no recorded interaction with, labelled 0.

    Args:
        dataset: iterable of records whose first two fields are user and item.
        items: candidate items for negative sampling. Defaults to the
            notebook-level ``itemSet``. The items must be mutually sortable.
        consumed: mapping ``user -> set of items`` that must not be drawn as
            negatives for that user. Defaults to the notebook-level
            ``itemsPerUser``.

    Returns:
        List of ``[user, item, label]`` entries, positive then negative for
        each input record.

    Raises:
        IndexError: if a user has already interacted with every candidate item.
    """
    if items is None:
        items = itemSet
    if consumed is None:
        consumed = itemsPerUser

    # Sort once. Iteration order of a set of strings changes between
    # interpreter runs (hash randomisation), so drawing from list(set) is not
    # reproducible even with a fixed seed; a sorted list is.
    candidates = sorted(items)
    random.seed(50)

    validate_set = []
    for d in dataset:
        user, item = d[0], d[1]
        seen = consumed.get(user, ())
        if 2 * len(seen) > len(candidates):
            # The user covers most of the pool: filter explicitly so the draw
            # always terminates (or raises IndexError when nothing is left).
            negative = random.choice([c for c in candidates if c not in seen])
        else:
            # At least half of the pool is unseen, so rejection sampling needs
            # two draws on average instead of copying the whole pool per record.
            negative = random.choice(candidates)
            while negative in seen:
                negative = random.choice(candidates)
        validate_set.append([user, item, 1])
        validate_set.append([user, negative, 0])
    return validate_set

In [39]:
def build_train_set1(dataset):
    """Reduce each record to a ``[user, item]`` pair (positives only, no labels)."""
    return [[record[0], record[1]] for record in dataset]

In [None]:
# The first 400,000 interactions are used for training; the next 100,000 are
# held out for validation, each paired with one sampled negative.
# The builders defined above are build_train_set1 / build_validate_set1; the
# un-suffixed names are not defined at this point in the notebook.
train_set=build_train_set1(dataset[:400000])
validate_set=build_validate_set1(dataset[400000:500000])

In [None]:
def jaccardPopularityModel(train_set, test_set, jt=0.01, pt=0.6):
    """Evaluate the popularity baseline on a labelled test set.

    An item is predicted as made (1) when it belongs to the smallest group of
    most frequent training items whose interaction counts add up to more than
    ``pt`` of all training interactions; every other item is predicted 0.
    The user- and Jaccard-based rules are not part of this evaluation; they
    are applied by the prediction-writing variant of this function.

    Args:
        train_set: sequence of entries whose first two fields are user and item.
        test_set: sequence of ``[user, item, label]`` entries.
        jt: Jaccard threshold. Not used by the prediction here; it is only
            echoed in the printed result so runs of a sweep can be told apart.
        pt: fraction of all training interactions that the popular set must
            exceed.

    Returns:
        Accuracy on ``test_set`` (0.0 when ``test_set`` is empty). The list
        ``[accuracy, jt, pt]`` is also printed.
    """
    # Count how often each item occurs in the training interactions.
    recipeCount = defaultdict(int)
    for d in train_set:
        recipeCount[d[1]] += 1
    totalCooked = len(train_set)

    # Most frequent first; ties fall back to descending item id.
    mostPopular = sorted(((recipeCount[x], x) for x in recipeCount), reverse=True)

    # Take items until their cumulative count passes the pt share.
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked*pt:
            break

    # Evaluate on test_set.
    total_size = len(test_set)
    correct_size = 0
    for sample in test_set:
        predict = 1 if sample[1] in return1 else 0
        if predict == sample[2]:
            correct_size += 1

    # Guard against an empty test set instead of dividing by zero.
    accuracy = correct_size/total_size if total_size else 0.0
    print([accuracy,jt,pt])
    return accuracy

In [None]:
# Single evaluation run with jt=0.01 and pt=0.6 (the function's default thresholds).
jaccardPopularityModel(train_set,validate_set,0.01,0.6)

In [None]:
# Sweep pt over 50 values starting at 0.59 in steps of 0.001, with jt fixed at 0.01;
# each call prints its own [accuracy, jt, pt] line.
for i in range(50):
    jaccardPopularityModel(train_set,validate_set,0.01,0.59+i*0.001)

Write the prediction file to upload to Kaggle:

In [None]:
def jaccardPopularityModel(train_set, jt=0.01, pt=0.6,
                           stub_path="stub_Made.txt",
                           out_path="predictions_Made.txt"):
    """Write 0/1 predictions for every user-item pair listed in a stub file.

    Decision rules, applied per pair:
      * unknown user: predict 1 only if the item is in the popular set;
      * known user, unknown item: predict 1 if the user has more training
        items than the average user;
      * known user, known item: predict 1 if the item is popular, or if its
        best Jaccard similarity to any of the user's items exceeds ``jt``.

    Args:
        train_set: sequence of entries whose first two fields are user and item.
        jt: Jaccard similarity threshold.
        pt: fraction of all training interactions that the popular set must
            exceed.
        stub_path: input file with one ``user-item`` pair per line; lines
            starting with ``user_id`` are copied to the output unchanged.
        out_path: file that receives ``user-item,prediction`` lines.

    Returns:
        None. The predictions are written to ``out_path``.
    """
    print("predicting....")
    itemsPerUser = defaultdict(set)
    usersPerItem = defaultdict(set)
    for d in train_set:
        user, item = d[0], d[1]
        itemsPerUser[user].add(item)
        usersPerItem[item].add(user)

    itemSet = set(usersPerItem)
    userSet = set(itemsPerUser)

    # Average number of training items per user (0 for an empty training set
    # rather than a division by zero).
    averageNum = len(train_set)/len(userSet) if userSet else 0

    # Most popular items: most frequent first, taken until their cumulative
    # count passes the pt share of all interactions.
    recipeCount = defaultdict(int)
    for d in train_set:
        recipeCount[d[1]] += 1
    totalCooked = len(train_set)
    mostPopular = sorted(((recipeCount[x], x) for x in recipeCount), reverse=True)

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked*pt:
            break

    # Both files are closed by the with-block, also when a line fails to parse.
    with open(stub_path) as stub, open(out_path, 'w') as predictions:
        for l in stub:
            if l.startswith("user_id"):
                predictions.write(l)
                continue
            user, item = l.strip().split('-')
            predict = 0

            if user not in userSet:
                # Unknown user: fall back to popularity only.
                if item in return1:
                    predict = 1
            elif item not in itemSet:
                # Unknown item: recommend only to users who are more active than average.
                if len(itemsPerUser[user]) > averageNum:
                    predict = 1
            elif item in return1:
                predict = 1
            else:
                # Best similarity between this item and anything the user already has.
                maxSim = 0
                for d in itemsPerUser[user]:
                    sim = Jaccard(usersPerItem[d], usersPerItem[item])
                    maxSim = max(maxSim, sim)
                if maxSim > jt:
                    predict = 1

            predictions.write(user + '-' + item + "," + str(predict) + "\n")
    print("predicting finished!")

In [None]:
# Train on every interaction. build_train_set1 keeps positives only, which is
# what the popularity counts and similarity sets of the model expect.
train_set=build_train_set1(dataset)

In [None]:
# Write the prediction file using jt=0.5 and pt=0.6.
jaccardPopularityModel(train_set,0.5,0.6)

2. Classification

In [8]:
import sklearn
from sklearn import linear_model

In [9]:
def build_train_set(dataset, items=None):
    """Build a labelled training set: one positive and one sampled negative per record.

    For every record the real (user, item) pair is emitted with label 1,
    followed by the same user paired with a randomly chosen item that does
    not appear with that user anywhere in ``dataset``, labelled 0.

    Args:
        dataset: sequence of records whose first two fields are user and item.
        items: candidate items for negative sampling. Defaults to the
            notebook-level ``itemSet``. The items must be mutually sortable.

    Returns:
        List of ``[user, item, label]`` entries, positive then negative for
        each input record.

    Raises:
        IndexError: if a user has already interacted with every candidate item.
    """
    if items is None:
        items = itemSet

    itemsPerUser = defaultdict(set)
    for d in dataset:
        itemsPerUser[d[0]].add(d[1])

    # Sort once. Iteration order of a set of strings changes between
    # interpreter runs (hash randomisation), so drawing from list(set) is not
    # reproducible even with a fixed seed; a sorted list is.
    candidates = sorted(items)
    random.seed(50)

    train_set = []
    for d in dataset:
        user, item = d[0], d[1]
        seen = itemsPerUser[user]
        if 2 * len(seen) > len(candidates):
            # The user covers most of the pool: filter explicitly so the draw
            # always terminates (or raises IndexError when nothing is left).
            negative = random.choice([c for c in candidates if c not in seen])
        else:
            # At least half of the pool is unseen, so rejection sampling needs
            # two draws on average instead of copying the whole pool per record.
            negative = random.choice(candidates)
            while negative in seen:
                negative = random.choice(candidates)
        train_set.append([user, item, 1])
        train_set.append([user, negative, 0])
    return train_set

In [10]:
# Labelled training data: every interaction (label 1) plus one sampled negative (label 0) per interaction.
train_set=build_train_set(dataset)

In [11]:
# Index only the real interactions (label 1). The sampled negatives in
# train_set are pairs that did NOT happen, so adding them here would make
# every training pair look like an existing interaction.
itemsPerUser = defaultdict(set)
usersPerItem=defaultdict(set)
for d in train_set:
    if d[2] != 1:
        continue
    user,item = d[0], d[1]
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

# All item and user ids that occur in train_set, positives and negatives alike.
itemSet=set([d[1] for d in train_set])
userSet=set([d[0] for d in train_set])

In [12]:
# Calculate the most popular items in train_set.
# Only real interactions (label 1) count; the sampled negatives are random
# items and would otherwise dilute the popularity ranking.
recipeCount = defaultdict(int)
totalCooked = 0
for d in train_set:
    if d[2] != 1:
        continue
    recipeCount[d[1]] += 1
    totalCooked += 1

# Most frequent first; ties fall back to descending item id.
mostPopular = sorted(((recipeCount[x], x) for x in recipeCount), reverse=True)

# Take items until their cumulative count passes 60% of all interactions.
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalCooked*0.6: break

In [27]:
def feature(user, item):
    """Build the feature vector for one (user, item) pair.

    Reads the notebook-level ``return1``, ``itemsPerUser`` and
    ``usersPerItem`` structures.

    Returns:
        ``[1, popular, similarity]`` where ``popular`` is 1 if the item is in
        ``return1`` and 0 otherwise, and ``similarity`` is the highest Jaccard
        similarity between the item and any *other* item of the user, minus
        0.01.
    """
    feat = [1]
    feat.append(1 if item in return1 else 0)

    # .get avoids inserting empty entries into the shared defaultdicts for
    # users or items that were never seen.
    item_users = usersPerItem.get(item, set())
    maxSim = 0
    for d in itemsPerUser.get(user, ()):
        if d == item:
            # Comparing the item with itself always gives 1.0, which would
            # mark every already-known pair and carry no information.
            continue
        sim = Jaccard(usersPerItem[d], item_users)
        maxSim = max(maxSim, sim)
    # NOTE(review): the 0.01 offset presumably mirrors the Jaccard threshold
    # used by the heuristic model earlier in the notebook - confirm.
    feat.append(maxSim-0.01)
    return feat

In [28]:
# Feature rows (X) and boolean labels (y) for the classifier; filled by the next cell.
X=[]
y=[]

In [29]:
# Rebuild X and y from scratch so that re-running this cell (for example
# after an interrupted run) cannot append a second copy of the data.
X = [feature(d[0], d[1]) for d in train_set]
y = [d[2] == 1 for d in train_set]

KeyboardInterrupt: 

In [None]:
# Fit a logistic-regression classifier with default settings on the feature rows X and labels y.
model = sklearn.linear_model.LogisticRegression()
model.fit(X, y)

In [None]:
# Feature rows for the (user, item) pairs listed in the stub file; filled by the next cell.
X_test=[]

In [None]:
# Read the stub file once: keep header lines, remember each (user, item)
# pair and build its feature row. X_test is rebuilt here so re-running the
# cell cannot leave stale rows that misalign with the predictions.
X_test = []
header_lines = []
pairs = []
with open("stub_Made.txt") as stub:
    for l in stub:
        if l.startswith("user_id"):
            header_lines.append(l)
            continue
        user,item = l.strip().split('-')
        pairs.append((user, item))
        X_test.append(feature(user,item))

# Predict before opening the output so a failure here does not leave a
# truncated predictions file behind.
y_predict=model.predict(X_test)

with open("predictions_Made.txt", 'w') as predictions:
    for l in header_lines:
        predictions.write(l)
    for (user, item), predict in zip(pairs, y_predict):
        if predict:
            predictions.write(user + '-' + item + ",1\n")
        else:
            predictions.write(user + '-' + item + ",0\n")
print("predicting finished!")