In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.preprocessing import MinMaxScaler
from implicit import bpr
import csv
# from surprise import SVD, Reader, Dataset
# from surprise.model_selection import train_test_split

In [2]:
def readCSV(path):
    with open(path, mode='r', newline='', encoding='utf-8') as file:
        csv_reader = csv.DictReader(file)
        for row in csv_reader:
            yield row


# modcloth dataset visualization

In [3]:
modcloth_test = []
modcloth_train = []
modcloth_valid = []
for l in readCSV("./data/modcloth_final_data_processed_test.csv"):
    modcloth_test.append(l)
for l in readCSV("./data/modcloth_final_data_processed_train.csv"):
    modcloth_train.append(l)
for l in readCSV("./data/modcloth_final_data_processed_valid.csv"):
    modcloth_valid.append(l)

In [4]:
modcloth_test[1]

{'waist': '-0.045096139515121966',
 'size': '1.61249466382741',
 'quality': '-0.9563970724487967',
 'hips': '1.2569653494085655',
 'bra_size': '0.6523473985854076',
 'height': '0.30634343084759313',
 'shoe_size': '-0.06312455609563944',
 'item_id': '41',
 'category': '2',
 'cup_size': '4',
 'user_id': '24985',
 'fit': '1'}

In [5]:
modcloth_train_base = [(d['user_id'], d['item_id'], d['fit']) for d in modcloth_train]
modcloth_test_base = [(d['user_id'], d['item_id'], d['fit']) for d in modcloth_test]
modcloth_valid_base = [(d['user_id'], d['item_id'], d['fit']) for d in modcloth_valid]


In [6]:
modcloth_train_base[0]
# fit == 1; 0: small, 2: large

('21754', '738', '1')

## 1-similarity

In [7]:
userItems = defaultdict(set) #user -> collection of item bought
itemUsers = defaultdict(set) #item -> collection of users who bought the item

In [8]:
for d in modcloth_train_base:
    user, item = d[0], d[1]
    userItems[user].add(item)
    itemUsers[item].add(user)

In [9]:
import itertools
first_three_groups = dict(itertools.islice(itemUsers.items(), 3))

for group, values in first_three_groups.items():
    print(f"{group}: {values}")

738: {'22506', '43022', '14127', '10266', '16077', '24107', '14763', '8032', '33093', '23031', '43135', '19120', '1396', '10293', '5789', '45178', '46596', '5163', '3849', '26370', '42764', '45287', '30372', '17514', '38298', '11738', '36289', '32191', '32065', '28846', '31585', '25926', '43414', '44945', '17125', '13675', '25918', '2386', '26339', '15035', '18176', '12107', '27465', '18458', '8188', '16448', '23067', '34879', '12150', '33385', '9305', '25075', '37083', '15030', '20891', '44035', '23970', '27664', '20853', '15796', '13543', '18448', '15860', '41287', '36130', '15822', '40517', '4488', '31033', '34543', '10751', '6699', '34890', '39955', '32100', '10611', '16942', '3891', '45683', '13173', '13989', '10197', '20277', '16966', '18140', '462', '47453', '16620', '39290', '40912', '23221', '7385', '7597', '2609', '11976', '23972', '15356', '263', '30041', '43631', '1994', '6977', '19087', '3837', '33668', '18718', '20458', '36204', '16718', '16060', '29848', '33490', '20791'

In [10]:
# Jaccard similarity function
def jaccard_similarity(list1, list2):
    intersection = len(set(list1).intersection(set(list2)))
    union = len(set(list1).union(set(list2)))
    return intersection / union

In [11]:
def predictionJaccard(data, userItems, itemUsers, threshold):
    pred = []
    cnt = 0
    for user,item,_ in data:
        # print(user)
        cnt += 1
        similarities = []
        for i in userItems[user]:
            # print(cnt)
            if item == i: continue
            similarities.append(jaccard_similarity(itemUsers[item], itemUsers[i]))
        max_sim = max(similarities+[0])
        #print(max_sim)
        if max_sim > threshold:
            pred.append(1)
        else:
            pred.append(0)
        #print(pred)
    return pred

In [12]:
pred = predictionJaccard(modcloth_valid_base,userItems, itemUsers, threshold=0.1)

In [13]:
best_accuracy = 0
best_threshold = 0

In [14]:
thresholds = [i / 1000 for i in range(1, 1000)]

In [15]:
labels = [1 if l[-1] == '1' else 0 for l in modcloth_valid_base]

In [16]:
for threshold in thresholds:
    pred = predictionJaccard(modcloth_valid_base, userItems, itemUsers,threshold)
    accurate = [pred == label for pred, label in zip(pred,labels)]
    accuracy = sum(accurate)/len(accurate)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

In [17]:
best_threshold

0.001

In [18]:
best_accuracy

0.506099770503684

In [19]:
test_pred = predictionJaccard(modcloth_test_base, userItems, itemUsers,0.001)

In [20]:
test_lables = [1 if l[-1] == '1' else 0 for l in modcloth_test_base]

In [21]:
f = open("actual_label_jaccard_modcloth.txt", 'a')
f.write(str(test_lables) + '\n')
f.close()

In [22]:
f = open("predict_label_jaccard_modcloth.txt", 'a')
f.write(str(test_pred) + '\n')
f.close()

In [23]:
accurate = [test_pred == label for test_pred, label in zip(test_pred,test_lables)]
accuracy = sum(accurate)/len(accurate)

In [24]:
accuracy

0.5062205580384105

## 2-popularity

In [25]:
itemCount = defaultdict(int)
totalBought = 0

for user, item, _ in modcloth_train_base:
    itemCount[item] += 1
    totalBought += 1

In [26]:
mostPopular = [(itemCount[x], x) for x in itemCount] #一个列表，包含所有游戏及其被玩次数的元组
mostPopular.sort()
mostPopular.reverse() #排序后的列表反转，使得最受欢迎的游戏排在最前面

In [36]:
modcloth_valid_base[:10]

[('46781', '763', '0'),
 ('2134', '1080', '1'),
 ('25523', '571', '1'),
 ('34468', '922', '1'),
 ('19934', '1151', '1'),
 ('12236', '602', '1'),
 ('20978', '611', '1'),
 ('25723', '68', '0'),
 ('47074', '95', '1'),
 ('26393', '432', '2')]

In [37]:
def prepare_features(modcloth_valid_base, totalBought, mostPopular, userItems, itemUsers):
    X = []  # 特征数组
    Y = []  # 标签数组

    for user, item, fit in modcloth_valid_base:
        features = []

        # 特征1: 流行度（游戏被玩的次数占总次数的比例）
        item_popularity = itemCount[item] / totalBought
        features.append(item_popularity)

        
        X.append(features)
        Y.append(1 if int(fit) == 1 else 0)

    return X, Y

In [38]:
X_train, Y_train = prepare_features(modcloth_valid_base, totalBought, mostPopular, userItems, itemUsers)

In [39]:
# 使用逻辑回归模型
model = linear_model.LogisticRegression()
model.fit(X_train, Y_train)

In [40]:
# 对验证集进行预测
pred = model.predict(X_train)

# 计算准确率
correct_predictions = [p == y for p, y in zip(pred, Y_train)]
accuracy = sum(correct_predictions) / len(correct_predictions)

In [41]:
# 获取模型的系数（权重）
coefficients = model.coef_

# 获取模型的截距（偏差）
intercept = model.intercept_

# 打印这些参数
print("Coefficients:", coefficients)
print("Intercept:", intercept)


Coefficients: [[1.25817983]]
Intercept: [0.78000363]


In [42]:
f = open("actual_label_popularity_modcloth.txt", 'a')
f.write(str(Y_train) + '\n')
f.close()

In [43]:
f = open("predicted_label_popularity_modcloth.txt", 'a')
f.write(str(test_lables) + '\n')
f.close()

In [44]:
accuracy

0.6876434351974876

# renttherunway dataset visualization

In [45]:
renttherunway_test = []
renttherunway_train = []
renttherunway_valid = []
for l in readCSV("./data/renttherunway_final_data_processed_test.csv"):
    renttherunway_test.append(l)
for l in readCSV("./data/renttherunway_final_data_processed_train.csv"):
    renttherunway_train.append(l)
for l in readCSV("./data/renttherunway_final_data_processed_valid.csv"):
    renttherunway_valid.append(l)

In [46]:
renttherunway_train_base = [(d['user_id'], d['item_id'], d['fit']) for d in renttherunway_train]
renttherunway_test_base = [(d['user_id'], d['item_id'], d['fit']) for d in renttherunway_test]
renttherunway_valid_base = [(d['user_id'], d['item_id'], d['fit']) for d in renttherunway_valid]


## 1-Similarity

In [47]:
userItems2 = defaultdict(set) #user -> collection of item bought
itemUsers2 = defaultdict(set) #item -> collection of users who bought the item

In [48]:
for d in renttherunway_train_base:
    user, item = d[0], d[1]
    userItems2[user].add(item)
    itemUsers2[item].add(user)

In [49]:
# Jaccard similarity function
def jaccard_similarity(list1, list2):
    intersection = len(set(list1).intersection(set(list2)))
    union = len(set(list1).union(set(list2)))
    return intersection / union

In [50]:
def predictionJaccard(data, userItems2, itemUsers2, threshold):
    pred = []
    cnt = 0
    for user,item,_ in data:
        # print(user)
        cnt += 1
        similarities = []
        for i in userItems2[user]:
            # print(cnt)
            if item == i: continue
            similarities.append(jaccard_similarity(itemUsers2[item], itemUsers2[i]))
        max_sim = max(similarities+[0])
        #print(max_sim)
        if max_sim > threshold:
            pred.append(1)
        else:
            pred.append(0)
        #print(pred)
    return pred

In [53]:
best_accuracy2 = 0
best_threshold2= 0

In [54]:
thresholds2 = [i / 1000 for i in range(1, 100)]

In [55]:
labels2 = [1 if l[-1] == '1' else 0 for l in renttherunway_valid_base]

In [59]:
for threshold2 in thresholds2:
    pred2 = predictionJaccard(renttherunway_valid_base, userItems2, itemUsers2,threshold2)
    accurate = [pred2 == label2 for pred2, label2 in zip(pred2,labels2)]
    accuracy = sum(accurate)/len(accurate)
    if accuracy > best_accuracy2:
        best_accuracy2 = accuracy
        best_threshold2 = threshold

In [61]:
best_threshold2

0.001

In [62]:
best_accuracy2

0.35855

In [63]:
test_pred2 = predictionJaccard(renttherunway_test_base, userItems2, itemUsers2,0.001)

In [64]:
test_lables2 = [1 if l[-1] == '1' else 0 for l in renttherunway_test_base]

In [65]:
accurate = [test_pred2 == label2 for test_pred2, label2 in zip(test_pred2,test_lables2)]
accuracy = sum(accurate)/len(accurate)

In [66]:
f = open("actual_label_similarit_renttherunway.txt", 'a')
f.write(str(test_lables2) + '\n')
f.close()

In [67]:
f = open("predicted_label_similarit_renttherunway.txt", 'a')
f.write(str(test_pred2) + '\n')
f.close()

In [68]:
accuracy

0.3629941217710187

## 2-Popularity

In [69]:
itemCount2 = defaultdict(int)
totalBought2 = 0

for user, item, _ in renttherunway_train_base:
    itemCount2[item] += 1
    totalBought2 += 1

In [70]:
mostPopular2 = [(itemCount2[x], x) for x in itemCount2] #一个列表，包含所有游戏及其被玩次数的元组
mostPopular2.sort()
mostPopular2.reverse() #排序后的列表反转，使得最受欢迎的游戏排在最前面

In [71]:
def prepare_features(renttherunway_valid_base, totalBought2, mostPopular2, userItems2, itemUsers2):
    X = []  # 特征数组
    Y = []  # 标签数组

    for user, item, fit in renttherunway_valid_base:
        features = []

        # 特征1: 流行度（游戏被玩的次数占总次数的比例）
        item_popularity = itemCount[item] / totalBought
        features.append(item_popularity)

        
        X.append(features)
        Y.append(int(fit))

    return X, Y

In [72]:
X_train2, Y_train2 = prepare_features(renttherunway_valid_base, totalBought2, mostPopular2, userItems2, itemUsers2)

In [73]:
# 使用逻辑回归模型
model2 = linear_model.LogisticRegression()
model2.fit(X_train2, Y_train2)

In [74]:
# 对验证集进行预测
pred2 = model2.predict(X_train2)

# 计算准确率
correct_predictions = [p == y for p, y in zip(pred2, Y_train2)]
accuracy = sum(correct_predictions) / len(correct_predictions)

In [75]:
# 获取模型的系数（权重）
coefficients2 = model2.coef_

# 获取模型的截距（偏差）
intercept2 = model2.intercept_

# 打印这些参数
print("Coefficients:", coefficients2)
print("Intercept:", intercept2)


Coefficients: [[0.28364886]]
Intercept: [1.03857672]


In [76]:
f = open("actual_label_popularity_renttherunway.txt", 'a')
f.write(str(Y_train2) + '\n')
f.close()

In [77]:
f = open("predicted_label_popularity_renttherunway.txt", 'a')
f.write(str(pred2) + '\n')
f.close()

In [78]:
accuracy

0.7386