In [None]:
# Mount Google Drive at /content/drive so the data files stored there can be opened.
# NOTE(review): the data paths used later are relative ("drive/MyDrive/..."), which
# presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import ast
import gzip
import random
import urllib
from collections import defaultdict

import numpy
import scipy.optimize
from sklearn import linear_model

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any warning raised by later cells
# (e.g. from model fitting) is hidden too — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
def assertFloat(x):
    """Check that `x` can be converted to a built-in float.

    Returns None on success. Any exception raised by `float(x)` itself
    (e.g. ValueError for a non-numeric string) propagates unchanged.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Check that `items` has exactly `N` entries, each convertible to float.

    Raises AssertionError on a length mismatch; conversion errors from
    `float()` propagate unchanged.
    """
    assert len(items) == N
    assert all(type(float(entry)) is float for entry in items)

In [None]:
# Open the bankruptcy ARFF file in text mode; the handle `f` is consumed by the parsing cell that follows.
f = open("drive/MyDrive/CSE 258/polish+companies+bankruptcy+data/5year.arff", 'r')

In [None]:
# Read and parse the data
# Skip the ARFF header: everything up to and including the '@data' marker.
# Iterating over the file (rather than polling readline()) stops at end-of-file,
# so a file without an '@data' line raises instead of looping forever.
for l in f:
    if '@data' in l:
        break
else:
    f.close()
    raise ValueError("no '@data' section found in ARFF file")

dataset = []
for l in f:
    if not l.strip(): # Blank line (e.g. trailing newline): nothing to parse
        continue
    if '?' in l: # Missing entry
        continue
    l = l.split(',')
    # Prepend a constant 1 to the parsed numeric fields (presumably an offset feature).
    values = [1] + [float(x) for x in l]
    values[-1] = values[-1] > 0 # Convert to bool
    dataset.append(values)

# The handle has been fully consumed; release it rather than leaving it open.
f.close()

In [None]:
# Split every row into its features (all columns but the last) and its label (the last column).
X = [d[:-1] for d in dataset]
y = [d[-1] for d in dataset]

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
answers = {} # Your answers, keyed by question id ('Q1', 'Q2', ...)

In [None]:
def accuracy(predictions, y):
  """Return the accuracy of `predictions` measured against the labels `y`.

  Thin wrapper around sklearn's `accuracy_score`, taking the predictions
  first and the true labels second.
  """
  return accuracy_score(y, predictions)

In [None]:
def BER(predictions, y):
  """Return the balanced error rate of binary `predictions` against labels `y`.

  The BER is the mean of the false-positive rate and the false-negative rate,
  where the smaller label (in sorted order) is the negative class.

  Degenerate inputs are handled explicitly:
    * only one distinct label across `y` and `predictions`: every prediction
      matches its label, so the result is 0.0;
    * a class that never occurs in `y`: its rate is undefined, so the result
      is the error rate of the class that does occur.

  Returns a plain Python float.
  Raises ValueError when the labels are not binary.
  """
  cm = confusion_matrix(y, predictions)
  if cm.shape == (1, 1):
    # A 1x1 matrix means a single label was seen in both arguments.
    return 0.0
  if cm.shape != (2, 2):
    raise ValueError("BER expects binary labels, got a %dx%d confusion matrix" % cm.shape)
  tn, fp, fn, tp = cm.ravel()
  rates = []
  if fp + tn > 0: # at least one true negative-class sample
    rates.append(fp / (fp + tn))
  if fn + tp > 0: # at least one true positive-class sample
    rates.append(fn / (fn + tp))
  # A 2x2 matrix always covers at least one sample, so `rates` is non-empty here.
  return float(sum(rates) / len(rates))

In [None]:
### Question 1

In [None]:
# Fit a logistic regression (C=1, no class weighting) on the full dataset,
# then predict on the same data it was fitted on.
mod = linear_model.LogisticRegression(C=1)
mod.fit(X,y)

pred = mod.predict(X)

In [None]:
# Accuracy and balanced error rate of `pred` against `y`.
acc1 = accuracy(pred, y)
ber1 = BER(pred, y)

In [None]:
# Record the Q1 result.
answers['Q1'] = [acc1, ber1] # Accuracy and balanced error rate

In [None]:
# Sanity check: Q1 must hold exactly two float-convertible values.
assertFloatList(answers['Q1'], 2)

In [None]:
### Question 2

In [None]:
# Same as Q1, but with class_weight='balanced' passed to the classifier.
# Still fitted and evaluated on the full dataset.
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(X,y)

pred = mod.predict(X)

In [None]:
# Accuracy and balanced error rate of the class-weighted model's `pred` against `y`.
acc2 = accuracy(pred, y)
ber2 = BER(pred, y)

In [None]:
# Record the Q2 result: [accuracy, balanced error rate].
answers['Q2'] = [acc2, ber2]

In [None]:
# Sanity check: Q2 must hold exactly two float-convertible values.
assertFloatList(answers['Q2'], 2)

In [None]:
### Question 3

In [None]:
# Shuffle the rows with a fixed seed so the splits below are reproducible.
# NOTE: the shuffle is in place, so re-running this cell alone reshuffles the
# already-shuffled list and yields a different order; re-run from the parsing cell.
random.seed(3)
random.shuffle(dataset)

In [None]:
# Rebuild features and labels from the shuffled rows (this overwrites the earlier X and y).
X = [d[:-1] for d in dataset]
y = [d[-1] for d in dataset]

In [None]:
# Split into train (first half), validation (third quarter) and test (last quarter).
trainEnd = len(X)//2
validEnd = (3*len(X))//4
Xtrain, Xvalid, Xtest = X[:trainEnd], X[trainEnd:validEnd], X[validEnd:]
ytrain, yvalid, ytest = y[:trainEnd], y[trainEnd:validEnd], y[validEnd:]

In [None]:
# Display the size of each split as a sanity check.
len(Xtrain), len(Xvalid), len(Xtest)

(1515, 758, 758)

In [None]:
# Fit the class-weighted model on the training split only,
# then predict on each of the three splits.
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(Xtrain,ytrain)

predTrain = mod.predict(Xtrain)
predValid = mod.predict(Xvalid)
predTest = mod.predict(Xtest)

In [None]:
# Balanced error rate on each split.
berTrain = BER(predTrain, ytrain)
berValid = BER(predValid, yvalid)
berTest = BER(predTest, ytest)

In [None]:
# Record the Q3 result: BER on train, validation and test, in that order.
answers['Q3'] = [berTrain, berValid, berTest]

In [None]:
# Sanity check: Q3 must hold exactly three float-convertible values.
assertFloatList(answers['Q3'], 3)

In [None]:
### Question 4

In [None]:
# Validation BER for each regularization strength tried in the sweep that follows.
berList = []

In [None]:
# Sweep C over 10**-4 ... 10**4 and record the validation BER of each model.
# berList is reset here so that re-running this cell cannot grow the list past
# nine entries (which would break the index -> C mapping used afterwards).
berList = []
for i in range(-4,5):
  mod = linear_model.LogisticRegression(C=10**i, class_weight='balanced')
  mod.fit(Xtrain,ytrain)

  predValid = mod.predict(Xvalid)
  berValid = BER(predValid, yvalid)
  berList.append(berValid)

In [None]:
# Record the Q4 result: one validation BER per value of C, in sweep order.
answers['Q4'] = berList

In [None]:
# Sanity check: the sweep must have produced exactly nine float-convertible values.
assertFloatList(answers['Q4'], 9)

In [None]:
### Question 5

In [None]:
import numpy as np
# berList[k] was measured with C = 10**(k - 4), so map the best index back to its C.
# np.argmin returns a NumPy integer; raising 10 to a *negative* NumPy integer
# fails with "Integers to negative integer powers are not allowed", so convert
# the index to a Python int and use a float base.
bestIndex = int(np.argmin(berList))
bestC = 10.0 ** (bestIndex - 4)
# Plain float so the value prints cleanly when the answers are serialized.
ber5 = float(berList[bestIndex])

In [None]:
# Record the Q5 result: the best C and its validation BER.
answers['Q5'] = [bestC, ber5]

In [None]:
# Sanity check: Q5 must hold exactly two float-convertible values.
assertFloatList(answers['Q5'], 2)

In [None]:
### Question 6

In [None]:
# Load the review records, one Python-literal dict per line.
# ast.literal_eval only accepts literals, so a malicious or corrupted line
# cannot execute code the way eval() would. The `with` block closes the
# gzip handle once the file has been read.
# NOTE: this rebinds `dataset`, replacing the bankruptcy rows loaded earlier.
dataset = []
with gzip.open("drive/MyDrive/CSE 258/young_adult_10000.json.gz", 'rt', encoding='utf-8') as f:
  for l in f:
    dataset.append(ast.literal_eval(l))

In [None]:
# First 9000 records are used for training, the remainder for testing.
dataTrain = dataset[:9000]
dataTest = dataset[9000:]

In [None]:
# Lookup tables built from the training reviews

usersPerItem = defaultdict(set) # item id -> ids of the users who rated it
itemsPerUser = defaultdict(set) # user id -> ids of the items they rated
reviewsPerUser = defaultdict(list) # user id -> ratings given by that user
reviewsPerItem = defaultdict(list) # item id -> ratings received by that item
ratingDict = {} # (user id, item id) -> rating
ratingTotal = 0 # running sum of all training ratings

for d in dataTrain:
  uid, bid, stars = d['user_id'], d['book_id'], d['rating']
  usersPerItem[bid].add(uid)
  itemsPerUser[uid].add(bid)
  reviewsPerUser[uid].append(stars)
  reviewsPerItem[bid].append(stars)
  ratingDict[(uid,bid)] = stars
  ratingTotal += stars

# Global mean rating over the training set
ave_rating = ratingTotal / len(dataTrain)

In [None]:
def Jaccard(s1, s2):
  """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two sets.

  Two empty sets have an empty union; they are defined here to have
  similarity 0.0 instead of raising ZeroDivisionError.
  """
  union = len(s1.union(s2))
  if union == 0:
    return 0.0
  return len(s1.intersection(s2)) / union

In [None]:
def mostSimilar(i, N):
  """Return up to `N` items most similar to item `i`.

  Similarity is the Jaccard similarity between the sets of users who rated
  each item (taken from `usersPerItem`). The result is a list of
  [similarity, item_id] pairs, most similar first; ties keep the iteration
  order of `usersPerItem`. Item `i` itself is never part of the result.
  Fewer than `N` pairs are returned when there are not enough other items.
  """
  # .get() avoids inserting an unknown `i` into the defaultdict, which would
  # otherwise change the dict's size in the middle of the iteration below.
  target = usersPerItem.get(i, set())
  scored = []
  for item, users in usersPerItem.items():
    if item == i:
      # Exclude the query item explicitly rather than assuming it sorts first:
      # another item with an identical user set would tie with it at 1.0.
      continue
    scored.append([Jaccard(users, target), item])
  # list.sort is stable, so equally similar items keep their original order.
  scored.sort(key=lambda pair: pair[0], reverse=True)
  return scored[:max(N, 0)]

In [None]:
# Record the Q6 result: the 10 items most similar to item '2767052'.
answers['Q6'] = mostSimilar('2767052', 10)

In [None]:
# Sanity check: exactly 10 results, each with a float-convertible similarity in position 0.
assert len(answers['Q6']) == 10
assertFloatList([x[0] for x in answers['Q6']], 10)

In [None]:
### Question 7

In [None]:
def r(u,i):
  """Predict the rating user `u` gives item `i` (item-to-item similarity).

  The prediction is the mean rating of `i` plus the similarity-weighted
  average of how far `u`'s ratings of other items `j` deviate from each
  `j`'s mean, with Jaccard similarity between the items' user sets as weight.

  Fallbacks:
    * item and user both without training ratings -> global mean `ave_rating`
    * item without training ratings               -> the user's mean rating
    * no other item with non-zero similarity      -> the item's mean rating

  Note: a later cell defines another function with this same name.
  """
  itemRatings = reviewsPerItem[i]
  if not itemRatings:
    userRatings = reviewsPerUser[u]
    if not userRatings:
      return ave_rating
    return sum(userRatings)/len(userRatings)
  itemMean = sum(itemRatings)/len(itemRatings)

  weightedDev = 0
  simTotal = 0
  for j in itemsPerUser[u]:
    if j == i:
      continue
    otherMean = sum(reviewsPerItem[j])/len(reviewsPerItem[j])
    sim = Jaccard(usersPerItem[i], usersPerItem[j])
    weightedDev += (ratingDict[(u,j)] - otherMean) * sim
    simTotal += sim
  if simTotal == 0:
    return itemMean
  return itemMean + weightedDev/simTotal

In [None]:
# Collect the true rating and the prediction of r() for every test review.
ratings_pred, ratings = [], []
for d in dataTest:
  ratings.append(d['rating'])
  ratings_pred.append(r(d['user_id'],d['book_id']))

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the predictions on the test reviews
# (the squared difference is symmetric in its two arguments).
mse7 = mean_squared_error(y_true=ratings, y_pred=ratings_pred)

In [None]:
# Record the Q7 result: test MSE of the predictor defined for this question.
answers['Q7'] = mse7

In [None]:
# Sanity check: Q7 must be float-convertible.
assertFloat(answers['Q7'])

In [None]:
### Question 8

In [None]:
def r(u,i):
  """Predict the rating user `u` gives item `i` (user-to-user similarity).

  The prediction is `u`'s mean rating plus the similarity-weighted average
  of how far every other user `v` who rated `i` deviated from `v`'s own
  mean, with Jaccard similarity between the users' item sets as weight.

  Fallbacks:
    * item and user both without training ratings -> global mean `ave_rating`
    * user without training ratings               -> the item's mean rating
    * no other rater with non-zero similarity     -> the user's mean rating

  Lookups use .get() so that unseen users/items are not inserted into the
  defaultdict indexes as empty entries.

  Note: this definition replaces the earlier function of the same name.
  """
  userRatings = reviewsPerUser.get(u, [])
  itemRatings = reviewsPerItem.get(i, [])
  if not userRatings:
    if not itemRatings:
      return ave_rating
    return sum(itemRatings)/len(itemRatings)
  ave_u = sum(userRatings)/len(userRatings)

  userItems = itemsPerUser.get(u, set())
  sum1 = 0
  sum2 = 0
  for v in usersPerItem.get(i, ()):
    if v == u:
      continue
    ave_v = sum(reviewsPerUser[v])/len(reviewsPerUser[v])
    # The same u-v similarity weights the deviation and feeds the normaliser.
    # (Comparing v with itself, as before, always added 1 to the normaliser.)
    sim = Jaccard(userItems, itemsPerUser[v])
    sum1 += (ratingDict[(v,i)] - ave_v) * sim
    sum2 += sim
  if sum2 == 0:
    return ave_u
  return ave_u + sum1/sum2

In [None]:
# Collect the true rating and the prediction of r() for every test review.
ratings_pred, ratings = [], []
for d in dataTest:
  ratings.append(d['rating'])
  ratings_pred.append(r(d['user_id'],d['book_id']))

In [None]:
# Mean squared error of the predictions on the test reviews
# (the squared difference is symmetric in its two arguments).
mse8 = mean_squared_error(y_true=ratings, y_pred=ratings_pred)

In [None]:
# Record the Q8 result: test MSE of the predictor defined for this question.
answers['Q8'] = mse8

In [None]:
# Sanity check: Q8 must be float-convertible.
assertFloat(answers['Q8'])

In [None]:
# Write the collected answers as a single line of text.
with open("answers_hw2.txt", 'w') as f:
  f.write(str(answers) + '\n')