In [228]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import math

In [229]:
import warnings
# Silence every Python warning for the rest of the session. This also hides
# warnings emitted by the libraries used below, so remove this line while debugging.
warnings.filterwarnings("ignore")

In [230]:
def assertFloat(x):
    """Check that an answer can be converted to a float.

    x: the candidate answer (number, numeric string, numpy scalar, ...).
    Raises whatever float(x) raises when the value cannot be converted.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Check that an answer is a sequence of exactly N float-convertible values.

    items: the candidate answer (anything supporting len() and iteration).
    N: the required number of entries.
    Raises AssertionError on a length mismatch; raises whatever float() raises
    when an entry cannot be converted.
    """
    assert len(items) == N
    # The length is already known to be N, so checking every entry is
    # equivalent to comparing against a list of N float types.
    assert all(type(float(entry)) is float for entry in items)

In [231]:
# Local alternative path: "young_adult_10000.json.gz"
# Read one JSON record per line. The context manager guarantees the gzip
# handle is closed even if a line fails to parse.
with gzip.open("drive/MyDrive/CSE 258/young_adult_10000.json.gz") as f:
    dataset = []
    for l in f:
        dataset.append(json.loads(l))

In [233]:
# Maps a question id (e.g. 'Q1') to its answer; the final cell writes
# str(answers) to a text file.
answers = {} # Put your answers to each question in this dictionary

In [None]:
### Question 1

In [235]:
def feature(datum):
  """Build the Question 1 features and targets.

  datum: iterable of review dicts with 'review_text' and 'rating' keys.
  Returns (X, y): X is an (n, 1) numpy array holding the number of '!'
  characters in each review text; y is a list of the ratings.
  """
  # One pass over the input so that generators are consumed only once.
  pairs = [(review['review_text'].count('!'), review['rating']) for review in datum]
  exclamations = [count for count, _ in pairs]
  ratings = [rating for _, rating in pairs]
  return numpy.array(exclamations).reshape(-1, 1), ratings

In [236]:
from sklearn.metrics import mean_squared_error

# Question 1: least-squares fit of rating against the '!' count, evaluated on
# the same data it was fitted on.
X, Y = feature(dataset)
reg = linear_model.LinearRegression()
reg.fit(X, Y)
theta0, theta1 = reg.intercept_, reg.coef_[0]  # offset and slope
y = reg.predict(X)
mse = mean_squared_error(Y, y)  # squared error is symmetric in its two arguments

In [237]:
# Store plain Python floats. The fitted parameters and the MSE may be numpy
# scalars, which are written as "np.float64(...)" by str(answers) on NumPy >= 2.
answers['Q1'] = [float(theta0), float(theta1), float(mse)]

In [238]:
# Expected entries: theta0, theta1, mse.
assertFloatList(answers['Q1'], 3) # Check the format of your answer (three floats)

In [None]:
### Question 2

In [239]:
def feature(datum):
  """Build the Question 2 features and targets.

  datum: iterable of review dicts with 'review_text' and 'rating' keys.
  Returns (X, y): each row of the numpy array X is
  [length of the review text, number of '!' characters]; y is a list of ratings.
  """
  # One pass over the input so that generators are consumed only once.
  pairs = [
    ([len(review['review_text']), review['review_text'].count('!')], review['rating'])
    for review in datum
  ]
  rows = [row for row, _ in pairs]
  ratings = [rating for _, rating in pairs]
  return numpy.array(rows), ratings

In [240]:
from sklearn.metrics import mean_squared_error

# Question 2: least-squares fit of rating against [text length, '!' count],
# evaluated on the same data it was fitted on.
X, Y = feature(dataset)
reg = linear_model.LinearRegression()
reg.fit(X, Y)
theta0 = reg.intercept_
theta1, theta2 = reg.coef_[0], reg.coef_[1]  # length and '!' coefficients
y = reg.predict(X)
mse = mean_squared_error(Y, y)  # squared error is symmetric in its two arguments

In [242]:
# Store plain Python floats. The fitted parameters and the MSE may be numpy
# scalars, which are written as "np.float64(...)" by str(answers) on NumPy >= 2.
answers['Q2'] = [float(theta0), float(theta1), float(theta2), float(mse)]

In [243]:
# Format check: four float-convertible values (theta0, theta1, theta2, mse).
assertFloatList(answers['Q2'], 4)

In [None]:
### Question 3

In [244]:
def feature(datum, deg):
  """Build polynomial features of the '!' count, plus the rating targets.

  datum: iterable of review dicts with 'review_text' and 'rating' keys.
  deg: highest power to include; values below 1 still produce the single
       first-power column.
  Returns (X, y): row i of the numpy array X is [c, c**2, ..., c**deg] where c
  is the number of '!' characters in review i; y is a list of the ratings.
  """
  top_power = max(deg, 1)
  rows = []
  ratings = []
  for review in datum:
    bangs = review['review_text'].count('!')
    rows.append([bangs ** power for power in range(1, top_power + 1)])
    ratings.append(review['rating'])
  return numpy.array(rows), ratings
# Build all five polynomial columns once; the cells below slice the first
# `deg` columns of X for each degree.
X,Y = feature(dataset,5)

In [245]:
from sklearn.metrics import mean_squared_error

# Question 3: for each polynomial degree 1..5, fit on the first `deg` columns
# of X and record the MSE on the same data.
mses = []
for deg in range(1, 6):
  x = X[:, :deg]  # same view as X.T[:deg].T
  reg = linear_model.LinearRegression()
  reg.fit(x, Y)
  y = reg.predict(x)
  mse = mean_squared_error(Y, y)  # squared error is symmetric in its two arguments
  mses.append(mse)

In [246]:
# Store plain Python floats. The MSE values may be numpy scalars, which are
# written as "np.float64(...)" by str(answers) on NumPy >= 2.
answers['Q3'] = [float(value) for value in mses]

In [247]:
# Format check: one MSE per polynomial degree 1..5.
assertFloatList(answers['Q3'], 5)# List of length 5

In [127]:
### Question 4

In [248]:
from sklearn.model_selection import train_test_split
# X and Y here are the degree-5 features and ratings built in the Question 3
# cell. Half of the samples are held out (test_size=0.5), with a fixed
# random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

In [249]:
# Question 4: for each polynomial degree 1..5, fit on the training half and
# record the MSE on the held-out half.
mses = []
for deg in range(1, 6):
  reg = linear_model.LinearRegression()
  reg.fit(X_train[:, :deg], y_train)  # first `deg` columns, same as .T[:deg].T
  y = reg.predict(X_test[:, :deg])
  mse = mean_squared_error(y_test, y)  # squared error is symmetric in its two arguments
  mses.append(mse)

In [250]:
# Store plain Python floats. The MSE values may be numpy scalars, which are
# written as "np.float64(...)" by str(answers) on NumPy >= 2.
answers['Q4'] = [float(value) for value in mses]

In [251]:
# Format check: one test-set MSE per polynomial degree 1..5.
assertFloatList(answers['Q4'], 5)

In [None]:
### Question 5

In [252]:
from sklearn.metrics import mean_absolute_error

# Question 5: keep the smallest held-out MAE seen across polynomial degrees 1..5.
mae = 10000  # starting value; any smaller MAE replaces it
for deg in range(1, 6):
  reg = linear_model.LinearRegression()
  reg.fit(X_train[:, :deg], y_train)  # first `deg` columns, same as .T[:deg].T
  y = reg.predict(X_test[:, :deg])
  mae_ = mean_absolute_error(y_test, y)  # absolute error is symmetric in its two arguments
  mae = mae_ if mae_ < mae else mae

In [253]:
# Store a plain Python float. The MAE may be a numpy scalar, which is written
# as "np.float64(...)" by str(answers) on NumPy >= 2.
answers['Q5'] = float(mae)

In [254]:
# Format check: the Question 5 answer must be convertible to a float.
assertFloat(answers['Q5'])

In [None]:
### Question 6

In [255]:
# Keep only the beer reviews whose line mentions 'user/gender'.
# The context manager guarantees the file is closed even if a line fails to parse.
# NOTE(review): eval() executes whatever code the line contains, so only run
# this on a trusted file. If every line is a plain Python literal,
# ast.literal_eval would be a safer parser - confirm before switching.
with open("drive/MyDrive/CSE 258/beer_50000.json") as f:
    dataset = []
    for l in f:
        if 'user/gender' in l:
            dataset.append(eval(l))

In [256]:
def feature(datum):
  """Build the Question 6 features and labels from beer reviews.

  datum: iterable of review dicts with 'review/text' and 'user/gender' keys.
  Returns (X, y): X is an (n, 1) numpy array holding the number of '!'
  characters in each review text; y is a list of booleans that are True
  when 'user/gender' equals 'Female'.
  """
  # One pass over the input so that generators are consumed only once.
  pairs = [
    ([review['review/text'].count('!')], review['user/gender'] == 'Female')
    for review in datum
  ]
  counts = [row for row, _ in pairs]
  is_female = [label for _, label in pairs]
  return numpy.array(counts).reshape(-1, 1), is_female

In [257]:
# Fit a logistic regression predicting the 'Female' label from the '!' count,
# then predict on the same data it was fitted on.
# Naming is flipped relative to the Question 1-5 cells: here `y` holds the
# true labels and `Y` holds the predictions.
X,y = feature(dataset)
reg = linear_model.LogisticRegression().fit(X,y)
Y = reg.predict(X)

In [260]:
# Confusion-matrix counts for the Question 6 classifier.
# `Y` holds the predictions and `y` the true labels.
TP = TN = FP = FN = 0
for predicted, actual in zip(Y, y):
  agrees = predicted == actual
  if actual:
    if agrees:
      TP += 1
    else:
      FN += 1
  elif agrees:
    TN += 1
  else:
    FP += 1

# Balanced error rate: mean of the false-negative rate and the false-positive rate.
false_negative_rate = FN / (TP + FN)
false_positive_rate = FP / (FP + TN)
BER = (false_negative_rate + false_positive_rate) / 2

In [261]:
# Confusion-matrix counts followed by the balanced error rate.
answers['Q6'] = [TP, TN, FP, FN, BER]

In [262]:
# Format check: five float-convertible values (TP, TN, FP, FN, BER).
assertFloatList(answers['Q6'], 5)

In [64]:
### Question 7

In [None]:
# Same features and labels as Question 6, refitted with class_weight='balanced'.
# `y` holds the true labels and `Y` the predictions; the confusion-matrix cell
# below reads both, so this cell must be run before it.
X,y = feature(dataset)
reg = linear_model.LogisticRegression(class_weight='balanced').fit(X,y)
Y = reg.predict(X)

In [263]:
# Confusion-matrix counts for the Question 7 classifier.
# `Y` holds the predictions and `y` the true labels.
TP = TN = FP = FN = 0
for predicted, actual in zip(Y, y):
  agrees = predicted == actual
  if actual:
    if agrees:
      TP += 1
    else:
      FN += 1
  elif agrees:
    TN += 1
  else:
    FP += 1

# Balanced error rate: mean of the false-negative rate and the false-positive rate.
false_negative_rate = FN / (TP + FN)
false_positive_rate = FP / (FP + TN)
BER = (false_negative_rate + false_positive_rate) / 2

In [265]:
# Confusion-matrix counts followed by the balanced error rate.
answers["Q7"] = [TP, TN, FP, FN, BER]

In [266]:
# Format check: five float-convertible values (TP, TN, FP, FN, BER).
assertFloatList(answers['Q7'], 5)

In [None]:
### Question 8

In [267]:
# Question 8: precision@K for the classifier currently bound to `reg`
# (the balanced model, provided the Question 7 cell was run).
#
# Precision@K is the fraction of true positives among the K samples the model
# is MOST confident are positive, so the samples must first be ranked by the
# classifier's score. Walking the dataset in file order and counting hard
# predictions does not measure this.
K = [1,10,100,1000,10000]

# Signed distance to the decision boundary; larger means more confidently positive.
scores = reg.decision_function(X)

# Stable sort, highest score first; tied scores keep their dataset order.
ranked = sorted(zip(scores, y), key=lambda pair: pair[0], reverse=True)
ranked_labels = [label for _, label in ranked]

precisionList = []
for k in K:
  top_k = ranked_labels[:k]  # shorter than k only if the dataset has fewer than k rows
  # Labels are booleans, so their sum is the number of true positives retrieved.
  precisionList.append(sum(top_k) / len(top_k) if top_k else 0.0)

In [269]:
# Precision values computed in the previous cell for the cutoffs listed in K.
answers['Q8'] = precisionList

In [270]:
# Format check: five values are expected because K lists five cutoffs.
assertFloatList(answers['Q8'], 5) #List of five floats

In [271]:
# Write the answers dictionary to a file as text. The context manager closes
# (and flushes) the file even if the write raises.
with open("drive/MyDrive/CSE 258/answers_hw1.txt", 'w') as f:
    f.write(str(answers) + '\n')