In [51]:
import numpy as np
import urllib
import networkx as nx
import matplotlib.pyplot as plt
from homework2_starter import *

# Classifier evaluation

In [11]:
def split_data(X, Y, val_ratio, test_ratio, shuffle=False):
    m = X.shape[0]
    
    m_train = int(m * (1 - val_ratio - test_ratio))
    m_val = int(m * val_ratio) + m_train
    m_test = int(m * test_ratio) + m_val
    
    if shuffle:
        permutation = np.random.permutation(m)
        X = X[permutation, :]
        Y = Y[permutation, :]
        
    return X[:m_train, :], Y[:m_train, :], X[m_val:, :], Y[m_val:, :], X[m_test:, :], Y[m_test:, :]

def evaluation(theta, X, y):
    scores = [inner(theta,x) for x in X]
    predictions = [s > 0 for s in scores]
    correct = [(a==b) for (a,b) in zip(predictions,y)]
    acc = sum(correct) * 1.0 / len(correct)
    return acc

In [6]:
data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))
X = [feature(d) for d in data]
y = [d['beer/ABV'] >= 6.5 for d in data]

X = np.array(X)
y = np.array(y).reshape(-1, 1)

In [8]:
X_train, y_train, X_val, y_val, X_test, y_test = split_data(X, y, 1/3, 1/3, True)

In [15]:
lam = 1.
theta = train(lam)
acc = evaluation(theta, X_val, y_val)
print("Validation set accuracy=%f" % acc)
acc = evaluation(theta, X_test, y_test)
print("Test set accuracy=%f" % acc)

Validation set accuracy=0.719642
Test set accuracy=0.500000


In [16]:
lambdas = [0, 0.01, 0.1, 1, 100]
best_lam = None
best_acc = {}

for lam in lambdas:
    theta = train(lam)
    val_acc = evaluation(theta, X_val, y_val)
    if best_lam is None or best_acc['val'] < val_acc:
        best_lam = lam
        best_acc['train'] = evaluation(theta, X_train, y_train)
        best_acc['val'] = val_acc
        best_acc['test'] = evaluation(theta, X_test, y_test)
        
print('Best lambda = %f' % best_lam)
print('Train acc=%f \tValidation acc=%f \tTest acc=%f' % (best_acc['train'], best_acc['val'], best_acc['test']))

Best lambda = 0.000000
Train acc=0.719129 	Validation acc=0.719882 	Test acc=0.500000


# Community Detection

In [16]:
edges = set()
nodes = set()
for edge in urllib.request.urlopen("http://jmcauley.ucsd.edu/cse255/data/facebook/egonet.txt"):
    x,y = edge.split()
    x,y = int(x),int(y)
    edges.add((x,y))
    edges.add((y,x))
    nodes.add(x)
    nodes.add(y)

In [53]:
G = nx.Graph()
for e in edges:
    G.add_edge(e[0],e[1])

In [17]:
def dfs(node):
    if node in visited:
        return
    comp.add(node)
    visited.add(node)
    for edge in edges:
        if node == edge[0]:
            dfs(edge[1])

In [31]:
visited = set()
comps = []
comps_len = []

for node in nodes:
    comp = set()
    dfs(node)
    if len(comp) > 0:
        comps.append(comp)
        comps_len.append(len(comp))
        
print('largest connected component contains %d nodes' % max(comps_len))

largest connected component contains 40 nodes


In [39]:
comp = comps[comps_len.index(max(comps_len))]

In [50]:
comp = list(comp)
comp = sorted(comp)
s1, s2 = comp[:len(comp)//2], comp[len(comp)//2:]

cost = 0
for edge in edges:
    x, y = edge
    if x in s1 and y in s2:
        cost += 1
        
cost

92

In [58]:
node_cost = []
def greedy(s1, s2):
    for node in nodes:
        if node not in s1 or node not in s2: continue
        t1 = set(s1)
        t2 = set(s2)
        if node in s1:
            t1.remove(node)
            t2.add(node)
        else:
            t1.add(node)
            t2.remove(node)
            
        node_cost.append((node, cut_cost(t1, t2)))
            

In [57]:
def cut_cost(s1, s2):
    pass

In [59]:
def modularity():
    pass