In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,scale
import cPickle

In [3]:
# Fraction of the shuffled rows that preprocessing() assigns to the training split.
TRAINING_RATE = 0.8
# Complement of TRAINING_RATE; not referenced by any other cell visible in this notebook.
TESTING_RATE = 1 - TRAINING_RATE
# Fraction of test-feature cells blanked out (default `mr` of TestMissingGenerate);
# also enters the query budget computed before the query loop.
MISSING_RATE = 0.5
# Fraction of the blanked cells that the query loop may reveal from the oracle copy.
QUERY_RATE = 0.2
# Seed NumPy's global RNG, which drives the shuffle, the missing-cell sampling and RandomQuery.
np.random.seed(10)

In [4]:
# Load the wine-quality table; the file is semicolon-delimited, hence sep=';'.
data = pd.read_csv("../dat/winequality.csv",sep=';')

In [5]:
# Peek at the first five rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [6]:
# Class balance of the target column; the counts below show scores 5 and 6 dominating.
data['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [None]:
def preprocessing(d):
    """Scale the features, re-base the labels, shuffle, and split train/test.

    Parameters
    ----------
    d : pandas.DataFrame
        Table whose last column is the label column ``'quality'``; every
        other column is treated as a feature.

    Returns
    -------
    tn_X, tn_Y, tt_X, tt_Y
        Training features/labels and test features/labels. The first
        ``int(len(d) * TRAINING_RATE)`` shuffled rows form the training split.

    Notes
    -----
    * The input frame is left untouched. The previous version rescaled the
      caller's frame in place, so re-running the cell subtracted 3 from
      ``quality`` a second time.
    * Positional access uses ``.iloc``; ``.ix`` was removed from pandas.
    * NOTE(review): the min-max scaler is fitted on all rows (train and test
      together), as before. Fit on the training rows only if leakage matters.
    """
    d = d.copy()

    feature_cols = list(d.columns[:-1])
    d[feature_cols] = MinMaxScaler().fit_transform(d[feature_cols])
    # Shift the labels by 3 so the lowest score seen in the data (3) maps to 0.
    d['quality'] = d['quality'].apply(lambda x: int(x) - 3)

    # Shuffle with NumPy's global RNG (seeded in the configuration cell).
    d = d.iloc[np.random.permutation(len(d))]
    t = int(len(d) * TRAINING_RATE)
    tn_data = d.iloc[:t, :]
    tt_data = d.iloc[t:, :]

    tn_X = tn_data.iloc[:, :-1]
    tn_Y = tn_data.iloc[:, -1]
    tt_X = tt_data.iloc[:, :-1]
    tt_Y = tt_data.iloc[:, -1]

    return tn_X, tn_Y, tt_X, tt_Y

In [None]:
def TestMissingGenerate(tt_data,mr=MISSING_RATE):
    """Blank out a random fraction of the cells of ``tt_data``.

    Distinct ``(row, col)`` positions are drawn uniformly with NumPy's global
    RNG until at least ``rows * cols * mr`` of them have been collected; those
    cells are then set to NaN.

    Parameters
    ----------
    tt_data : pandas.DataFrame
        Frame to punch holes into. It is modified in place.
    mr : float
        Fraction of cells to blank, between 0 and 1 inclusive.

    Returns
    -------
    (tt_data, missing_entry)
        The same frame object and the list of blanked ``(row, col)``
        positions, in the order they were drawn.

    Raises
    ------
    ValueError
        If ``mr`` is outside [0, 1]. A rate above 1 can never be satisfied
        with distinct cells and previously made the loop spin forever.
    """
    if not 0 <= mr <= 1:
        raise ValueError("mr must be between 0 and 1, got %r" % (mr,))

    n_rows, n_cols = tt_data.shape
    target = n_rows * n_cols * mr

    missing_entry = []
    # Set used only for O(1) duplicate checks; the list keeps the draw order.
    seen = set()
    while len(missing_entry) < target:
        # np.random.choice(n) samples from np.arange(n).
        r = int(np.random.choice(n_rows))
        c = int(np.random.choice(n_cols))
        if (r, c) in seen:
            continue
        seen.add((r, c))
        missing_entry.append((r, c))

    # Write by position: the recorded entries are positions, so this stays
    # correct even when the frame's index is not 0..n-1.
    for r, c in missing_entry:
        tt_data.iloc[r, c] = np.nan
    return tt_data, missing_entry

In [None]:
def Imputation(d,missing,use_nmf=False,n_components=3):
    """Fill the missing cells of ``d`` and return the result as a NumPy array.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing NaN at the missing cells. It is not modified.
    missing : iterable of (row, col)
        Positions of the cells to impute.
    use_nmf : bool, optional
        ``False`` (default) reproduces the previous behaviour: NaNs become 0.0
        and every cell listed in ``missing`` is set to 0.0. ``True`` replaces
        the listed cells with the rank-``n_components`` NMF reconstruction of
        the zero-filled matrix.
    n_components : int, optional
        Number of NMF components; only used when ``use_nmf`` is true.

    Returns
    -------
    numpy.ndarray
        A fresh array with the same shape as ``d``.

    Notes
    -----
    The earlier version always fitted NMF (twice) and then discarded the
    reconstruction. The factorisation is now computed once, and only when it
    is actually used.
    """
    filled = d.fillna(0.0)
    # Copy so the returned array is writable and never aliases the frame.
    values = filled.values.copy()

    if use_nmf:
        model = NMF(n_components=n_components, init='random', random_state=0)
        W = model.fit_transform(filled)
        H = model.components_
        d_prime = W.dot(H)
        for r, c in missing:
            values[r, c] = d_prime[r, c]
    else:
        for r, c in missing:
            values[r, c] = 0.0
    return values

In [None]:
# Scale, shuffle and split the table into training/test features and labels.
tn_X,tn_Y,tt_X,tt_Y = preprocessing(data)

In [None]:
# Inspect the first rows of the scaled training features.
tn_X.head()

In [None]:
# Label distribution in the training split (labels are the quality score minus 3).
tn_Y.value_counts()

In [None]:
#tt_X.head()

In [None]:
#set(tn_X.index).intersection(set(tt_X.index))

In [None]:
# Renumber both feature splits 0..n-1 so a row's index label equals its
# position; this keeps (row, col) entries valid whether rows are looked up by
# label or by position.
tn_X, tt_X = (split.reset_index(drop=True) for split in (tn_X, tt_X))
# Untouched copy of the test features, kept as the ground truth for queries.
tt_oracle = tt_X.copy()

In [None]:
#tt_oracle.head()

In [None]:
# Blank out a MISSING_RATE share of the test cells (tt_X itself is modified)
# and keep the list of blanked (row, col) positions.
tt_X, missing_entry = TestMissingGenerate(tt_X)

In [None]:
# Fill the blanked cells; note the result is a NumPy array, not a DataFrame.
tt_imp = Imputation(tt_X,missing_entry)

In [None]:
# First 15 rows of the imputed test matrix, wrapped in a DataFrame for tabular display.
pd.DataFrame(tt_imp).head(15)

In [None]:
# First 10 rows of the untouched oracle copy, presumably for comparison with the imputed rows above.
tt_oracle.head(10)

In [None]:
# Train the classifier on the complete (no missing cells) training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = LogisticRegression(C=1).fit(tn_X, tn_Y)
clf

In [None]:
# Training accuracy: share of training rows whose predicted label equals the true label.
(clf.predict(tn_X) == tn_Y).sum() / float(len(tn_Y))

In [None]:
# Baseline test accuracy on the imputed test matrix, before any cell is queried.
(clf.predict(tt_imp) == tt_Y).sum() / float(len(tt_Y))

In [None]:
def RandomQuery(m):
    """Return one element of the sequence ``m`` chosen uniformly at random.

    The position is drawn with NumPy's global RNG; ``m`` is not modified.
    """
    position = np.random.randint(len(m))
    return m[position]

In [None]:
def NaiveHeuristic(model=clf,missing=missing_entry,data=tt_imp):
    """Choose the next missing cell to query.

    Rows are visited from least to most certain, where certainty is the gap
    between the two largest predicted class probabilities. Within a row,
    features are visited by decreasing absolute coefficient of the row's
    predicted class. The first ``(row, col)`` found in ``missing`` is returned.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``, ``predict_proba``,
        ``coef_`` and ``classes_``.
    missing : iterable of (row, col)
        Cells that are still unknown.
    data : 2-D array
        Current (imputed) feature matrix.

    Returns
    -------
    tuple or None
        The selected ``(row, col)``, or ``None`` when no cell of ``missing``
        can be matched (for example when ``missing`` is empty).

    Notes
    -----
    The default arguments are bound when this ``def`` cell runs. If ``clf``,
    ``missing_entry`` or ``tt_imp`` are recreated afterwards, pass them
    explicitly, otherwise the stale objects are used.
    """
    predict_prob = model.predict_proba(data)
    predict = model.predict(data)
    coef = model.coef_

    # Margin between the best and second-best class probability for each row.
    ranked = np.sort(predict_prob, axis=1)
    certainty = ranked[:, -1] - ranked[:, -2]
    q_r = np.argsort(certainty)

    # Rows of coef_ follow the order of model.classes_, so translate the
    # predicted label to its row position instead of using the label value.
    class_row = {label: i for i, label in enumerate(model.classes_)}
    # Binary models expose a single coefficient row shared by both classes.
    single_row = coef.shape[0] == 1

    # One O(n) conversion here replaces an O(n) list scan per candidate cell.
    missing_set = set(missing)

    for r in q_r:
        row = 0 if single_row else class_row[predict[r]]
        weight = np.argsort(np.absolute(coef[row]))[::-1]
        for w in weight:
            if (r, w) in missing_set:
                return (r, w)
    return None

In [None]:
# Active-querying loop: reveal up to QUERY_RATE of the blanked cells, one per
# iteration, and record the test accuracy every 100 queries.
query_num = 0
query_budget = tt_imp.shape[0] * tt_imp.shape[1] * MISSING_RATE * QUERY_RATE

res_naive = []
while query_num < query_budget:

    #Random query
    #query = RandomQuery(missing_entry)
    # Pass the live objects explicitly: the heuristic's default arguments were
    # bound when its cell ran and go stale if earlier cells are re-executed.
    query = NaiveHeuristic(clf, missing_entry, tt_imp)
    if query is None:
        # Nothing left to reveal; stop instead of failing in remove(None).
        break
    missing_entry.remove(query)

    # Reveal the true value; (row, col) entries are positions, hence .iloc.
    tt_imp[query[0], query[1]] = tt_oracle.iloc[query[0], query[1]]

    if query_num % 100 == 0:
        tmp_r = sum(clf.predict(tt_imp) == tt_Y) / float(len(tt_imp))
        print(tmp_r)
        res_naive.append(tmp_r)
    query_num += 1

In [None]:
0.339795918367
0.334693877551
0.338775510204
0.338775510204
0.338775510204
0.336734693878
0.340816326531
0.34387755102
0.351020408163
0.35
0.345918367347
0.345918367347
0.348979591837
0.351020408163
0.354081632653
0.357142857143
0.359183673469
0.363265306122

In [None]:
0.464285714286
0.463265306122
0.466326530612
0.469387755102
0.473469387755
0.478571428571
0.477551020408

In [None]:
# Persist the recorded accuracy curve. Pickle data is written in binary mode,
# and the `with` block closes (and so flushes) the file even if dump() raises.
with open("naive_30.pkl", 'wb') as out_file:
    cPickle.dump(res_naive, out_file)

In [None]:
#0.8 missing rate 0.3 query rate 
#of Query0 Query entry: 
0.339795918367
#of Query100 Query entry: 
0.334693877551
#of Query200 Query entry: 
0.338775510204
#of Query300 Query entry: 
0.338775510204
#of Query400 Query entry: 
0.338775510204
#of Query500 Query entry: 
0.336734693878
#of Query600 Query entry: 
0.340816326531
#of Query700 Query entry: 
0.34387755102
#of Query800 Query entry: 
0.351020408163
#of Query900 Query entry: 
0.35
#of Query1000 Query entry: 
0.345918367347
#of Query1100 Query entry: 
0.345918367347
#of Query1200 Query entry: 
0.348979591837
#of Query1300 Query entry: 
0.351020408163
#of Query1400 Query entry: 
0.354081632653
#of Query1500 Query entry: 
0.357142857143
#of Query1600 Query entry: 
0.359183673469
#of Query1700 Query entry: 
0.363265306122