In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,scale
# import cPickle

In [4]:
# Experiment configuration.
# Fraction of the shuffled rows that preprocessing() assigns to the training split.
TRAINING_RATE = 0.8
# Complement of TRAINING_RATE; no other code visible in this notebook reads it.
TESTING_RATE = 1 - TRAINING_RATE
# Fraction of test-set cells that get blanked out (default `mr` of TestMissingGenerate);
# it also scales the query budget of the active-learning loop.
MISSING_RATE = 0.5
# Fraction of the blanked cells that the query loop is allowed to reveal.
QUERY_RATE = 0.2
# Seed NumPy's global RNG, which drives the row shuffle, the choice of blanked cells
# and RandomQuery.
np.random.seed(10)

In [5]:
# Load the white-wine quality table; the file uses ';' as the field separator and the
# path is relative to the notebook's working directory.
data = pd.read_csv("../dat/wine_quality/winequality-white.csv",sep=';')

In [6]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [7]:
# Count how many wines carry each quality score (the classes are heavily imbalanced).
data['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [8]:
def preprocessing(d):
    """Scale the features, re-base the labels and split into train/test parts.

    Every column except the last one is treated as a feature and rescaled to
    [0, 1] with ``MinMaxScaler``. The ``quality`` column is converted to
    integers and shifted down by 3. The rows are then shuffled with NumPy's
    global RNG and the first ``TRAINING_RATE`` fraction of them becomes the
    training split.

    The work is done on a copy, so the caller's frame is left untouched and
    the calling cell can be re-run without shifting ``quality`` a second time.

    Parameters
    ----------
    d : pandas.DataFrame
        Table with a ``quality`` column. The last column is returned as the
        label, so ``quality`` is assumed to be the last column -- TODO confirm
        for other inputs.

    Returns
    -------
    tn_X, tn_Y, tt_X, tt_Y
        Training features, training labels, test features and test labels.
        They keep the shuffled original row labels as their index.
    """
    d = d.copy()
    feature_cols = d.columns[:-1]

    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the min/max used for the training features. Fitting
    # on the training rows only would avoid that, but could push test values
    # outside [0, 1]; the original behaviour is kept here.
    min_max_scaler = MinMaxScaler()
    d[feature_cols] = min_max_scaler.fit_transform(d[feature_cols])

    # Shift the scores so that 3, the lowest score in this dataset, becomes class 0.
    d['quality'] = d['quality'].astype('int64') - 3

    # Shuffle the rows, then cut at the training fraction.
    d = d.iloc[np.random.permutation(len(d))]
    t = int(len(d) * TRAINING_RATE)
    tn_data = d.iloc[:t, :]
    tt_data = d.iloc[t:, :]

    tn_X = tn_data.iloc[:, :-1]
    tn_Y = tn_data.iloc[:, -1]
    tt_X = tt_data.iloc[:, :-1]
    tt_Y = tt_data.iloc[:, -1]

    return tn_X, tn_Y, tt_X, tt_Y

In [9]:
def TestMissingGenerate(tt_data,mr=MISSING_RATE):
    """Blank out a random fraction of the cells of ``tt_data``.

    Distinct (row, column) positions are drawn with NumPy's global RNG until
    at least ``rows * cols * mr`` of them have been collected; those cells are
    then set to NaN **in place**.

    Parameters
    ----------
    tt_data : pandas.DataFrame
        Frame to corrupt. It is modified in place, so take a copy first if the
        clean values are still needed.
    mr : float, optional
        Fraction of cells to blank. Values <= 0 blank nothing.

    Returns
    -------
    tt_data : pandas.DataFrame
        The same frame that was passed in, now containing the NaNs.
    missing_entry : list of (row, col) tuples
        Positional coordinates of the blanked cells, in the order drawn.

    Raises
    ------
    ValueError
        If more cells are requested than the frame contains; the sampling
        loop could never finish in that case.
    """
    n_rows, n_cols = tt_data.shape
    target = n_rows * n_cols * mr
    if target > n_rows * n_cols:
        raise ValueError(
            "mr=%r asks for more missing cells than the frame has" % (mr,))

    # Build the index arrays once instead of converting a range on every draw.
    row, col = np.arange(n_rows), np.arange(n_cols)

    missing_entry = []
    seen = set()  # constant-time duplicate check; the list keeps the draw order
    while len(missing_entry) < target:
        r = np.random.choice(row)
        c = np.random.choice(col)
        if (r, c) in seen:
            continue
        seen.add((r, c))
        missing_entry.append((r, c))

    # Write by position so the result does not depend on the frame's index labels.
    for r, c in missing_entry:
        tt_data.iat[r, c] = np.nan
    return tt_data, missing_entry

In [10]:
def Imputation(d, missing, method="zero", n_components=3):
    """Return ``d`` as a NumPy array with the missing cells filled in.

    NaNs are first replaced by 0.0. With the default ``method="zero"`` the
    cells listed in ``missing`` are then set to 0.0 as well, and no matrix
    factorisation is run. With ``method="nmf"`` the zero-filled matrix is
    factorised with NMF and the listed cells take their value from the
    reconstruction ``W.dot(H)``.

    Parameters
    ----------
    d : pandas.DataFrame
        Data containing NaNs. It is not modified.
    missing : sequence of (row, col)
        Positional coordinates of the cells to impute.
    method : {"zero", "nmf"}, optional
        Imputation strategy; "zero" is the default.
    n_components : int, optional
        Rank of the factorisation; only used when ``method="nmf"``.

    Returns
    -------
    numpy.ndarray
        A new, writable array with the same shape as ``d``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ("zero", "nmf"):
        raise ValueError("unknown imputation method: %r" % (method,))

    # fillna builds a new frame; the extra copy guarantees a private, writable
    # array, because callers overwrite cells of the result later on.
    filled = d.fillna(0.0).values.copy()
    if len(missing) == 0:
        return filled

    rows = [m[0] for m in missing]
    cols = [m[1] for m in missing]

    if method == "zero":
        filled[rows, cols] = 0.0
        return filled

    # Fit once: fit_transform yields W and leaves the matching H in components_.
    model = NMF(n_components=n_components, init='random', random_state=0)
    W = model.fit_transform(filled)
    H = model.components_
    d_prime = W.dot(H)
    filled[rows, cols] = d_prime[rows, cols]
    return filled

In [11]:
# Scale, shuffle and split the raw table into training and test features/labels.
tn_X,tn_Y,tt_X,tt_Y = preprocessing(data)

In [12]:
# Preview the scaled training features; the index shows the shuffled original row labels.
tn_X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
4731,0.144231,0.22549,0.228916,0.15184,0.065282,0.1777,0.303944,0.117602,0.563636,0.27907,0.596774
937,0.221154,0.27451,0.349398,0.220859,0.103858,0.139373,0.24594,0.206092,0.390909,0.337209,0.16129
1217,0.403846,0.519608,0.228916,0.17638,0.866469,0.076655,0.489559,0.235011,0.2,0.302326,0.193548
3296,0.269231,0.196078,0.253012,0.116564,0.103858,0.202091,0.433875,0.164064,0.381818,0.302326,0.225806
4524,0.269231,0.078431,0.150602,0.141104,0.118694,0.200348,0.296984,0.152111,0.4,0.186047,0.322581


In [13]:
# Label distribution of the training split after the scores were shifted down by 3.
tn_Y.value_counts()

3    1759
2    1155
4     702
5     152
1     132
0      13
6       5
Name: quality, dtype: int64

In [14]:
#tt_X.head()

In [15]:
#set(tn_X.index).intersection(set(tt_X.index))

In [16]:
# Replace the shuffled row labels with 0..n-1 so that a row's label equals its position;
# TestMissingGenerate identifies cells by integer row numbers drawn from range(n_rows).
tn_X = tn_X.reset_index(drop=True)
tt_X = tt_X.reset_index(drop=True)
# Keep an untouched copy of the test features before any cell is blanked; the query
# loop reads the revealed values from it.
tt_oracle = tt_X.copy()

In [17]:
#tt_oracle.head()

In [18]:
# Blank out a MISSING_RATE fraction of the test cells and remember their (row, col) positions.
tt_X, missing_entry = TestMissingGenerate(tt_X)

In [19]:
# Build the NumPy test matrix the classifier will see; every blanked cell holds 0.0.
tt_imp = Imputation(tt_X,missing_entry)

In [20]:
# Preview the imputed test matrix (wrapped in a frame for display only).
pd.DataFrame(tt_imp).head(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.0,0.0,0.0,0.080119,0.0,0.361949,0.086948,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.094077,0.0,0.0,0.354545,0.0,0.0
2,0.230769,0.0,0.0,0.039877,0.0,0.0,0.0,0.0,0.0,0.546512,0.0
3,0.211538,0.0,0.246988,0.0,0.0,0.184669,0.324826,0.0,0.0,0.430233,0.0
4,0.0,0.0,0.192771,0.124233,0.059347,0.101045,0.0,0.083864,0.254545,0.139535,0.693548
5,0.0,0.0,0.168675,0.084356,0.059347,0.034843,0.194896,0.066127,0.427273,0.0,0.774194
6,0.0,0.0,0.0,0.0,0.0,0.0,0.266821,0.0,0.0,0.0,0.403226
7,0.0,0.0,0.216867,0.023006,0.074184,0.0,0.0,0.072103,0.0,0.0,0.629032
8,0.259615,0.0,0.0,0.009202,0.094955,0.097561,0.0,0.104685,0.245455,0.0,0.225806
9,0.307692,0.127451,0.0,0.124233,0.106825,0.121951,0.0,0.159823,0.481818,0.372093,0.0


In [21]:
# Preview the untouched oracle copy of the test features for comparison.
tt_oracle.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.288462,0.147059,0.289157,0.013804,0.080119,0.114983,0.361949,0.086948,0.418182,0.267442,0.532258
1,0.278846,0.098039,0.168675,0.147239,0.089021,0.094077,0.24594,0.146135,0.354545,0.267442,0.467742
2,0.230769,0.27451,0.162651,0.039877,0.068249,0.038328,0.148492,0.044534,0.163636,0.546512,0.741935
3,0.211538,0.205882,0.246988,0.156442,0.115727,0.184669,0.324826,0.127048,0.336364,0.430233,0.478495
4,0.288462,0.235294,0.192771,0.124233,0.059347,0.101045,0.222738,0.083864,0.254545,0.139535,0.693548
5,0.269231,0.27451,0.168675,0.084356,0.059347,0.034843,0.194896,0.066127,0.427273,0.05814,0.774194
6,0.221154,0.166667,0.108434,0.15184,0.118694,0.135889,0.266821,0.177174,0.381818,0.151163,0.403226
7,0.336538,0.166667,0.216867,0.023006,0.074184,0.097561,0.389791,0.072103,0.481818,0.209302,0.629032
8,0.259615,0.156863,0.114458,0.009202,0.094955,0.097561,0.236659,0.104685,0.245455,0.27907,0.225806
9,0.307692,0.127451,0.168675,0.124233,0.106825,0.121951,0.4942,0.159823,0.481818,0.372093,0.387097


In [22]:
# Fit a logistic-regression classifier (C=1) on the fully observed training split.
clf = LogisticRegression(C=1)
clf.fit(tn_X,tn_Y)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
# Training accuracy: share of training rows whose predicted class equals the label.
np.mean(clf.predict(tn_X) == tn_Y)

0.52782031648800409

In [24]:
# Test accuracy on the imputed matrix, before any cell has been queried.
np.mean(clf.predict(tt_imp) == tt_Y)

0.42244897959183675

In [25]:
def RandomQuery(m):
    """Return one element of ``m`` chosen uniformly at random.

    The position is drawn from NumPy's global RNG; ``m`` itself is not
    modified.
    """
    position = np.random.randint(len(m))
    return m[position]

In [None]:
def NaiveHeuristic(model=clf,missing=missing_entry,data=tt_imp): # Active learning
    """Pick the next missing cell to query.

    Rows are visited from least to most certain, where certainty is the gap
    between the two largest predicted class probabilities. Within a row, the
    features are tried in decreasing order of the absolute coefficient the
    model holds for the row's predicted class. The first (row, feature) pair
    that is listed in ``missing`` is returned.

    Parameters
    ----------
    model : fitted linear classifier
        Must provide ``predict``, ``predict_proba``, ``coef_`` and ``classes_``.
    missing : sequence of (row, col)
        Cells that are still unknown. It is not modified.
    data : numpy.ndarray
        Current (imputed) feature matrix.

    Returns
    -------
    tuple or None
        ``(row, col)`` of the cell to query, or ``None`` when no listed cell
        lies inside ``data``.
    """
    predict_prob = model.predict_proba(data)
    predict = model.predict(data)
    coef = model.coef_

    # Margin between the best and the second-best class probability per row.
    ordered = np.sort(predict_prob, axis=1)
    certainty = ordered[:, -1] - ordered[:, -2]
    q_r = np.argsort(certainty)

    # Translate predicted labels into rows of coef_. A binary model stores a
    # single coefficient row; otherwise row i belongs to classes_[i], which is
    # not necessarily the label value itself.
    if coef.shape[0] == 1:
        coef_row = np.zeros(len(predict), dtype=int)
    else:
        coef_row = np.searchsorted(model.classes_, predict)

    # One set build per call replaces a linear list scan per candidate cell.
    missing_cells = set(map(tuple, missing))

    for r in q_r:
        weight = np.argsort(np.absolute(coef[coef_row[r]]))[::-1]
        for w in weight:
            if (r, w) in missing_cells:
                return (r, w)
    return None

In [None]:
# Active-learning loop: reveal one missing test cell per iteration until the
# budget is spent, recording test accuracy every 100 queries.
# NOTE: this cell mutates `missing_entry` and `tt_imp` in place, so re-running it
# continues from the current state instead of starting over.
query_num = 0
query_budget = tt_imp.shape[0] * tt_imp.shape[1] * MISSING_RATE * QUERY_RATE

res_naive = []
while query_num < query_budget:

    # Baseline alternative: query = RandomQuery(missing_entry)
    query = NaiveHeuristic()
    if query is None:
        # Nothing left to reveal; stop instead of failing in list.remove(None).
        break
    missing_entry.remove(query)

    # Copy the true value from the oracle into the matrix the classifier sees.
    r, c = query
    tt_imp[r, c] = tt_oracle.iat[r, c]

    if query_num % 100 ==0:
        tmp_r = sum(clf.predict(tt_imp) == tt_Y)/ float(len(tt_imp))
        print(tmp_r)
        res_naive.append(tmp_r)
    query_num += 1

0.421428571429
0.424489795918
0.416326530612
0.419387755102
0.432653061224
0.438775510204


In [28]:
0.339795918367
0.334693877551
0.338775510204
0.338775510204
0.338775510204
0.336734693878
0.340816326531
0.34387755102
0.351020408163
0.35
0.345918367347
0.345918367347
0.348979591837
0.351020408163
0.354081632653
0.357142857143
0.359183673469
0.363265306122

0.363265306122

In [29]:
0.464285714286
0.463265306122
0.466326530612
0.469387755102
0.473469387755
0.478571428571
0.477551020408

0.477551020408

In [30]:
# cPickle.dump(res_naive,open("naive_30.pkl",'w'))

NameError: name 'cPickle' is not defined

In [31]:
#0.8 missing rate 0.3 query rate 
#of Query0 Query entry: 
0.339795918367
#of Query100 Query entry: 
0.334693877551
#of Query200 Query entry: 
0.338775510204
#of Query300 Query entry: 
0.338775510204
#of Query400 Query entry: 
0.338775510204
#of Query500 Query entry: 
0.336734693878
#of Query600 Query entry: 
0.340816326531
#of Query700 Query entry: 
0.34387755102
#of Query800 Query entry: 
0.351020408163
#of Query900 Query entry: 
0.35
#of Query1000 Query entry: 
0.345918367347
#of Query1100 Query entry: 
0.345918367347
#of Query1200 Query entry: 
0.348979591837
#of Query1300 Query entry: 
0.351020408163
#of Query1400 Query entry: 
0.354081632653
#of Query1500 Query entry: 
0.357142857143
#of Query1600 Query entry: 
0.359183673469
#of Query1700 Query entry: 
0.363265306122

0.363265306122