In [29]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,scale
import cPickle

In [30]:
# Experiment configuration.
# Fraction of the shuffled rows that preprocessing() puts in the training split.
TRAINING_RATE = 0.5
# Complement of TRAINING_RATE, i.e. the share of rows left for testing.
TESTING_RATE = 1 - TRAINING_RATE
# Default fraction of test-matrix cells that TestMissingGenerate() blanks out.
MISSING_RATE = 0.3
# Scales the number of missing cells to obtain the query budget of the
# active-query loop (rows * cols * MISSING_RATE * QUERY_RATE).
QUERY_RATE = 0.2
# Seed NumPy's global RNG, which drives the row shuffle and the sampling of
# missing cells.
np.random.seed(10)

In [31]:
# Load the wine-quality table; the file is semicolon-separated.
data = pd.read_csv("../dat/winequality.csv",sep=';')

In [32]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [33]:
data['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [34]:
def preprocessing(d):
    """Scale the features, re-base the label, shuffle the rows and split them.

    Every column except the last is min-max scaled with ``MinMaxScaler``.
    The ``quality`` column is cast to ``int`` and shifted down by 3. The rows
    are then shuffled with NumPy's global RNG and the first
    ``int(len(d) * TRAINING_RATE)`` rows become the training split.

    Parameters
    ----------
    d : pandas.DataFrame
        Input table whose last column is ``quality``. It is not modified.

    Returns
    -------
    tn_X, tn_Y, tt_X, tt_Y
        Training features, training labels, test features and test labels.
        Features are all columns but the last; labels are the last column.
        The returned objects keep the shuffled original row labels.
    """
    # Work on a copy: the caller's frame stays intact, so re-running the cell
    # does not subtract 3 from ``quality`` a second time.
    d = d.copy()

    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the min/max applied to the training features.
    feature_cols = d.columns[:-1]
    d[feature_cols] = MinMaxScaler().fit_transform(d[feature_cols])

    # Re-base the label (previously a per-element ``int(x) - 3``).
    d['quality'] = d['quality'].astype(int) - 3

    # Shuffle, then cut at the training fraction.
    d = d.iloc[np.random.permutation(len(d))]
    t = int(len(d) * TRAINING_RATE)
    tn_data = d.iloc[:t, :]
    tt_data = d.iloc[t:, :]

    # ``.iloc`` replaces the removed ``.ix`` indexer; the slicing is positional.
    tn_X = tn_data.iloc[:, :-1]
    tn_Y = tn_data.iloc[:, -1]
    tt_X = tt_data.iloc[:, :-1]
    tt_Y = tt_data.iloc[:, -1]

    return tn_X, tn_Y, tt_X, tt_Y

In [35]:
def TestMissingGenerate(tt_data,mr=MISSING_RATE):
    """Blank out a random fraction of the cells of ``tt_data``.

    Distinct ``(row, column)`` positions are drawn by rejection sampling from
    NumPy's global RNG (row first, then column) until at least
    ``rows * cols * mr`` of them have been collected. Those cells are then set
    to ``NaN`` in place.

    Parameters
    ----------
    tt_data : pandas.DataFrame
        Frame to mask. It is modified in place and also returned.
        Columns are presumably float so that ``NaN`` can be stored.
    mr : float
        Fraction of all cells to blank out. Must not ask for more cells than
        the frame contains.

    Returns
    -------
    tt_data : pandas.DataFrame
        The same object that was passed in, now containing the ``NaN`` cells.
    missing_entry : list of (int, int)
        Positional ``(row, column)`` pairs in the order they were drawn.

    Raises
    ------
    ValueError
        If ``mr`` requests more cells than exist. Previously this case looped
        forever.
    """
    n_rows, n_cols = tt_data.shape
    target = n_rows * n_cols * mr
    if target > n_rows * n_cols:
        raise ValueError(
            "mr=%r asks for more missing cells than the %d available"
            % (mr, n_rows * n_cols))

    row, col = np.arange(n_rows), np.arange(n_cols)
    missing_entry = []
    # The set gives O(1) duplicate checks; the list keeps the draw order.
    seen = set()
    while len(missing_entry) < target:
        # Tuple elements are evaluated left to right, so the row is drawn
        # before the column exactly as before.
        cell = (int(np.random.choice(row)), int(np.random.choice(col)))
        if cell in seen:
            continue
        seen.add(cell)
        missing_entry.append(cell)

    # ``.iat`` is positional, so this is correct even when the index is not
    # 0..n-1; it also replaces the removed ``DataFrame.set_value``.
    for r, c in missing_entry:
        tt_data.iat[r, c] = np.nan
    return tt_data, missing_entry

In [43]:
def Imputation(d,missing):
    """Return a 3-component NMF reconstruction of ``d``.

    ``NaN`` cells are first replaced by 0.0, the frame is factorised as
    ``W . H`` with a seeded NMF model, and the product is returned.

    Parameters
    ----------
    d : pandas.DataFrame
        Matrix to reconstruct; may contain ``NaN``.
    missing : list of (int, int)
        Positions of the missing cells. Currently unused; kept so existing
        callers keep working.

    Returns
    -------
    numpy.ndarray
        ``W.dot(H)``, with the same shape as ``d``.
    """
    d = d.fillna(0.0)
    model = NMF(n_components=3, init='random', random_state=0)
    # One fit only: fit_transform() fits the model and returns W, and the
    # matching H is read from that same fit. The previous code fitted the
    # model twice and took H and W from different fits.
    W = model.fit_transform(d)
    H = model.components_
    d_prime = W.dot(H)

    # NOTE(review): every cell is replaced by the reconstruction, including
    # observed ones. The disabled variant below would overwrite only the
    # cells listed in ``missing``.
#     d = d.values
#     for m in missing:
#         d[m[0]][m[1]] = d_prime[m[0]][m[1]]
#         #d[m[0]][m[1]] = 0.0
    return d_prime

In [37]:
tn_X,tn_Y,tt_X,tt_Y = preprocessing(data)

In [38]:
tt_X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
18,0.346154,0.254902,0.253012,0.007669,0.071217,0.052265,0.37587,0.08849,0.363636,0.360465,0.532258
1602,0.173077,0.303922,0.144578,0.062883,0.074184,0.087108,0.157773,0.067284,0.509091,0.162791,0.758065
2291,0.461538,0.088235,0.168675,0.032209,0.11276,0.125436,0.327146,0.126084,0.345455,0.395349,0.451613
1508,0.288462,0.127451,0.295181,0.21319,0.151335,0.167247,0.37355,0.231155,0.754545,0.255814,0.290323
3798,0.394231,0.127451,0.23494,0.021472,0.142433,0.066202,0.299304,0.089647,0.3,0.348837,0.467742


In [None]:
#set(tn_X.index).intersection(set(tt_X.index))

In [39]:
# Drop the shuffled row labels so both feature frames are indexed 0..n-1.
tn_X = tn_X.reset_index(drop=True)
tt_X = tt_X.reset_index(drop=True)
# Keep an independent copy of the complete test features; later cells compare
# against it and read queried values from it.
tt_oracle = tt_X.copy()

In [12]:
# Export the complete test features without index or header row.
# NOTE(review): absolute, user-specific path; this cell fails on any machine
# without this directory.
tt_oracle.to_csv("/Users/chenchengkuan/Desktop/Movie-Rating-Prediction-Autoencoder/Data/wine_zeroone_oracle.csv",index=False,header=None)

In [None]:
#tt_oracle.head()

In [40]:
tt_X, missing_entry = TestMissingGenerate(tt_X)

In [41]:
tt_X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.346154,0.254902,0.253012,,0.071217,0.052265,,,0.363636,0.360465,0.532258
1,0.173077,0.303922,,0.062883,0.074184,0.087108,0.157773,,0.509091,0.162791,
2,0.461538,,,0.032209,,0.125436,0.327146,,0.345455,0.395349,0.451613
3,0.288462,,0.295181,0.21319,,0.167247,,,,0.255814,
4,0.394231,,0.23494,0.021472,0.142433,0.066202,,,,0.348837,


In [42]:
tt_X =  tt_X.fillna(0)

In [54]:
# Export the zero-filled test features without index or header row.
# NOTE(review): absolute, user-specific path; this cell fails on any machine
# without this directory.
tt_X.to_csv("/Users/chenchengkuan/Desktop/Movie-Rating-Prediction-Autoencoder/Data/wine_zeroone.csv",index=False,header=None)

In [None]:
pd.read_csv("/Users/chenchengkuan/Desktop/Movie-Rating-Prediction-Autoencoder/Data/wine_zc.csv",header=None)

In [44]:
tt_imp = Imputation(tt_X,missing_entry)

In [47]:
s = np.array([[1,2],[3,4]])

In [48]:
s ** 2

array([[ 1,  4],
       [ 9, 16]])

In [55]:
tt_oracle

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.346154,0.254902,0.253012,0.007669,0.071217,0.052265,0.375870,0.088490,0.363636,0.360465,0.532258
1,0.173077,0.303922,0.144578,0.062883,0.074184,0.087108,0.157773,0.067284,0.509091,0.162791,0.758065
2,0.461538,0.088235,0.168675,0.032209,0.112760,0.125436,0.327146,0.126084,0.345455,0.395349,0.451613
3,0.288462,0.127451,0.295181,0.213190,0.151335,0.167247,0.373550,0.231155,0.754545,0.255814,0.290323
4,0.394231,0.127451,0.234940,0.021472,0.142433,0.066202,0.299304,0.089647,0.300000,0.348837,0.467742
5,0.250000,0.137255,0.192771,0.174847,0.169139,0.191638,0.345708,0.233083,0.800000,0.244186,0.161290
6,0.278846,0.421569,0.144578,0.023006,0.100890,0.041812,0.338747,0.063428,0.454545,0.441860,0.806452
7,0.173077,0.102941,0.114458,0.099693,0.115727,0.118467,0.234339,0.140158,0.490909,0.220930,0.241935
8,0.298077,0.107843,0.198795,0.015337,0.100890,0.212544,0.324826,0.103914,0.654545,0.348837,0.451613
9,0.365385,0.235294,0.210843,0.015337,0.246291,0.076655,0.299304,0.140158,0.427273,0.255814,0.290323


In [52]:
np.sum(((tt_imp - tt_oracle).values) ** 2)

540.42808169939417

In [23]:
tt_X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.346154,0.254902,0.253012,0.0,0.071217,0.052265,0.0,0.0,0.363636,0.360465,0.532258
1,0.173077,0.303922,0.0,0.062883,0.074184,0.087108,0.157773,0.0,0.509091,0.162791,0.0
2,0.461538,0.0,0.0,0.032209,0.0,0.125436,0.327146,0.0,0.345455,0.395349,0.451613
3,0.288462,0.0,0.295181,0.21319,0.0,0.167247,0.0,0.0,0.0,0.255814,0.0
4,0.394231,0.0,0.23494,0.021472,0.142433,0.066202,0.0,0.0,0.0,0.348837,0.0


In [24]:
tt_imp

array([[ 0.34615385,  0.25490196,  0.25301205, ...,  0.36363636,
         0.36046512,  0.53225806],
       [ 0.17307692,  0.30392157,  0.11061748, ...,  0.50909091,
         0.1627907 ,  0.01526215],
       [ 0.46153846,  0.18414557,  0.18627683, ...,  0.34545455,
         0.39534884,  0.4516129 ],
       ..., 
       [ 0.33653846,  0.14705882,  0.22289157, ...,  0.01859166,
         0.25709574,  0.67741935],
       [ 0.13062929,  0.31372549,  0.12948831, ...,  0.58181818,
         0.10109211,  0.74193548],
       [ 0.1464656 ,  0.07843137,  0.18072289, ...,  0.6       ,
         0.25581395,  0.51612903]])

In [None]:
tt_oracle.head(10)

In [None]:
# Fit the classifier on the complete training features; it is later evaluated
# on the imputed test matrix.
clf = LogisticRegression(C=1)
clf.fit(tn_X,tn_Y)

In [None]:
sum(clf.predict(tn_X) == tn_Y) / float(len(tn_Y))

In [None]:
sum(clf.predict(tt_imp) == tt_Y) / float(len(tt_Y))

In [None]:
def RandomQuery(m):
    """Return one element of ``m`` picked uniformly at random.

    The position is drawn with ``np.random.randint``, so the choice follows
    NumPy's global random state.
    """
    position = np.random.randint(0, len(m))
    return m[position]

In [None]:
def NaiveHeuristic(model=clf,missing=missing_entry,data=tt_imp):
    """Pick the next missing cell to query.

    Rows are visited from least to most certain, where certainty is the gap
    between the two highest predicted class probabilities. Within a row,
    features are tried in decreasing order of the absolute coefficient that
    the model assigns them for the row's predicted class. The first
    ``(row, feature)`` pair found in ``missing`` is returned.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict_proba``, ``predict``, ``coef_`` and
        ``classes_``.
    missing : list of (int, int)
        Positions that are still missing.
    data : array-like
        Current (imputed) feature matrix.

    The defaults are bound to the notebook objects that exist when this cell
    is executed.

    Returns
    -------
    (int, int) or None
        The selected ``(row, feature)`` position, or ``None`` when no row has
        a missing cell left.
    """
    predict_prob = model.predict_proba(data)
    predict = model.predict(data)
    coef = model.coef_

    # Certainty = best probability minus runner-up, computed for all rows.
    ranked = np.sort(predict_prob, axis=1)
    certainty = ranked[:, -1] - ranked[:, -2]
    q_r = np.argsort(certainty)

    # Rows of coef_ follow the order of classes_, not the label values, so
    # labels are translated to row positions. A model with a single
    # coefficient row (binary case) uses that row for every prediction.
    class_row = dict((label, i) for i, label in enumerate(model.classes_))
    single_row = coef.shape[0] == 1

    # Feature ranking depends only on the class, so compute it once per class.
    feature_order = np.argsort(np.absolute(coef), axis=1)[:, ::-1]

    # Set lookup avoids scanning the whole list for every candidate cell.
    still_missing = set(missing)

    for r in q_r:
        row = 0 if single_row else class_row[predict[r]]
        for w in feature_order[row]:
            cell = (int(r), int(w))
            if cell in still_missing:
                return cell
    return None

In [None]:
# Active-query loop: reveal one missing cell at a time from the oracle and
# record the test accuracy every 100 queries.
query_num = 0
query_budget = tt_imp.shape[0] * tt_imp.shape[1] * MISSING_RATE * QUERY_RATE

res_naive = []
while query_num < query_budget:

    #Random query
    #query = RandomQuery(missing_entry)
    query = NaiveHeuristic()
    if query is None:
        # Nothing left to reveal; stop instead of failing in list.remove().
        break
    missing_entry.remove(query)

    # Positional access on both sides (``.iat`` replaces the removed ``.ix``).
    tt_imp[query[0], query[1]] = tt_oracle.iat[query[0], query[1]]

    if query_num % 100 == 0:
        #print "#of Query" + str(query_num) + " Query entry: "
        # Accuracy is measured after the current cell has been revealed.
        tmp_r = sum(clf.predict(tt_imp) == tt_Y) / float(len(tt_imp))
        print(tmp_r)
        res_naive.append(tmp_r)
    query_num += 1

In [None]:
0.339795918367
0.334693877551
0.338775510204
0.338775510204
0.338775510204
0.336734693878
0.340816326531
0.34387755102
0.351020408163
0.35
0.345918367347
0.345918367347
0.348979591837
0.351020408163
0.354081632653
0.357142857143
0.359183673469
0.363265306122

In [None]:
0.464285714286
0.463265306122
0.466326530612
0.469387755102
0.473469387755
0.478571428571
0.477551020408

In [None]:
# Persist the accuracy curve. Pickles are binary data, so the file is opened
# in 'wb', and the context manager closes the handle once the dump is done.
with open("naive_30.pkl", 'wb') as out_file:
    cPickle.dump(res_naive, out_file)

In [None]:
#0.8 missing rate 0.3 query rate 
#of Query0 Query entry: 
0.339795918367
#of Query100 Query entry: 
0.334693877551
#of Query200 Query entry: 
0.338775510204
#of Query300 Query entry: 
0.338775510204
#of Query400 Query entry: 
0.338775510204
#of Query500 Query entry: 
0.336734693878
#of Query600 Query entry: 
0.340816326531
#of Query700 Query entry: 
0.34387755102
#of Query800 Query entry: 
0.351020408163
#of Query900 Query entry: 
0.35
#of Query1000 Query entry: 
0.345918367347
#of Query1100 Query entry: 
0.345918367347
#of Query1200 Query entry: 
0.348979591837
#of Query1300 Query entry: 
0.351020408163
#of Query1400 Query entry: 
0.354081632653
#of Query1500 Query entry: 
0.357142857143
#of Query1600 Query entry: 
0.359183673469
#of Query1700 Query entry: 
0.363265306122