In [28]:
# Programming Assignment #3 
# Hanyu Feng 
# Student ID:452106
# T81-558: Application of Deep Learning
import os
import sklearn
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.cross_validation import KFold
from scipy.stats import zscore
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import tensorflow.contrib.learn as learn
from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
#from sklearn.model_selection import KFold
import shutil
import time
ENCODING = 'utf-8'

path = "./assignment3/data"

# Encode a text field to dummy variables
def encode_text_dummy(df,name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name,x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Encode a text field to a single index value
def encode_text_index(df,name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Encode a numeric field to Z-Scores
def encode_numeric_zscore(df,name,mean=None,sd=None):
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()
    if sd ==0:
        df[name] = df[name]
    else :
        df[name] = (df[name]-mean)/sd
    


# Encode a numeric field to fill missing values with the median.
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)

# Convert a dataframe to x/y suitable for training.
def to_xy(df,target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    return df.as_matrix(result),df[target]



def get_model_dir(name,erase):
    base_path = os.path.join("./assignment3","dnn")
    model_dir = os.path.join(base_path,name)
    os.makedirs(model_dir,exist_ok=True)
    if erase and len(model_dir)>4 and os.path.isdir(model_dir):
        shutil.rmtree(model_dir,ignore_errors=True) # be careful, this deletes everything below the specified path
    return model_dir

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))

def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [None]:
print("***Assignment3***")
tf.logging.set_verbosity(tf.logging.ERROR)
# Read the data



filename_train = os.path.join(path,"train.csv")
filename_trainencode = os.path.join(path,"encodetrain.csv")
filename_test = os.path.join(path,"test.csv")
df_train = pd.read_csv(filename_train,na_values=['NA','?'])


tempid = df_train['id']
temptitle = df_train['title']
templen = df_train['len']
df_train.drop('id',axis=1,inplace=True)
df_train.drop('title',axis=1,inplace=True)

encode_numeric_zscore(df_train,'len')
#encode_numeric_zscore(df_train,'cite_arXiv')
#encode_numeric_zscore(df_train,'cite_av_media')
#encode_numeric_zscore(df_train,'cite_book')
#encode_numeric_zscore(df_train,'cite_comic')
#encode_numeric_zscore(df_train,'cite_conference')
#encode_numeric_zscore(df_train,'cite_encyclopedia')
##encode_numeric_zscore(df_train,'cite_episode')
#encode_numeric_zscore(df_train,'Cite_govtrack')
#encode_numeric_zscore(df_train,'cite_journal')
#encode_numeric_zscore(df_train,'cite_magazine')
#encode_numeric_zscore(df_train,'cite_press_release')
encode_numeric_zscore(df_train,'cite_web')
encode_numeric_zscore(df_train,'links')
#encode_numeric_zscore(df_train,'tables')
encode_numeric_zscore(df_train,'files')
#encode_numeric_zscore(df_train,'math')

df_trainencode = df_train
df_trainencode.to_csv(filename_trainencode, index=False)
#df_train.drop('len',axis=1,inplace=True)

classnum = encode_text_index(df_train,'class')
num_classes = len(classnum)

#print(df_train)

# Preprocess the data
#df.to_csv(filename_write,index=False)

#
#print(df)

# Split the data
x,y = to_xy(df_train,'class')

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
x, y, test_size=0.25)

# Get/clear a directory to store the neural network to
model_dir = get_model_dir('Wiki',True)

# Choose an optimizer
opt=tf.train.AdamOptimizer(learning_rate=0.1)

# Create a deep neural network with 3 hidden layers
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=x.shape[0])]
classifier = learn.DNNClassifier(
    model_dir= model_dir, 
    optimizer=opt,
    dropout = 0.01,
    config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1),
    hidden_units=[8,16,4], n_classes=num_classes, feature_columns=feature_columns)



# Early stopping
validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
    x_test,
    y_test,
    every_n_steps=50,
    early_stopping_metric="loss",
    early_stopping_metric_minimize=True,
    early_stopping_rounds=500)

start_time = time.time()

# Fit/train neural network
classifier.fit(x_train, y_train,monitors=[validation_monitor],  steps=5000)# monitors=[validation_monitor]

elapsed_time = time.time() - start_time
print("Elapsed time: {}".format(hms_string(elapsed_time)))

print("Best step: {}, Last successful step: {}".format(
validation_monitor.best_step,validation_monitor._last_successful_step))

# Don't display numpy in scientific notation
np.set_printoptions(precision=4)
np.set_printoptions(suppress=True)
pred = list(classifier.predict(x_test, as_iterable=True))
score1 = metrics.accuracy_score(y_test, pred)
print("Accuracy score: {}".format(score1))

pred = list(classifier.predict_proba(x_test, as_iterable=True))
score2 = metrics.log_loss(y_test,pred)
print("Log loss score: {}".format(score2))
score = score1/score2
print("Total score: {}".format('%.3f'%score))

***Assignment3***


In [36]:
#from sklearn import metrics
#filename_test = os.path.join(path,"test.csv")
filename_submit = os.path.join(path,str('%.3f'%score)+"submit.csv")
df_test = pd.read_csv(filename_test,na_values=['NA','?'])

tempid = df_test['id']
temptitle = df_test['title']


encode_numeric_zscore(df_test,'len')
#encode_numeric_zscore(df_test,'cite_arXiv')
#encode_numeric_zscore(df_test,'cite_av_media')
#encode_numeric_zscore(df_test,'cite_book')
#encode_numeric_zscore(df_test,'cite_comic')
#encode_numeric_zscore(df_test,'cite_conference')
#encode_numeric_zscore(df_test,'cite_encyclopedia')
#encode_numeric_zscore(df_test,'cite_episode')
#encode_numeric_zscore(df_test,'Cite_govtrack')
#encode_numeric_zscore(df_test,'cite_journal')
#encode_numeric_zscore(df_test,'cite_magazine')
#encode_numeric_zscore(df_test,'cite_press_release')
encode_numeric_zscore(df_test,'cite_web')
encode_numeric_zscore(df_test,'links')
#encode_numeric_zscore(df_test,'tables')
encode_numeric_zscore(df_test,'files')
#encode_numeric_zscore(df_test,'math')

#print(df_test)

df_test.drop('id',axis=1,inplace=True)
df_test.drop('title',axis=1,inplace=True)
#df_test.drop('len',axis=1,inplace=True)

x = df_test.as_matrix().astype(np.float32)

# Generate predictions
pred = list(classifier.predict_proba(x, as_iterable=True))

df_submit = pd.DataFrame(pred)
df_submit.insert(0,'id',tempid)
df_submit.columns = ['id','class-0','class-1','class-2','class-3','class-4']
#df_submit.insert(6,'class-5',0)
df_submit.to_csv(filename_submit, index=False)
print(df_submit)

          id   class-0       class-1       class-2       class-3       class-4
0       6639  0.630670  2.183953e-02  2.722955e-03  3.334488e-01  1.131922e-02
1       9603  0.052670  6.129435e-01  3.343655e-01  2.111747e-05  3.926943e-11
2      12234  0.746944  1.472519e-01  2.143760e-02  8.406988e-02  2.963127e-04
3      16535  0.013201  4.400193e-01  5.467795e-01  4.489084e-07  3.859145e-14
4      18157  0.982462  3.048287e-04  9.600405e-04  1.622228e-02  5.082044e-05
5      33302  0.007809  3.742804e-01  6.179103e-01  1.133390e-07  3.372657e-15
6      37190  0.149001  6.746165e-01  1.758357e-01  5.464346e-04  1.601994e-08
7      43051  0.208046  6.599697e-01  1.302272e-01  1.756926e-03  1.456208e-07
8      43373  0.002893  2.660013e-01  7.311057e-01  9.113484e-09  4.042695e-17
9      51487  0.342525  5.755535e-01  6.908134e-02  1.283311e-02  6.807683e-06
10     54573  0.175047  6.706213e-01  1.533815e-01  9.497156e-04  4.535687e-08
11     87367  0.378322  4.578809e-01  1.619506e-01  

In [25]:
'''
encode_numeric_zscore(df_train,'len')
#encode_numeric_zscore(df_train,'cite_arXiv')
encode_numeric_zscore(df_train,'cite_av_media')
encode_numeric_zscore(df_train,'cite_book')
#encode_numeric_zscore(df_train,'cite_comic')
encode_numeric_zscore(df_train,'cite_conference')
encode_numeric_zscore(df_train,'cite_encyclopedia')
encode_numeric_zscore(df_train,'cite_episode')
#encode_numeric_zscore(df_train,'Cite_govtrack')
encode_numeric_zscore(df_train,'cite_journal')
encode_numeric_zscore(df_train,'cite_magazine')
encode_numeric_zscore(df_train,'cite_press_release')
encode_numeric_zscore(df_train,'cite_web')
encode_numeric_zscore(df_train,'links')
encode_numeric_zscore(df_train,'tables')
encode_numeric_zscore(df_train,'files')
encode_numeric_zscore(df_train,'math')
'''



'''
encode_numeric_zscore(df_test,'len')
#encode_numeric_zscore(df_test,'cite_arXiv')
encode_numeric_zscore(df_test,'cite_av_media')
encode_numeric_zscore(df_test,'cite_book')
#encode_numeric_zscore(df_test,'cite_comic')
encode_numeric_zscore(df_test,'cite_conference')
encode_numeric_zscore(df_test,'cite_encyclopedia')
encode_numeric_zscore(df_test,'cite_episode')
#encode_numeric_zscore(df_test,'Cite_govtrack')
encode_numeric_zscore(df_test,'cite_journal')
encode_numeric_zscore(df_test,'cite_magazine')
encode_numeric_zscore(df_test,'cite_press_release')
encode_numeric_zscore(df_test,'cite_web')
encode_numeric_zscore(df_test,'links')
encode_numeric_zscore(df_test,'tables')
encode_numeric_zscore(df_test,'files')
encode_numeric_zscore(df_test,'math')
'''

"\nencode_numeric_zscore(df_test,'len')\n#encode_numeric_zscore(df_test,'cite_arXiv')\nencode_numeric_zscore(df_test,'cite_av_media')\nencode_numeric_zscore(df_test,'cite_book')\n#encode_numeric_zscore(df_test,'cite_comic')\nencode_numeric_zscore(df_test,'cite_conference')\nencode_numeric_zscore(df_test,'cite_encyclopedia')\nencode_numeric_zscore(df_test,'cite_episode')\n#encode_numeric_zscore(df_test,'Cite_govtrack')\nencode_numeric_zscore(df_test,'cite_journal')\nencode_numeric_zscore(df_test,'cite_magazine')\nencode_numeric_zscore(df_test,'cite_press_release')\nencode_numeric_zscore(df_test,'cite_web')\nencode_numeric_zscore(df_test,'links')\nencode_numeric_zscore(df_test,'tables')\nencode_numeric_zscore(df_test,'files')\nencode_numeric_zscore(df_test,'math')\n"