# 模块导入

In [1]:
import pandas as pd
import numpy as np
import json
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# Load the training table from CSV; every later cell derives its features from `data`.
data = pd.read_csv("data/train_clean.csv")

# 连续值特征数据处理

In [3]:
def continuous_features(data):
	"""Build the continuous-feature matrix.

	The "time" and "resolution_ratio" columns are z-score normalised
	(pandas' default sample standard deviation); the "ip" column is appended
	as-is, without scaling.

	Args:
		data: DataFrame containing the "time", "resolution_ratio" and "ip"
			columns.

	Returns:
		A 2-D numpy array with one row per input row and three columns, in the
		order time, resolution_ratio, ip.
	"""
	def normalize_feature(df):
		"""Z-score every column of `df`, returning a new DataFrame."""
		mean = df.mean()
		std = df.std()
		# A constant column has std 0 and a single-row frame has std NaN;
		# dividing by either would fill the whole column with NaN. Use 1 as
		# the divisor in that case so the centred column becomes all zeros.
		std = std.where(std > 0, 1.0)
		return (df - mean) / std

	continuous_train = normalize_feature(data[["time", "resolution_ratio"]])
	# assign() returns a new frame, so the caller's `data` is never modified.
	continuous_train = continuous_train.assign(ip=data["ip"])

	return np.array(continuous_train)

In [4]:
# Continuous feature matrix: normalised time / resolution_ratio plus the raw ip column.
continuous_train = continuous_features(data)

In [5]:
# Sanity check: display (rows, columns) of the continuous feature matrix.
continuous_train.shape

(1000000, 3)

# one-hot特征数据处理

In [6]:
def one_hot_features(data):
	"""Build the one-hot encoded feature matrix for the categorical columns.

	Each of "apptype", "dvctype", "ntt", "carrier", "orientation" and "lan" is
	one-hot encoded separately, and the encodings are stacked side by side in
	that order.

	Args:
		data: DataFrame containing the six categorical columns above, already
			encoded as non-negative integers.

	Returns:
		A 2-D numpy array with one row per input row.
	"""
	# Loaded before the helper is defined so the closure's dependency is obvious.
	# Presumably maps feature name -> index vocabulary; confirm against the
	# script that writes this file.
	with open("data/index_json.json", "r") as f:
		index_json = json.load(f)

	def make_one_hot(feature, one_hot_data):
		"""One-hot encode `one_hot_data` using a width derived for `feature`."""
		try:
			n = len(index_json[feature])
		except (KeyError, TypeError):
			# No usable entry for this feature in index_json: fall back to the
			# largest value observed in the data. Only these two errors are
			# caught so that unrelated failures are not silently hidden.
			n = int(data[feature].max())

		# Width is n + 1 in both cases, so index n itself is representable.
		return np_utils.to_categorical(one_hot_data, n + 1)

	one_hot_set = {}
	for i in ["apptype", "dvctype", "ntt", "carrier", "orientation", "lan"]:
		# NOTE(review): the uint8 cast assumes every encoded index fits in
		# 0..255 -- confirm for all six columns.
		one_hot_set[i] = make_one_hot(i, np.uint8(data[i].values.reshape(len(data[i]), 1)))

	one_hot_train = np.hstack(tuple(one_hot_set.values()))

	return one_hot_train

In [7]:
# One-hot feature matrix for the six categorical columns.
one_hot_train = one_hot_features(data)

In [8]:
# Sanity check: display (rows, total one-hot width).
one_hot_train.shape

(1000000, 111)

# embedding特征数据处理

In [9]:
def embedding_features(data):
	"""Select the columns destined for embedding layers.

	Args:
		data: DataFrame containing the ten columns listed below.

	Returns:
		A 2-D numpy array with one row per input row and one column per
		selected feature, in the order listed.
	"""
	embedding_columns = [
		"pkgname", "adunitshowid", "mediashowid", "city",
		"adidmd5", "imeimd5", "openudidmd5", "macmd5",
		"model", "osv",
	]
	return np.array(data[embedding_columns])

In [10]:
# Matrix of the ten columns selected by embedding_features.
embedding_train = embedding_features(data)

In [11]:
# Sanity check: display (rows, number of embedding columns).
embedding_train.shape

(1000000, 10)

# 整合

In [16]:
# Labels as a column vector; -1 lets numpy infer the row count from the data.
Y_train = np.array(data["label"]).reshape(-1, 1)

In [17]:
# Sanity check: the label array should have a single column.
Y_train.shape

(1000000, 1)

In [14]:
# Join the continuous and one-hot matrices column-wise into one array of
# features that are not fed through embeddings.
unembedding_train = np.concatenate([continuous_train, one_hot_train], axis=1)

In [15]:
# Sanity check: column count should equal continuous width + one-hot width.
unembedding_train.shape

(1000000, 114)