In [2]:
import os
import sys

# Make the project's `final` package importable from this notebook.
# The checkout location defaults to the original author's path but can be
# overridden per machine through the PRICE_PREDICTION_ROOT environment variable,
# so the notebook does not have to be edited to run elsewhere.
PROJECT_ROOT = os.environ.get(
    'PRICE_PREDICTION_ROOT',
    '/Users/jinchen/Desktop/DSE/IntroML/Project/project-product-price-prediction')

# Guarded append: re-running the cell must not pile up duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
import pandas as pd
import numpy as np
from final.model_evaluation.keras_model import neural_network
from sklearn.model_selection import train_test_split
from final.random_sampling.even_sample_brand import stratified_sampling_by_brand
from final.feature_extraction.text_vectorization import encode_categories,encode_string_column

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression # F-value between label/feature for regression tasks.
from final.helper.save_data import save_np_file

Using TensorFlow backend.


In [5]:
# Read the pre-cleaned dataset (path is relative to the notebook's working directory).
clean_data_path = "../../data/clean_data_with_text_features.csv"
df = pd.read_csv(clean_data_path)

In [6]:
# Regression target: log1p of the 'price' column.
Y = np.log1p(df['price'])

# Split features/target 80/20 with a fixed seed.
# The 'price' column is dropped on a temporary copy instead of in place, so `df`
# is left untouched and this cell can be re-run without a KeyError on df['price'].
# Note: `df` therefore still contains 'price' after this cell.
train_df, test_df, y_train, y_test = train_test_split(
    df.drop(['price'], axis=1), Y, test_size=0.2, random_state=12342)
print('Train size: %s, Test size: %s'%(train_df.shape, test_df.shape))

Train size: (1184865, 45), Test size: (296217, 45)


# Feature Extraction

In [None]:
# Vectorize the categorical columns with the project helper `encode_categories`.
categorical_columns = ['c1', 'c2', 'c3', 'brand_name']
train_cat_features, test_cat_features, train_cat_features_name = encode_categories(
    train_df, test_df,
    columns=categorical_columns,
    min_df=10,
    print_progress=True)

# Vectorize the free-text columns with the project helper `encode_string_column`.
text_columns = ['clean_name', 'clean_description']
train_str_features, test_str_features, train_str_features_name = encode_string_column(
    train_df, test_df,
    columns=text_columns,
    min_df=10,
    max_features=15000,
    print_progress=True)

# Every numeric column except the 'train_id' identifier is passed through as-is.
other_columns = train_df.select_dtypes([np.number]).columns.tolist()
other_columns.remove('train_id')
train_other_features = train_df[other_columns].values
test_other_features = test_df[other_columns].values

# Stack the three feature groups column-wise into one matrix per split.
# NOTE(review): np.hstack assumes the helpers return dense ndarrays - confirm;
# scipy sparse outputs would need scipy.sparse.hstack instead.
all_train = np.hstack([train_cat_features, train_str_features, train_other_features])
all_test = np.hstack([test_cat_features, test_str_features, test_other_features])

feature_shapes = (all_train.shape, all_test.shape)
print('Train features size: %s, Test features size: %s' % feature_shapes)

Size of vectorization features of c1 is 13
Size of vectorization features of c2 is 144
Size of vectorization features of c3 is 780
Size of vectorization features of brand_name is 2002
Shape of train vectorization features of ['c1', 'c2', 'c3', 'brand_name'] is (1184865, 2939)
Shape of test vectorization features of ['c1', 'c2', 'c3', 'brand_name'] is (296217, 2939)
Size of vectorization features of clean_name is 15000
Size of vectorization features of clean_description is 15000
Shape of train vectorization features of ['clean_name', 'clean_description'] is (1184865, 30000)
Shape of test vectorization features of ['clean_name', 'clean_description'] is (296217, 30000)


In [None]:
# Drop the names of the per-group feature blocks and the source frames;
# the cells below only use the stacked all_train / all_test matrices and the targets.
# Re-running this cell on its own raises NameError because the names are gone.
del train_cat_features, test_cat_features
del train_str_features, test_str_features
del train_other_features, test_other_features
del train_df, test_df

# Feature Selection

In [None]:
# Univariate feature selection: keep the 5000 features scored highest by f_regression.
# The selector is fitted on the training split only and then applied to both splits.
skb = SelectKBest(score_func=f_regression, k=5000)
skb.fit(all_train, y_train)
x_skb_select_train = skb.transform(all_train)
x_skb_select_test = skb.transform(all_test)

In [None]:
# Persist the selected feature matrices and the targets via the project helper
# `save_np_file`, one call per file, all into the same data directory.
directory = "../../data"
arrays_to_save = [
    ("select_k_best_train.npy", x_skb_select_train),
    ("select_k_best_test.npy", x_skb_select_test),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
]
for npy_name, array in arrays_to_save:
    save_np_file(dir=directory, filename=npy_name, data=array)

# Train NN Model

## With select-k-best features

In [None]:
# Train the project's `neural_network` wrapper on the SelectKBest-reduced features.
# NOTE(review): n_epoch / epoch / bs are defined by final.model_evaluation.keras_model;
# bs is presumably the batch size - confirm there.
model = neural_network(model_prefix="select_k_best")
skb_fit_options = dict(n_epoch=30, epoch=1, bs=128)
model.fit(x_skb_select_train, y_train.values, **skb_fit_options)

In [None]:
# Show the summary of the underlying model object held by the wrapper
# (presumably the Keras model - confirm in final.model_evaluation.keras_model).
wrapped_network = model.model
wrapped_network.summary()

In [None]:
# Predict on both splits (train first, then test) with the model fitted on selected features.
skb_select_train_pred, skb_select_test_pred = [
    model.predict(features)
    for features in (x_skb_select_train, x_skb_select_test)
]

In [None]:
# Compare predictions with the targets on both splits using the wrapper's own
# `evaluation` method; the meaning of price_split is defined by that method.
model.evaluation(
    y_train, skb_select_train_pred,
    y_test, skb_select_test_pred,
    price_split=30,
)

## With all features

In [None]:
# Train a second `neural_network` wrapper on the full (unselected) feature matrix,
# with the same fit arguments as the select-k-best run.
all_model = neural_network(model_prefix="all_data")
all_fit_options = dict(n_epoch=30, epoch=1, bs=128)
all_model.fit(all_train, y_train.values, **all_fit_options)

In [None]:
# Predict on both splits (train first, then test) with the all-features model.
all_train_pred, all_test_pred = [
    all_model.predict(features)
    for features in (all_train, all_test)
]

In [None]:
# Evaluate the all-features model on both splits with the wrapper's `evaluation` method,
# using the same price_split as the select-k-best run.
all_model.evaluation(
    y_train, all_train_pred,
    y_test, all_test_pred,
    price_split=30,
)