In [1]:
import numpy as np 
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sea
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression,RANSACRegressor,ElasticNet
from sklearn.preprocessing import MinMaxScaler,StandardScaler,PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,BaggingRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.svm import SVR
from sklearn.cluster import KMeans,DBSCAN
import collections

In [4]:
## Loading file 
data = pd.read_excel('_1.xlsx')
data.columns = ['day','month','year','store_num','buy_freq','sell_freq','price']


In [5]:
# Extracting Train and Test Indices
test_indices = list(np.where(data['price']==u'\u061f')[0])
data_index = list(data.index)
train_indices = [i for i in data_index if i not in test_indices]
train_data = data.iloc[train_indices]
test_data = data .iloc[test_indices]

## Per Store ID Information

In [19]:
## Exraxting All Unique Store IDs
all_stores = np.unique(train_data['store_num'].values)

## Extracting data rows for each Store ID(as a look_up table(dictionary))
data_per_store = {}
for store in all_stores:
    data_per_store['{}'.format(store)] = train_data[train_data['store_num'] == store]

## Producing Statistics about data rows for each Store(We Want to test whether we can create 
## model per Store ID.)
store_data_length = []
for i in data_per_store.values():
    store_data_length.append(len(i))
print "Average : {}".format(np.mean(store_data_length))
print "Standard Deviation : {}".format(np.std(store_data_length))
print "Maximum Length : {}".format(np.max(store_data_length))
print "Minimum Length : {}".format(np.min(store_data_length))

Average : 111.479323308
Standard Deviation : 87.5949835974
Maximum Length : 274
Minimum Length : 1


## Clustering Stores 

In [59]:
common_list = []
cols_for_clustering = ['price_diff_sign','but_to_sell_sign']
new_data_per_store = {}
store_cluster_lookup_table = {}
for key in data_per_store.keys():
    if len(data_per_store[key]) > 3:
        sd = data_per_store[key]
        new_representation = pd.DataFrame(sd.index)
        buy_f = np.array([float(i) for i in sd['buy_freq'].values])
        sell_f = np.array([float(i) for i in sd['sell_freq'].values])
        new_representation['but_to_sell'] = buy_f / sell_f
        new_representation['price_diff'] = sd['price'].diff().values
        new_representation['price_diff_sign'] = np.sign(new_representation['price_diff'].dropna())
        signs = []
        for i in new_representation['but_to_sell'].values:
            if i < 1 :
                signs.append(-1)
            elif i > 1 :
                signs.append(1)
            else:
                signs.append(0)
        new_representation['but_to_sell_sign'] = signs
        del new_representation['but_to_sell']
        del new_representation['price_diff']
        new_data_per_store[key] = new_representation.dropna()
        db = DBSCAN()
        db.fit(new_data_per_store[key][cols_for_clustering])
        common_list.append(collections.Counter(db.labels_).most_common())
        store_cluster_lookup_table[key] = collections.Counter(db.labels_).most_common()[0][0]
    else:
        store_cluster_lookup_table[key] = -1000

In [60]:
collections.Counter(store_cluster_lookup_table.values()).most_common()

[(0, 208), (1, 99), (2, 83), (-1, 81), (3, 30), (-1000, 19), (4, 10), (5, 2)]

## Splitting Data According to Store Cluster value

In [7]:
cluster0 = []
cluster1 = []
cluster2 = []
cluster3 = []
cluster4 = []
cluster5 = []
cluster_minus_one =[]
cluster_tiny_stores = []
for i in range(len(train_data) - 1):
    if store_cluster_lookup_table[str(train_data.iloc[i]['store_num'])] == 0:
        cluster0.append(train_data.iloc[i])
    elif store_cluster_lookup_table[str(train_data.iloc[i]['store_num'])] == 1:
        cluster1.append(train_data.iloc[i])
    elif store_cluster_lookup_table[str(train_data.iloc[i]['store_num'])] == 2:
        cluster2.append(train_data.iloc[i])
    elif store_cluster_lookup_table[str(train_data.iloc[i]['store_num'])] == 3:
        cluster3.append(train_data.iloc[i])
    elif store_cluster_lookup_table[str(train_data.iloc[i]['store_num'])] == 4:
        cluster4.append(train_data.iloc[i])
    elif store_cluster_lookup_table[str(train_data.iloc[i]['store_num'])] == 5:
        cluster5.append(train_data.iloc[i])
    elif store_cluster_lookup_table[str(train_data.iloc[i]['store_num'])] == -1:
        cluster_minus_one.append(train_data.iloc[i])
    elif store_cluster_lookup_table[str(train_data.iloc[i]['store_num'])] == -1000:
        cluster_tiny_stores.append(train_data.iloc[i])

In [8]:
cluster0 = pd.DataFrame(cluster0)
cluster1 = pd.DataFrame(cluster1)
cluster2 = pd.DataFrame(cluster2)
cluster3 = pd.DataFrame(cluster3)
cluster4 = pd.DataFrame(cluster4)
cluster5 = pd.DataFrame(cluster5)
cluster_minus_one = pd.DataFrame(cluster_minus_one)
cluster_tiny_stores = pd.DataFrame(cluster_tiny_stores)
clustered_data = [cluster0,cluster1,cluster2,cluster3,cluster4,cluster5,cluster_minus_one,cluster_tiny_stores]

In [21]:
## Statistics for SVR with rbf kernel
## Random Forest Regression
idx = [4,5] 
X = cluster2.iloc[:,idx].values
y = cluster2.iloc[:,6].values
scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)
y = scaler.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)



In [22]:
from sklearn.grid_search import GridSearchCV
param_grid = {'learning_rate' : [0.1,0.05,0.02,0.01],
              'max_depth' : [4,6],
              'min_samples_leaf' : [3,5,9,17],
              'max_features' : [1.0,0.3,0.1]}

In [23]:
params = {'n_estimators':1000,'learning_rate': 0.05,
 'max_depth': 4,
 'max_features': 0.1,
 'min_samples_leaf': 5}
est = GradientBoostingRegressor(**params)
est.fit(X_train,y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)

In [24]:
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
np.mean(np.abs(y_test - y_test_pred) / y_test)

MSE train: 0.775, test: 0.942
R^2 train: 0.226, test: 0.050


0.38226713440180426

In [88]:
%matplotlib
# compute test set deviance
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(lr.staged_predict(X_test)):
    test_score[i] = lr.loss_(y_test, y_pred)

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, lr.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

Using matplotlib backend: Qt4Agg


<matplotlib.text.Text at 0x7f7db004a610>

In [94]:
%matplotlib
feature_importance = lr.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, idx)
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

Using matplotlib backend: Qt4Agg


## Regression Based on Only information from stores itself

In [44]:
store_standalone_data = data_per_store[data_per_store.keys()[3]]
idx = [4,5] 
X = store_standalone_data.iloc[:,idx].values
y = store_standalone_data.iloc[:,6].values

In [45]:
## Random Forest Regression
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=1)
forest = RandomForestRegressor(n_estimators=1000,criterion='mse',random_state=1,n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

In [46]:
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))

MSE train: 29177650.595, test: 91680035.901
R^2 train: 0.800, test: 0.395


## Explotary Data Analysis

In [17]:
## Visiualizing Data Field to see whether there is some kind of linear or non-linear relationship 
## between them
import seaborn as sea
sea.set(style='whitegrid',context='notebook')
## Extracting Colums ['store_num', 'buy_freq', 'sell_freq', 'price']
cols = list(data.columns)[3:]
sea.pairplot(train_data[cols],size=2.5)
plt.show()

In [18]:
'''

The Pearson correlation coefficient measures the linear relationship between two datasets. 
Strictly speaking,Pearson's correlation requires that each dataset be normally distributed.
Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation.
Correlations of -1 or +1 imply an exact linear relationship.Positive correlations imply that as x 
increases, so does y. Negative correlations imply that as x increases, y decreases.
The p-value roughly indicates the probability of an uncorrelated system producing datasets
that have a Pearson correlation at least as extreme as the one computed from these datasets. 
The p-values are not entirely reliable but
are probably reasonable for datasets larger than 500 or so

'''

print "Pearson Correlation between price and other Features"
print "Sell_freq :" + str(pearsonr(train_data['sell_freq'],train_data['price']))
print "Buy_freq :" + str(pearsonr(train_data['buy_freq'],train_data['price']))
print "Store Number :" + str(pearsonr(train_data['store_num'],train_data['price']))
print "Day :" + str(pearsonr(train_data['day'],train_data['price']))
print "Month :" + str(pearsonr(train_data['month'],train_data['price']))
print "Year :" + str(pearsonr(train_data['year'],train_data['price']))

Pearson Correlation between price and other Features
Sell_freq :(0.0077505875271176661, 0.059094267647188836)
Buy_freq :(0.0025150212459648037, 0.54022558300946044)
Store Number :(0.044879277845874699, 7.8629068029899904e-28)
Day :(0.0031868214706579487, 0.43770579269211296)
Month :(0.084846501929298587, 3.4933203887432922e-95)
Year :(0.44619258678578994, 0.0)


In [19]:
## Normalizing Buy and Sell features(Just Two of them)
'''
x_std_scaler = StandardScaler()
y_std_scaler = StandardScaler()
X = x_std_scaler.fit_transform(train_data[train_data.columns[4:6]].values)
y = y_std_scaler.fit_transform(train_data['price'].values)
'''
## Normalizing Buy and Sell features between zero and one
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(train_data[train_data.columns[4:6]].values)
y = scaler.fit_transform(train_data['price'].values)
X = train_data[train_data.columns[4:6]].values
y = train_data['price'].values



In [22]:
X.shape

(59307, 2)

In [23]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

lr = LinearRegression()
lr.fit(X_train,y_train)
lr.predict(train_data[train_data.columns[4:6]].head().values)

quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X_train)
X_cubic = cubic.fit_transform(X_train)
lr.fit(X_quad,y_train)
lr.fit(X_cubic,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
## Anti-Noise Linear Regression
ransac = RANSACRegressor(LinearRegression(),max_trials=100,min_samples=50,
                        residual_metric=lambda x: np.sum(np.abs(x), axis=1),
                        residual_threshold=5.0,
                        random_state=0)
ransac.fit(X_train,y_train)
y_train_pred = ransac.predict(X_train)
y_test_pred = ransac.predict(X_test)

In [25]:
from sklearn.metrics import mean_squared_error
print('MSE train: %.3f, test: %.3f' % ( mean_squared_error(y_train, y_train_pred), 
                                       mean_squared_error(y_test, y_test_pred)))
from sklearn.metrics import r2_score
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

MSE train: 958474444351.260, test: 316541156.230
R^2 train: -4729.600, test: -0.539


## Regression on whole data(Random Forest)

In [54]:
cols = ['buy_freq','sell_freq']
benchmark = test_data[(test_data['day']== 9) & (test_data['month'] == 2) & (test_data['year'] == 2014)][cols].values

In [47]:
## Random Forest Regression
idx = [4,5] 
X = train_data.iloc[:,idx].values
y = train_data.iloc[:,6].values
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=1)
forest = RandomForestRegressor(n_estimators=1000,criterion='mse',random_state=1,n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

In [48]:
#Statistics for RandomForest Regressor
print('MSE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred),
                                       mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
print np.mean(np.abs(y_test - y_test_pred) / y_test)

MSE train: 5695.854, test: 12425.378
R^2 train: 0.735, test: -0.115
0.0333183488484


In [55]:
pred = forest.predict(benchmark)

In [56]:
np.median(pred)

373260.52799999999

In [58]:
train_data[(train_data['day']== 10) & (train_data['month'] == 2) & (train_data['year'] == 2014)]

Unnamed: 0,day,month,year,store_num,buy_freq,sell_freq,price
54175,10,2,2014,134455,875500,515000,379080
54176,10,2,2014,137415,412000,51500,379080
54177,10,2,2014,135725,10300000,5963700,379080
54178,10,2,2014,136655,20661800,103000,379080
54179,10,2,2014,139075,30900,87550,379080
54180,10,2,2014,133585,30900,515000,379080
54181,10,2,2014,134745,566500,861595,379080
54182,10,2,2014,135385,61800,227012,379080
54183,10,2,2014,136065,5994600,7899070,379080
54184,10,2,2014,146675,93031145,93031145,379080


## Gradient Boosting Classifiers

In [49]:
from sklearn.ensemble import GradientBoostingRegressor

In [64]:
boost_clf = GradientBoostingRegressor(n_estimators=1000)

In [51]:
## Statistics for SVR with rbf kernel
## Random Forest Regression
idx = [4,5] 
X = train_data.iloc[:,idx].values
y = train_data.iloc[:,6].values
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)
y = scaler.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=1)



In [65]:
boost_clf.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [None]:
import sys