In [1]:
import gc
import time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

In [161]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_log_error

from xgboost import XGBRegressor


In [3]:
train  = pd.read_csv('../MachineHackE-Commerce/Train.csv')
test = pd.read_csv('../MachineHackE-Commerce/Test.csv')
sub = pd.read_excel('../MachineHackE-Commerce/Sample_Submission.xlsx')


In [4]:
train_embedding = pd.read_csv('../MachineHackE-Commerce/train_embedding.csv')
train_topic = pd.read_csv('../MachineHackE-Commerce/train_topic.csv')
train_tfidf = pd.read_csv('../MachineHackE-Commerce/train_tfidf.csv')
train_enc = pd.read_csv('../MachineHackE-Commerce/train_enc.csv')

In [5]:
test_embedding = pd.read_csv('../MachineHackE-Commerce/test_embedding.csv')
test_topic = pd.read_csv('../MachineHackE-Commerce/test_topic.csv')
test_tfidf = pd.read_csv('../MachineHackE-Commerce/test_tfidf.csv')
test_enc = pd.read_csv('../MachineHackE-Commerce/test_enc.csv')

In [6]:
train_enc.columns

Index(['Product', 'Product_Brand', 'Item_Category', 'Subcategory_1',
       'Subcategory_2', 'Item_Rating', 'Selling_Price', 'Is_Holiday', 'Year',
       'Month', 'Week', 'Day', 'Dayofweek', 'Is_month_end', 'Is_month_start',
       'Is_quarter_end', 'Is_quarter_start'],
      dtype='object')

In [7]:
train_enc1 = train_enc.drop(['Product', 'Product_Brand', 'Item_Category', 'Subcategory_1',
       'Subcategory_2','Selling_Price'],axis=1)*1
test_enc1 = test_enc.drop(['Product', 'Product_Brand', 'Item_Category', 'Subcategory_1',
       'Subcategory_2'],axis=1)*1

In [8]:
def f_y(s):
    return s/1000

In [9]:
temptrain = f_y(train_enc1['Year'])
temptest = f_y(test_enc1['Year'])

In [10]:
train_enc1['Year'] = temptrain
test_enc1['Year'] = temptest

In [11]:
def f(s):
    return s/s.max()

In [12]:
train_enc1.columns

Index(['Item_Rating', 'Is_Holiday', 'Year', 'Month', 'Week', 'Day',
       'Dayofweek', 'Is_month_end', 'Is_month_start', 'Is_quarter_end',
       'Is_quarter_start'],
      dtype='object')

In [13]:
train_enc1['Month'] = train_enc1[['Month']].apply(f)
test_enc1['Month'] = test_enc1[['Month']].apply(f)


train_enc1['Week'] = train_enc1[['Week']].apply(f)
test_enc1['Week'] = test_enc1[['Week']].apply(f)

train_enc1['Day'] = train_enc1[['Day']].apply(f)
test_enc1['Day'] = test_enc1[['Day']].apply(f)

train_enc1['Dayofweek'] = train_enc1[['Dayofweek']].apply(f)
test_enc1['Dayofweek'] = test_enc1[['Dayofweek']].apply(f)


In [14]:
train_enc1.head()

Unnamed: 0,Item_Rating,Is_Holiday,Year,Month,Week,Day,Dayofweek,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start
0,4.3,0,2.017,0.166667,0.09434,0.096774,0.666667,0,0,0,0
1,3.1,0,2.015,0.583333,0.509434,0.032258,0.333333,0,1,0,1
2,3.5,1,2.019,0.083333,0.037736,0.387097,0.833333,0,0,0,0
3,4.0,0,2.014,1.0,0.943396,0.387097,0.666667,0,0,0,0
4,4.3,0,2.013,1.0,0.943396,0.387097,0.5,0,0,0,0


In [15]:
encoded = pd.concat([train_enc1,test_enc1]).values
encoded.shape

(3503, 11)

In [16]:
train_embedding.shape
embedding = pd.concat([train_embedding,test_embedding]).values
embedding.shape

(3503, 768)

In [17]:
train_topic.shape
topic = pd.concat([train_topic,test_topic]).values
topic.shape

(3503, 15)

In [18]:
train_tfidf.shape
tfidf = pd.concat([train_tfidf,test_tfidf]).values
tfidf.shape

(3503, 100)

In [19]:
NUM_BRANDS = 790
NUM_CATEGORIES = 1001
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 3

In [20]:
nrow_train = train.shape[0]
#y = np.log1p(train["Selling_Price"])
y = train["Selling_Price"]
dataset = pd.concat([train, test])
      

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [21]:
dataset['Item_Category'] = dataset['Item_Category'].astype('category')
dataset['Product_Brand'] = dataset['Product_Brand'].astype('category')
dataset['Subcategory_1'] = dataset['Subcategory_1'].astype('category')
dataset['Subcategory_2'] = dataset['Subcategory_2'].astype('category')

dataset['text'] = dataset.apply(lambda row: ' '.join ([str(row['Item_Category']), str(row['Subcategory_1']),str(row['Subcategory_2'])]), 
                                          axis=1)



In [22]:
cv = CountVectorizer(min_df=NAME_MIN_DF)
X_Item_Category = cv.fit_transform(dataset['Item_Category'])
    

In [23]:
cv = CountVectorizer()
X_category1 = cv.fit_transform(dataset['Subcategory_1'])
    

In [24]:
cv2 = CountVectorizer()
X_category2 = cv.fit_transform(dataset['Subcategory_2'])
    

In [25]:
tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                         ngram_range=(1, 3),
                         stop_words='english')
X_description = tv.fit_transform(dataset['text'])
    

In [26]:
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(dataset['Product_Brand'])
    

In [27]:
dataset['Product'] = dataset['Product'].str.replace(r'\D', '')

dataset['Product'] = pd.to_numeric(dataset['Product'], errors='coerce')


In [28]:
X_product = f(dataset['Product'])

In [29]:
X_prod=np.array(X_product).reshape(-1,1)

In [30]:
X_description.shape
X_category1.shape
X_category2.shape
X_Item_Category.shape
X_prod.shape

(3503, 1)

In [31]:
sparse_merge = hstack((X_description, X_brand, X_category1,X_category2, X_Item_Category , X_prod)).tocsr()
X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_train:]
   

In [32]:
sparse_merge1 = hstack((X_description, X_brand, X_category1,X_category2, X_Item_Category , X_prod,encoded)).tocsr()
X1 = sparse_merge1[:nrow_train]
X_test1 = sparse_merge1[nrow_train:]
   

In [33]:
sparse_merge2 = hstack((X_prod, X_brand,encoded,topic)).tocsr()
X2 = sparse_merge2[:nrow_train]
X_test2 = sparse_merge2[nrow_train:]
   

In [34]:
sparse_merge3 = hstack((X_prod, X_brand,encoded,embedding)).tocsr()
X3 = sparse_merge3[:nrow_train]
X_test3 = sparse_merge3[nrow_train:]
   

In [35]:
sparse_merge4 = hstack((X_prod, X_brand,encoded,tfidf)).tocsr()
X4 = sparse_merge4[:nrow_train]
X_test4 = sparse_merge4[nrow_train:]
   

In [36]:
sparse_merge5 = hstack((X_prod, X_brand,encoded,X_description)).tocsr()
X5 = sparse_merge5[:nrow_train]
X_test5 = sparse_merge5[nrow_train:]
   

In [37]:
sparse_merge6 = hstack((X_prod, X_brand,encoded,X_description,topic)).tocsr()
X6 = sparse_merge6[:nrow_train]
X_test6 = sparse_merge6[nrow_train:]
   

In [38]:
sparse_merge7 = hstack((X_description, X_brand, X_category1,X_category2, X_Item_Category , X_prod,encoded,topic)).tocsr()
X7 = sparse_merge7[:nrow_train]
X_test7 = sparse_merge7[nrow_train:]
   

In [39]:
def rmsle_cv(model,X,y):
    n_folds = 5
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X)
    rmsle= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_log_error", cv = kf))
    print(rmsle.mean())
    return(rmsle)

In [122]:
def rmsle_cv_ensemble(model,X,y):
    n_folds = 5
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X)
    
    y_pred = cross_val_predict(model, X, np.log1p(y), cv=kf)
    if (y_pred<0).sum() :
            print("negtive {0} value predicted".format( (y_pred<0).sum())) 
          
        
    y_pred_inverse = np.expm1(y_pred)
    
    
    score = np.sqrt(mean_squared_log_error(y, y_pred))
    
    return(score)

In [41]:
def rmsle(y, y_pred):
        return np.sqrt(mean_squared_log_error(y, y_pred))

In [42]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import SGDRegressor
from sklearn import linear_model

In [62]:
mlp = MLPRegressor(hidden_layer_sizes=(100, ), activation='relu', solver='adam',
                   alpha=0.0001, batch_size='auto', learning_rate='constant', 
                   learning_rate_init=0.001, power_t=0.5, max_iter=1000, shuffle=True, 
                   random_state=42, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                   nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, 
                   beta_1=0.9, beta_2=0.999,
                   epsilon=1e-08, n_iter_no_change=10, max_fun=15000)

In [44]:
# ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])
# ereg = ereg.fit(X, y)

In [45]:
par = PassiveAggressiveRegressor(max_iter=100, random_state=0,tol=1e-3) 

In [46]:
#  C=1.0, fit_intercept=True, max_iter=1000, tol=0.001, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss='epsilon_insensitive', epsilon=0.1, random_state=None, warm_start=False, average=False)

In [80]:
br = linear_model.BayesianRidge()
ridge = Ridge(solver="sag", fit_intercept=True, random_state=205)


In [48]:
sgd = SGDRegressor(loss='huber',
             penalty='l2', alpha=0.001, l1_ratio=0.15, fit_intercept=True, max_iter=5000, tol=0.001, shuffle=True, 
             verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, 
             early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False)

In [49]:
GBoost = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05,
                                   max_depth=8, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)


In [50]:
model_rf = RandomForestRegressor(random_state=2)


In [99]:
model_lgb= lgb.LGBMRegressor(learning_rate= 0.75,application='regression',max_depth= 3,num_leaves=100,
                             verbosity= -1,metric='RMSE')

In [127]:
rmsle_cv_ensemble(model_rf,X,y)

4.754042669809545

In [128]:
rmsle_cv_ensemble(model_rf,X1,y)

4.754482886361403

In [129]:
rmsle_cv_ensemble(model_rf,X2,y)

4.75422626168464

In [None]:
rmsle_cv_ensemble(model_rf,X3,y)

In [130]:
rmsle_cv_ensemble(model_rf,X4,y)

4.75443478526592

In [None]:
rmsle_cv_ensemble(model_rf,X5,y)

In [158]:
rmsle_cv_ensemble(mlp,X,y)

4.774995808719973

In [151]:
rmsle_cv_ensemble(mlp,X1,y)

4.751442688315458

In [152]:
rmsle_cv_ensemble(model_lgb,X2,y)

4.750648198174103

In [153]:
rmsle_cv_ensemble(model_lgb,X3,y)

4.751642637327099

In [154]:
rmsle_cv_ensemble(model_lgb,X4,y)

4.749330432611566

In [155]:
rmsle_cv_ensemble(model_lgb,X5,y)

4.75348754064334

In [156]:
rmsle_cv_ensemble(model_lgb,X6,y)

4.751096882793406

In [157]:
rmsle_cv_ensemble(model_lgb,X7,y)

4.750617662996885

In [162]:
et = ExtraTreesRegressor(random_state=2)

In [163]:
rmsle_cv_ensemble(et,X7,y)

4.755347689372286

In [164]:
xgb = XGBRegressor(random_state =2)

In [165]:
rmsle_cv_ensemble(xgb,X7,y)

  if getattr(data, 'base', None) is not None and \




4.752155487088109

In [None]:
model_rf.fit(train, np.log1p(y_train))
model_rf_train_pred = np.expm1(model_rf.predict(train))

#model_rf_pred = np.expm1(model_rf.predict(test))
print(rmsle(y_train, model_rf_train_pred))

In [None]:
(model_rf_train_pred<0).sum()

In [None]:
model_lgb.fit(train, np.log1p(y_train))
model_lgb_train_pred = np.expm1(model_lgb.predict(train))

#model_lgb_pred = np.expm1(model_lgb.predict(test))
print(rmsle(y_train, model_lgb_train_pred))

In [None]:
model_lgb_train_pred

In [None]:
GBoost.fit(train, np.log1p(y_train))
GBoost_train_pred = np.expm1(GBoost.predict(train))

#GBoost_pred = np.expm1(GBoost.predict(test))
print(rmsle(y_train, model_lgb_train_pred))

In [None]:

print('RMSLE score on train data:')
print(rmsle(y_train, model_rf_train_pred*0.40 + model_avg2_train_pred*0.60 ))

In [None]:
ensemble =  model_avg2_pred*0.60 + model_rf_pred*0.40

In [None]:
model1 = lgb.train(params, train_set=d_train, num_boost_round=3200, verbose_eval=100) 
model2 = Ridge(solver="sag", fit_intercept=True, random_state=205)


In [None]:
d_train = lgb.Dataset(X, label=y)
#d_valid = lgb.Dataset(valid_X, label=valid_y, max_bin=8192)
#watchlist = [d_train, d_valid]
    
params = {
        'learning_rate': 0.75,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 100,
        'verbosity': -1,
        'metric': 'RMSE',
}


#model = lgb.train(params, train_set=d_train, num_boost_round=3200, verbose_eval=100) 
#preds = 0.57*model.predict(X_test)


model = Ridge(solver="sag", fit_intercept=True, random_state=205)
model.fit(X, y)
 
preds =*model.predict(X=X_test)    
#preds += 0.43*model.predict(X=X_test)

In [None]:
sub['Selling_Price'] = np.expm1(preds)

sub.to_excel("submission_lgbm_ridge.xlsx", index=False)


https://mlwave.com/kaggle-ensembling-guide/

In [None]:
preds