In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('train.csv')
print(df.shape)
df.head(5)

(6700, 16)


Unnamed: 0,id,departure city,discount,price,no of items,location,class,segment,sub-class,delivery type,RID,profit,delivery date,address code,departure state,placement date
0,1,Houston,0.2,16.448,2,Central,kariox,Consumer,phone,Standard Class,7981,5.5512,2018-01-01,77095,Texas,2018-01-01
1,2,Westland,0.0,29.7,5,Central,kariox,Consumer,headset,Standard Class,6334,13.365,2018-01-01,48185,Michigan,2018-01-01
2,3,Westland,0.0,14.73,3,Central,qexty,Consumer,shorts,Standard Class,6333,4.8609,2018-01-01,48185,Michigan,2018-01-01
3,4,Westland,0.0,43.92,3,Central,kariox,Consumer,television,Standard Class,6332,12.7368,2018-01-01,48185,Michigan,2018-01-01
4,5,Westland,0.0,66.58,2,Central,kariox,Consumer,laptop,Standard Class,6331,15.9792,2018-01-01,48185,Michigan,2018-01-01


# preprocessing

In [3]:
def preprocessing(df):
    
    dff = df[['discount', 'no of items', 'price', 'sub-class', 'delivery type', 'profit']]
    
    dff.dropna(inplace=True)

    dff['ppi'] = dff['price']/dff['no of items']


    final_df = pd.get_dummies(dff, columns=['sub-class', 'delivery type'])
    return final_df

In [4]:
train_df = preprocessing(df)
y = train_df['profit']
X = train_df.drop(columns=['profit', 'price'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [6]:
#import all necessary libraries

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
import catboost
from catboost import CatBoostRegressor

In [10]:
scaler = MinMaxScaler(feature_range=(-1,1))
Xsc = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(Xsc, y, random_state=42)

# Hyperparameters tuning

In [33]:
model_params = {
    
    'etr' : {
        'model': ExtraTreesRegressor(),
        'params': {
            'n_estimators': [100, 500, 800, 1000, 1500],
            'max_features': ['sqrt', 'log2'],
        }
    }
}

In [34]:
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(Xsc, y)
    
    scores.append({
        'model': model_name,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_
    })
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,etr,0.672765,"{'max_features': 'log2', 'n_estimators': 1500}"


In [None]:
reg = GridSearchCV(ExtraTreesRegressor(), {
    'n_estimators': [100, 500, 800],
    'max_features': ['sqrt'],
    'criterion': ['mae', 'mse']
}, cv=5, return_train_score=False)

reg.fit(Xsc,y)
print(pd.DataFrame(reg.cv_results_)[['params','mean_test_score']].sort_values(by=['mean_test_score'], ascending=False).head(3))
best_params_rf = reg.best_params_
print(best_params_rf)

# training model

In [26]:
model_info = []

In [27]:
def train_model(model, model_name):
    reg = model.fit(X_train, y_train)
    
#     cv_score = cross_val_score(reg, X_train, y_train, cv=10, scoring='r2')
    
    print(f'model info for {model_name.upper()} regressor : ')
    print('---------------------------------------------------------------------')

#     print(f'Cross-validation : {cv_score}')
#     print(f'mean : {np.mean(cv_score)}')
#     print(f'median : {np.median(cv_score)}')
#     print(f'std : {np.std(cv_score)}\n')
    print(f'train score : {reg.score(X_train, y_train)}')
    print(f'test score : {reg.score(X_test, y_test)}')
    
    model_info.append({
        'model': model_name,
#         'mean_cv_score': np.mean(cv_score),
#         'median': np.median(cv_score),
        'train_score': reg.score(X_train, y_train),
        'test_score': reg.score(X_test, y_test)
    })

In [28]:
train_model(CatBoostRegressor(logging_level="Silent", objective="MAE", eta=0.1, n_estimators=4500, subsample=1), 'catboost')

model info for CATBOOST regressor : 
---------------------------------------------------------------------
train score : 0.977545401969897
test score : 0.688137745722171


In [35]:
train_model(ExtraTreesRegressor(n_estimators = 1500, max_features ='log2'), 'extraTrees')

model info for EXTRATREES regressor : 
---------------------------------------------------------------------
train score : 0.9999126091262234
test score : 0.3565125977049467


In [36]:
pd.DataFrame(model_info)

Unnamed: 0,model,train_score,test_score
0,catboost,0.977545,0.688138
1,extraTrees,0.411323,0.033597
2,extraTrees,0.999913,0.351938
3,extraTrees,0.999913,0.356513


In [37]:
etr = ExtraTreesRegressor(n_estimators = 1500, max_features ='log2').fit(Xsc, y)

In [38]:
cbr = CatBoostRegressor(logging_level="Silent", objective="MAE", eta=0.1, n_estimators=4500, subsample=1).fit(Xsc, y)

## evaluate model by test data

In [97]:
test_data = pd.read_csv('test.csv')

In [98]:
dff = test_data[['id', 'discount', 'no of items', 'price', 'sub-class', 'delivery type']]
    
dff.dropna(inplace=True)

dff['ppi'] = dff['price']/dff['no of items']
dff.drop(columns='price', inplace=True)

test_df = pd.get_dummies(dff, columns=['sub-class', 'delivery type'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [99]:
test_df.columns

Index(['id', 'discount', 'no of items', 'ppi', 'sub-class_battery',
       'sub-class_charger', 'sub-class_chocolates', 'sub-class_colddrinks',
       'sub-class_fastfood', 'sub-class_headset', 'sub-class_hoodies',
       'sub-class_laptop', 'sub-class_lighting', 'sub-class_pants',
       'sub-class_phone', 'sub-class_shorts', 'sub-class_sweets',
       'sub-class_tablet', 'sub-class_television', 'sub-class_tshirts',
       'sub-class_watch', 'delivery type_First Class',
       'delivery type_Same Day', 'delivery type_Second Class',
       'delivery type_Standard Class'],
      dtype='object')

In [100]:
X.columns

Index(['discount', 'no of items', 'ppi', 'sub-class_battery',
       'sub-class_charger', 'sub-class_chocolates', 'sub-class_colddrinks',
       'sub-class_fastfood', 'sub-class_headset', 'sub-class_hoodies',
       'sub-class_laptop', 'sub-class_lighting', 'sub-class_pants',
       'sub-class_phone', 'sub-class_shorts', 'sub-class_sweets',
       'sub-class_tablet', 'sub-class_television', 'sub-class_tshirts',
       'sub-class_watch', 'delivery type_First Class',
       'delivery type_Same Day', 'delivery type_Second Class',
       'delivery type_Standard Class'],
      dtype='object')

In [101]:
id = test_data.id
test_df.drop(columns=['id'], inplace=True)

In [102]:
print(test_df.shape)
print(Xsc.shape)

(3294, 24)
(6699, 24)


In [103]:
test_df = scaler.transform(test_df)

In [105]:
etr_output = etr.predict(test_df)

etr_op = pd.DataFrame({
    'id' : id,
    'profit' : etr_output
})

In [107]:
cbr_output = cbr.predict(test_df)

cbr_op = pd.DataFrame({
    'id' : id,
    'profit' :cbr_output
})

In [111]:
cbr_op

Unnamed: 0,id,profit
0,6701,7.252356
1,6702,34.101437
2,6703,1.123307
3,6704,3.901357
4,6705,4.983583
...,...,...
3289,9990,7.180028
3290,9991,9.333608
3291,9992,-4.935844
3292,9993,4.935465


In [116]:
final_output = pd.DataFrame({
    'id': id,
    'profit': (etr_op['profit'] + cbr_op['profit'])/2
})

In [118]:
final_output.to_csv('submission_01.csv', index=False)

In [119]:
final_output

Unnamed: 0,id,profit
0,6701,7.341870
1,6702,30.783865
2,6703,1.406454
3,6704,4.590744
4,6705,6.135891
...,...,...
3289,9990,9.812805
3290,9991,9.356704
3291,9992,1.512493
3292,9993,6.871001
