In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the training set, the test set and the sample submission, in that order.
train, test, sample_sub = map(
    pd.read_csv,
    (
        'data/diamonds_train.csv',
        'data/diamonds_test.csv',
        'data/sample_submission.csv',
    ),
)

In [3]:
# Integer codes for the three categorical grades (a larger code is listed first
# in each dict: 'Ideal', 'IF' and 'D' receive the highest values).
cut_dict = {
    'Ideal': 5,
    'Premium': 4,
    'Very Good': 3,
    'Good': 2,
    'Fair': 1,
}

clarity_dict = {
    'IF': 8,
    'VVS1': 7,
    'VVS2': 6,
    'VS1': 5,
    'VS2': 4,
    'SI1': 3,
    'SI2': 2,
    'I1': 1,
}

color_dict = {
    'D': 7,
    'E': 6,
    'F': 5,
    'G': 4,
    'H': 3,
    'I': 2,
    'J': 1,
}


def add_grade_features(df):
    """Add the numeric grade codes and the per-carat ratio columns to ``df``.

    The frame is modified in place (and also returned) so that ``train`` and
    ``test`` go through exactly the same code instead of two copy-pasted
    blocks that can drift apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``cut``, ``clarity``, ``color``, ``carat``,
        ``depth`` and ``table``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If a grade label is not present in the corresponding dict.
    """
    # dict.__getitem__ (rather than Series.map(dict)) keeps the strict lookup:
    # an unknown label raises KeyError instead of silently becoming NaN.
    df['num_cut'] = df['cut'].apply(cut_dict.__getitem__)
    df['num_clarity'] = df['clarity'].apply(clarity_dict.__getitem__)
    df['num_color'] = df['color'].apply(color_dict.__getitem__)

    # New column name -> numerator column; each one is divided by carat.
    per_carat = {
        'cut/carat': 'num_cut',
        'clarity/carat': 'num_clarity',
        'color/carat': 'num_color',
        'depth/carat': 'depth',
        'table/carat': 'table',
    }
    for new_col, numerator in per_carat.items():
        df[new_col] = df[numerator] / df['carat']
    return df


add_grade_features(train)
add_grade_features(test)

In [4]:
def filler_xyz(carat,depth,x,y,z):
    """Replace zero-valued dimensions of one diamond with estimates.

    A value of 0 in ``x``, ``y`` or ``z`` is treated as missing. The estimates
    rely on the two relations this function has always used:

    * ``z = depth * (x + y) / 200``  (depth as a percentage of the mean of x and y)
    * ``carat = 0.006 * x * y * z``  (only used when all three are missing,
      together with the assumption ``x == y``)

    Parameters
    ----------
    carat, depth : float
        Weight and depth percentage of the stone.
    x, y, z : float
        Recorded dimensions; 0 marks a missing value.

    Returns
    -------
    tuple
        ``(x, y, z)`` with the zeros that can be estimated filled in.
        Non-zero inputs are returned unchanged.
    """
    if x == 0 and y == 0:
        if z == 0:
            # Nothing measured: solve carat = 0.006 * x * x * (depth * x / 100).
            x = y = (carat*100/(0.006*depth))**(1/3)
        elif depth != 0:
            # Only z measured: invert z = depth * (x + y) / 200 with x == y.
            x = y = 100*z/depth
    elif x == 0:
        # One of x / y missing: mirror the other one.
        x = y
    elif y == 0:
        # Previously a lone zero y was left at 0 and, when z was missing too,
        # halved the z estimate below.
        y = x

    if z == 0:
        z = depth*(x+y)/200
    return x,y,z

    
# Repair zero-valued x / y / z in a single row-wise pass. result_type='expand'
# spreads each returned (x, y, z) tuple over three columns, so no temporary
# tuple column (and no three extra apply passes to unpack it) is needed.
_filled = train.apply(
    lambda row: filler_xyz(row['carat'], row['depth'], row['x'], row['y'], row['z']),
    axis=1,
    result_type='expand',
)
# Assign by position; the expanded frame's columns are labelled 0, 1, 2.
train[['x', 'y', 'z']] = _filled.to_numpy()

In [5]:
# Repair zero-valued x / y / z in a single row-wise pass. result_type='expand'
# spreads each returned (x, y, z) tuple over three columns, so no temporary
# tuple column (and no three extra apply passes to unpack it) is needed.
_filled = test.apply(
    lambda row: filler_xyz(row['carat'], row['depth'], row['x'], row['y'], row['z']),
    axis=1,
    result_type='expand',
)
# Assign by position; the expanded frame's columns are labelled 0, 1, 2.
test[['x', 'y', 'z']] = _filled.to_numpy()

In [6]:
target = 'price'

# Mean of the x and y dimensions.
train['circular'] = (train['x'] + train['y']) / 2
test['circular'] = (test['x'] + test['y']) / 2

# x-to-y ratio, with NaN replaced by 0. The filled Series is assigned back to
# the frame: calling fillna(inplace=True) on train['L/W'] is chained
# assignment, which acts on a temporary object (and therefore does nothing)
# when pandas Copy-on-Write is active.
train['L/W'] = (train['x'] / train['y']).fillna(0)
test['L/W'] = (test['x'] / test['y']).fillna(0)

# Carat divided by the product of the three dimensions.
train['density'] = train['carat'] / (train['x'] * train['y'] * train['z'])
test['density'] = test['carat'] / (test['x'] * test['y'] * test['z'])

cat_features = ['cut', 'color', 'clarity']
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z','L/W','circular','density',
               'num_cut', 'num_clarity', 'num_color', 'cut/carat', 'clarity/carat', 'color/carat',
               'depth/carat','table/carat']

# Store the categorical columns with pandas' category dtype in both frames.
for cat_feat in cat_features:
    train[cat_feat] = train[cat_feat].astype('category')
    test[cat_feat] = test[cat_feat].astype('category')
    
# cat_df = pd.get_dummies(train[cat_features])
# num_df = train.loc[:,num_features]
# train_df = pd.concat([cat_df, num_df], axis=1)

# cat_df = pd.get_dummies(test[cat_features])
# num_df = test.loc[:,num_features]
# test_df = pd.concat([cat_df, num_df], axis=1)


# features = list(cat_df.columns) + list(num_df.columns)

In [None]:
# Show the training rows whose 'density' feature is greater than 0.01.
train[train['density']>0.01]

In [None]:
# Histogram of the training 'density' feature using 5 bins.
plt.hist(train['density'], bins=5)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Design matrices: one-hot encoded categoricals followed by the numeric columns,
# built first for train and then for test (cat_df / num_df end up holding the
# test-side frames).
cat_df = pd.get_dummies(train[cat_features])
num_df = train[num_features]
train_df = pd.concat([cat_df, num_df], axis=1)

cat_df = pd.get_dummies(test[cat_features])
num_df = test[num_features]
test_df = pd.concat([cat_df, num_df], axis=1)

# Column names are taken from the frames assigned last, i.e. the test-side ones.
features = [*cat_df.columns, *num_df.columns]

# Keep every column of the training design matrix except 'price'.
data_pca = train_df.loc[:, train_df.columns != 'price']

# Standardise, then fit a PCA with all components on the standardised data.
scaler = StandardScaler()
data_rescaled = scaler.fit_transform(data_pca)

pca = PCA()
pca.fit(data_rescaled)

# Cumulative explained-variance ratio against the number of components.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
fig, ax = plt.subplots()
ax.plot(cumulative_ratio)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Variance (%)')
ax.set_ylim(0, 1)
ax.set_title('CumSum')
plt.show()


In [None]:
# Project the training design matrix onto its first two principal components.
# NOTE(review): this fits on data_pca, not on the standardised data_rescaled
# computed earlier - confirm that the unscaled input is intended.
pca_2d = PCA(n_components=2)
dataset = pd.DataFrame(pca_2d.fit_transform(data_pca))
dataset.info()

In [None]:
# Scatter plot of the two PCA components stored in dataset (column 0 vs column 1).
plt.scatter(dataset[0], dataset[1])

In [None]:
# Flag points that fall outside hand-picked bounds on the two PCA components.
filt1 = dataset[1].gt(8)
filt2 = dataset[0].gt(14)
filt3 = dataset[1].lt(-7.5)

# Index labels of every flagged point; the same labels are removed from train.
outlier_mask = filt1 | filt2 | filt3
dropping = dataset.loc[outlier_mask].index
train.drop(index=dropping, inplace=True)

In [None]:
# Print the DataFrame.info() summary of the training frame.
train.info()

In [None]:
# Rows removed by hand, identified by index label.
# The preview uses .loc with the same labels that are dropped below. An integer
# slice such as train[14708:14709] selects by position, which no longer matches
# the labels once any rows have been removed, so it could show a different row
# from the one actually dropped.
display(train.loc[[14708]])
display(train.loc[[21603]])

train.drop(index=[21603, 14708], inplace=True)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


In [8]:
# Numeric branch: median imputation followed by standardisation.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [9]:
# categorical_transformer = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant', fill_value='missing'))])
# Categorical branch: fill gaps with the constant 'missing', then one-hot encode
# (categories unseen during fit are ignored at transform time).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)


In [10]:
# Route the numeric and categorical column lists to their respective branches.
column_branches = [
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [11]:
# A one-step pipeline holding a StandardScaler.
final_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

In [12]:
# Model inputs: numeric columns first, then the categorical ones.
features = [*num_features, *cat_features]
X = train.loc[:, features]
y = train[target]

In [16]:
from lightgbm import LGBMRegressor

# Full modelling pipeline: column-wise preprocessing, a second scaling step and
# a LightGBM regressor.
# The booster is selected through the wrapper's documented `boosting_type`
# parameter. Passing the alias `boosting='dart'` as an extra keyword leaves
# boosting_type at its default of 'gbdt', so get_params() reports two
# conflicting settings.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', final_transformer),
    ('regressor', LGBMRegressor(boosting_type='dart',
                                n_estimators=1000,
                                max_depth=150,
                                num_leaves=80,
                                n_jobs=-1)),
])
# model.fit(X=X, y=y)
# print("model created!")

from sklearn.model_selection import cross_val_score
# 10-fold cross-validation. The scorer returns negated RMSE values, so the sign
# is flipped before averaging to display a positive mean RMSE.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=10,
    scoring='neg_root_mean_squared_error',
    verbose=2,
)
(-scores).mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  39.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   39.0s remaining:    0.0s


[CV] END .................................................... total time=  49.6s
[CV] END .................................................... total time=  40.5s
[CV] END .................................................... total time=  38.1s
[CV] END .................................................... total time=  46.2s
[CV] END .................................................... total time=  42.4s
[CV] END .................................................... total time=  40.3s
[CV] END .................................................... total time=  43.1s
[CV] END .................................................... total time=  28.8s
[CV] END .................................................... total time=  24.5s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  6.5min finished


523.1096425046878

In [17]:
# Fit on the full training set and predict prices for the test set.
model.fit(X=X, y=y)
y_pred = model.predict(test[features])

# Sanity guard: abort rather than write a submission if any prediction is below 200.
if y_pred.min() < 200:
    raise ValueError(f'price min: {y_pred.min()}')

submission_df = pd.DataFrame({'id': test['id'], 'price': y_pred})
submission_df.to_csv('submission_random_trial.csv', index=False)

# Preview as the last expression so the notebook actually displays it
# (in the middle of the cell the call had no visible effect).
submission_df.head()



In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [None]:
# Grid actually searched: learning rate of the pipeline's 'regressor' step.
param_grid = {'regressor__learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2]}

# Candidate grid over the DART-specific parameters (not passed to the search below).
param_grid_dart = {
    'regressor__drop_rate': [0.1, 0.5, 0.9],
    'regressor__uniform_drop': [True, False],
    'regressor__drop_seed': [2, 4, 6],
    'regressor__max_drop': [20, 50, 80],
    'regressor__skip_drop': [0.1, 0.5, 0.9],
}

# 'gbdt','dart', 'goss'
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=10,
)



In [None]:
from datetime import datetime

# Start timestamp; hour / minute are derived from the same instant.
full = datetime.now()
hour = full.hour
minute = full.minute
print(full)

grid_search.fit(X, y)

# Print the finish time (the start time was previously printed a second time)
# and the elapsed time in real minutes. Dividing a timedelta by 60 yields
# another timedelta, which prints as H:MM:SS and is 60x too small.
finished = datetime.now()
print(finished)
print(f'elapsed time: {(finished - full).total_seconds() / 60:.2f} min')

In [None]:
# Best parameter combination, then its score with the sign flipped
# (the search uses a negated-RMSE scorer), one per line.
print(grid_search.best_params_, -grid_search.best_score_, sep='\n')

In [None]:
from pactools.grid_search import GridSearchCVProgressBar

# Same estimator and grid as above, wrapped in pactools' progress-bar grid search.
gscv = GridSearchCVProgressBar(
    model,
    param_grid=param_grid,
    cv=10,
    verbose=2,
    return_train_score=False,
)

In [None]:
# TODO: needs rework!

def plot_results(index='dar__ordar', columns='dar__ordriv'):
    """Plot the mean CV test score against two grid-searched hyperparameters.

    Results are read from the notebook-level search object ``gscv``. Every
    other searched hyperparameter is fixed at its best value
    (``gscv.best_params_``), so the plot shows how the score varies with the
    two selected parameters only.

    Parameters
    ----------
    index : str
        Hyperparameter name (without the ``'param_'`` prefix) for the x-axis.
    columns : str
        Hyperparameter name (without the ``'param_'`` prefix); one line is
        drawn per value.

    Raises
    ------
    ValueError
        If ``index`` and ``columns`` are the same, or if either is not among
        the searched hyperparameters.
    """
    prefix = 'param_'
    index = prefix + index
    columns = prefix + columns

    # prepare the results into a pandas.DataFrame
    df = pd.DataFrame(gscv.cv_results_)

    searched = [c for c in df.columns if c.startswith(prefix)]

    # Fail with a readable message instead of list.remove's "x not in list".
    if index == columns:
        raise ValueError('index and columns must be two different hyperparameters')
    unknown = [name for name in (index, columns) if name not in searched]
    if unknown:
        available = [c[len(prefix):] for c in searched]
        raise ValueError(
            f'hyperparameter(s) not found in cv_results_: '
            f'{[name[len(prefix):] for name in unknown]}; available: {available}'
        )

    # Remove the other by selecting their best values (from gscv.best_params_)
    for col in searched:
        if col in (index, columns):
            continue
        df = df[df[col] == gscv.best_params_[col[len(prefix):]]]

    # Create pivot tables for easy plotting
    table_mean = df.pivot_table(index=index, columns=columns,
                                values=['mean_test_score'])
    table_std = df.pivot_table(index=index, columns=columns,
                               values=['std_test_score'])

    # plot the pivot tables, with the std across folds as error bars
    plt.figure()
    ax = plt.gca()
    for col_mean, col_std in zip(table_mean.columns, table_std.columns):
        table_mean[col_mean].plot(ax=ax, yerr=table_std[col_std], marker='o',
                                  label=col_mean)
    plt.title('Grid-search results (higher is better)')
    # The plotted quantity is cv_results_['mean_test_score'].
    plt.ylabel('Mean CV test score')
    plt.legend(title=table_mean.columns.names)
    plt.show()


# NOTE(review): the hyperparameter names below ('dar__ordar', 'dar__ordriv',
# 'driver__low_fq', 'driver__low_fq_width') do not appear in param_grid, which
# only lists 'regressor__learning_rate' - confirm the intended names.
plot_results(index='dar__ordar', columns='dar__ordriv')
plot_results(index='driver__low_fq', columns='driver__low_fq_width')

In [None]:
#dart features

# drop_rate, default = 0.1, type = double, aliases: rate_drop, constraints: 0.0 <= drop_rate <= 1.0
# uniform_drop, default = false, type = bool
# drop_seed , default = 4, type = int
# max_drop , default = 50, type = int, <=0 means no limit
# skip_drop , default = 0.5, type = double, constraints: 0.0 <= skip_drop <= 1.0



In [None]:
# hyperopt

#https://www.scikit-yb.org/en/latest/api/model_selection/index.html

# **Submission**

In [None]:
# Predict test prices with the fitted grid search.
y_pred = grid_search.predict(test[features])

# Sanity guard: abort rather than write a submission if any prediction is below 200.
if y_pred.min() < 200:
    raise ValueError(f'price min: {y_pred.min()}')

submission_df = pd.DataFrame({'id': test['id'], 'price': y_pred})
submission_df.to_csv('submission_lGBM_grid.csv', index=False)

# Preview as the last expression so the notebook actually displays it
# (in the middle of the cell the call had no visible effect).
submission_df.head()