# Import Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e8/sample_submission.csv
/kaggle/input/playground-series-s3e8/train.csv
/kaggle/input/playground-series-s3e8/test.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Training and Test Data

In [3]:
train_data = pd.read_csv("/kaggle/input/playground-series-s3e8/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s3e8/test.csv")
sample_submission = pd.read_csv("/kaggle/input/playground-series-s3e8/sample_submission.csv")

display(train_data.head())
display(test_data.head())
display(sample_submission.head())

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,193573,0.35,Ideal,D,VS2,62.3,56.0,4.51,4.54,2.82
1,193574,0.77,Very Good,F,SI2,62.8,56.0,5.83,5.87,3.68
2,193575,0.71,Ideal,I,VS2,61.9,53.0,5.77,5.74,3.55
3,193576,0.33,Ideal,G,VVS2,61.6,55.0,4.44,4.42,2.73
4,193577,1.2,Very Good,I,VS2,62.7,56.0,6.75,6.79,4.24


Unnamed: 0,id,price
0,193573,3969.155
1,193574,3969.155
2,193575,3969.155
3,193576,3969.155
4,193577,3969.155


# Missing Data

In [4]:
print(f"train_data has this many null cells: {train_data.isnull().sum().sum()}")
print(f"test_data has this many null cells: {test_data.isnull().sum().sum()}")

train_data has this many null cells: 0
test_data has this many null cells: 0


# Create Training Data

In [5]:
X = train_data
y = train_data.price

# Preprocess Data

In [6]:
def preprocess(data):

    
    # Credit to Sujay Kapadnis
    replacement = {'cut':{'Fair':0, 'Good':1, 'Very Good':2, 'Premium':3, "Ideal":4},
                    'color':{'J':0,'I':1,'H':2, 'G':3, 'F':4, 'E':5, "D":6},
                    'clarity':{'FL':10, 'IF':9, 'VVS1':8, 'VVS2':7, 'VS1':6, 'VS2':5, 
                    'SI1':4, 'SI2':3, 'I1':2, 'I2':1, 'I3':0}}
    
    if('id' in data.columns):
        data.drop('id', axis=1, inplace=True)
    if('price' in data.columns):
        data.drop('price', axis=1, inplace=True)
        
    data=data.replace(replacement)
    
    display(data.head())
    
    return data


X = preprocess(X)

display(X.head())

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,3,4,5,62.2,58.0,7.27,7.33,4.55
1,2.03,2,0,3,62.0,58.0,8.06,8.12,5.05
2,0.7,4,3,6,61.2,57.0,5.69,5.73,3.5
3,0.32,4,3,6,61.6,56.0,4.38,4.41,2.71
4,1.7,3,3,5,62.6,59.0,7.65,7.61,4.77


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,3,4,5,62.2,58.0,7.27,7.33,4.55
1,2.03,2,0,3,62.0,58.0,8.06,8.12,5.05
2,0.7,4,3,6,61.2,57.0,5.69,5.73,3.5
3,0.32,4,3,6,61.6,56.0,4.38,4.41,2.71
4,1.7,3,3,5,62.6,59.0,7.65,7.61,4.77


# Split Data To Training Data 

In [7]:
train_X, val_X, train_y, val_y = train_test_split(X, y)

# Tune Model

In [8]:
# Test different number of trees
highest_accuracy=1000000000
best_model=0
optimal_num_trees=-1

for num_trees in [5, 10, 50, 75, 100]:
    model = RandomForestRegressor(n_estimators=num_trees, random_state=1)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    mae = mean_absolute_error(val_y, predictions)
    if(mae<highest_accuracy):
        highest_accuracy=mae
        optimal_num_trees=num_trees
        best_model=model
    print(f'(num_trees) {optimal_num_trees} has accuracy {mae}')



# Test different max_leaf_nodes
# optimal_leaf_nodes=-1

# for max_leaf_nodes in [5, 10, 50, 75, 100]:
#     model = RandomForestRegressor(n_estimators=optimal_num_trees, max_leaf_nodes=max_leaf_nodes, random_state=1)
#     model.fit(train_X, train_y)
#     predictions = model.predict(val_X)
#     mae = mean_absolute_error(val_y, predictions)
#     if(mae<highest_accuracy):
#         highest_accuracy=mae
#         optimal_leaf_nodes=max_leaf_nodes
#         best_model=model
#     print(f'(max leaf nodes) {max_leaf_nodes} has accuracy {mae}')


print(f"Highest accurate: {highest_accuracy}")
print(f"Optimal number of trees: {optimal_num_trees}")
# print(f"Optimal number of leaves: {optimal_leaf_nodes}")

(num_trees) 5 has accuracy 338.68823351773244
(num_trees) 10 has accuracy 324.96823199747314
(num_trees) 50 has accuracy 312.81902373277285
(num_trees) 75 has accuracy 312.2063564105525
(num_trees) 100 has accuracy 311.5301814860789
Highest accurate: 311.5301814860789
Optimal number of trees: 100


# Create And Train Final Model

In [9]:
final_model = RandomForestRegressor(n_estimators=optimal_num_trees, random_state=1);
final_model.fit(X, y)

RandomForestRegressor(random_state=1)

# Predictions

In [10]:
to_test = test_data.copy()
to_test = preprocess(to_test)
predictions = final_model.predict(to_test)

to_test.head()
test_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.35,4,6,5,62.3,56.0,4.51,4.54,2.82
1,0.77,2,4,3,62.8,56.0,5.83,5.87,3.68
2,0.71,4,1,5,61.9,53.0,5.77,5.74,3.55
3,0.33,4,3,7,61.6,55.0,4.44,4.42,2.73
4,1.2,2,1,5,62.7,56.0,6.75,6.79,4.24


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,193573,0.35,Ideal,D,VS2,62.3,56.0,4.51,4.54,2.82
1,193574,0.77,Very Good,F,SI2,62.8,56.0,5.83,5.87,3.68
2,193575,0.71,Ideal,I,VS2,61.9,53.0,5.77,5.74,3.55
3,193576,0.33,Ideal,G,VVS2,61.6,55.0,4.44,4.42,2.73
4,193577,1.2,Very Good,I,VS2,62.7,56.0,6.75,6.79,4.24


# Submission

In [11]:
output = pd.DataFrame({'id': test_data.id, 'price': predictions})

output.head()

output.to_csv('submission.csv', index=False)

# output = sample_submission.copy()
# output['price'] = predictions
# output.to_csv('submission.csv', index=False)

# output.head()