Import required libraries.

In [74]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

1. Prepare the data.

In [75]:
# Load the three regional datasets; one DataFrame per CSV, in file order.
region_0, region_1, region_2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/geo_data_0.csv',
        'datasets/geo_data_1.csv',
        'datasets/geo_data_2.csv',
    )
)

In [76]:
# Preview the first five rows of region_0 to see its columns and value scales.
region_0.head(n=5)

Unnamed: 0,id,f0,f1,f2,product
0,txEyH,0.705745,-0.497823,1.22117,105.280062
1,2acmU,1.334711,-0.340164,4.36508,73.03775
2,409Wp,1.022732,0.15199,1.419926,85.265647
3,iJLyR,-0.032172,0.139033,2.978566,168.620776
4,Xdl7t,1.988431,0.155413,4.751769,154.036647


In [77]:
# Print region_0's column dtypes and non-null counts to check for missing values.
region_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB


`region_0` has no missing values: every column contains 100,000 non-null entries.

**Feature Selection:**

The models used below require numerical input, so I will drop the `id` column, which has the `object` dtype. As an identifier, it also carries no information useful for training or prediction.

In [78]:
# Remove the non-numeric `id` column from every region.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
region_0 = region_0.drop(columns='id', errors='ignore')
region_1 = region_1.drop(columns='id', errors='ignore')
region_2 = region_2.drop(columns='id', errors='ignore')

2. Train and test the model for each region.

In [79]:
def data_splitter(data, target='product', test_size=0.25, random_state=0):
    """Split a table into training/validation features and targets.

    The whole frame is split once with ``train_test_split`` and the target
    column is then separated from the remaining (feature) columns in each part.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns together with the target column.
    target : str, default 'product'
        Name of the column to predict; every other column is a feature.
    test_size : float or int, default 0.25
        Size of the validation part, forwarded to ``train_test_split``.
    random_state : int, default 0
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(features_train, features_valid, target_train, target_valid)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    data_train, data_valid = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    return (
        data_train.drop(columns=[target]),
        data_valid.drop(columns=[target]),
        data_train[target],
        data_valid[target],
    )

In [80]:
def decisiontree(features_train, features_valid, target_train, target_valid,
                 depths=range(5, 11), random_state=0):
    """Fit decision trees of several depths and print validation results.

    For every ``max_depth`` in ``depths`` a ``DecisionTreeRegressor`` is fitted
    on the training data and used to predict the validation features. Two
    lines are printed per depth: the validation RMSE and the mean prediction.

    Parameters
    ----------
    features_train, target_train
        Training features and target passed to ``fit``.
    features_valid, target_valid
        Validation features passed to ``predict`` and the true values the
        predictions are scored against.
    depths : iterable of int, default range(5, 11)
        ``max_depth`` values to try, in order.
    random_state : int, default 0
        Seed passed to every tree.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    print("Decision Tree Regressor:")
    for depth in depths:
        dt_model = DecisionTreeRegressor(random_state=random_state, max_depth=depth)
        dt_model.fit(features_train, target_train)
        dt_predictions = dt_model.predict(features_valid)
        # RMSE is the square root of the mean squared error.
        dt_rmse = mean_squared_error(target_valid, dt_predictions) ** 0.5
        print('Max_depth =', depth, ':', dt_rmse)
        print('Average volume:', dt_predictions.mean())

In [81]:
def randomforest(features_train, features_valid, target_train, target_valid,
                 n_estimators_grid=range(80, 101, 10), max_depth=10, random_state=0):
    """Fit random forests of several sizes and print validation results.

    For every value in ``n_estimators_grid`` a ``RandomForestRegressor`` is
    fitted on the training data and used to predict the validation features.
    Two lines are printed per forest: the validation RMSE and the mean
    prediction.

    Parameters
    ----------
    features_train, target_train
        Training features and target passed to ``fit``.
    features_valid, target_valid
        Validation features passed to ``predict`` and the true values the
        predictions are scored against.
    n_estimators_grid : iterable of int, default range(80, 101, 10)
        Numbers of trees to try, in order.
    max_depth : int, default 10
        Depth limit shared by every forest.
    random_state : int, default 0
        Seed passed to every forest.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    print('Random Forest Regressor:')
    for n_trees in n_estimators_grid:
        rf_model = RandomForestRegressor(
            n_estimators=n_trees, max_depth=max_depth, random_state=random_state
        )
        rf_model.fit(features_train, target_train)
        rf_predictions = rf_model.predict(features_valid)
        # RMSE is the square root of the mean squared error.
        rf_rmse = mean_squared_error(target_valid, rf_predictions) ** 0.5
        print('n_estimators =', n_trees, ':', rf_rmse)
        print('Average volume:', rf_predictions.mean())

In [82]:
def linearreg(features_train, features_valid, target_train, target_valid):
    """Fit a linear regression and print its validation results.

    The model is fitted on the training data and used to predict the
    validation features. After the header line, the validation RMSE and the
    mean prediction are printed. Nothing is returned.
    """
    print('Linear Regression:')
    fitted = LinearRegression().fit(features_train, target_train)
    valid_pred = fitted.predict(features_valid)
    # RMSE is the square root of the mean squared error.
    rmse = mean_squared_error(target_valid, valid_pred) ** 0.5
    print(rmse)
    print('Average volume:', valid_pred.mean())

- `region_0`:

In [83]:
# Split region_0 into training and validation features/targets.
(
    features_train_0,
    features_valid_0,
    target_train_0,
    target_valid_0,
) = data_splitter(region_0)

In [84]:
# Check the split sizes: shapes of the train/validation features and targets.
print(*(
    part.shape
    for part in (features_train_0, features_valid_0, target_train_0, target_valid_0)
))

(75000, 3) (25000, 3) (75000,) (25000,)


In [85]:
# Decision-tree depth sweep on region_0: prints RMSE and mean prediction per depth.
decisiontree(
    features_train=features_train_0,
    features_valid=features_valid_0,
    target_train=target_train_0,
    target_valid=target_valid_0,
)

Decision Tree Regressor:
Max_depth = 5 : 37.60704078717036
Average volume: 92.2988191532641
Max_depth = 6 : 37.499165239183526
Average volume: 92.30046852195603
Max_depth = 7 : 37.480349495423376
Average volume: 92.32853767709977
Max_depth = 8 : 37.45254049250312
Average volume: 92.38579048987899
Max_depth = 9 : 37.50970694816511
Average volume: 92.35797155232336
Max_depth = 10 : 37.86207789178569
Average volume: 92.28788804308863


In [86]:
# Random-forest size sweep on region_0: prints RMSE and mean prediction per forest.
randomforest(
    features_train=features_train_0,
    features_valid=features_valid_0,
    target_train=target_train_0,
    target_valid=target_valid_0,
)

Random Forest Regressor:
n_estimators = 80 : 37.07077888433954
Average volume: 92.35406621279171
n_estimators = 90 : 37.06866205269521
Average volume: 92.35106641708886
n_estimators = 100 : 37.068345522546686
Average volume: 92.34708757232663


In [87]:
# Linear-regression baseline on region_0: prints RMSE and mean prediction.
linearreg(
    features_train=features_train_0,
    features_valid=features_valid_0,
    target_train=target_train_0,
    target_valid=target_valid_0,
)

Linear Regression:
37.48100896950594
Average volume: 92.27144852242301


- `region_1`:

In [88]:
# Split region_1 into training and validation features/targets.
(
    features_train_1,
    features_valid_1,
    target_train_1,
    target_valid_1,
) = data_splitter(region_1)

In [89]:
# Check the split sizes: shapes of the train/validation features and targets.
print(*(
    part.shape
    for part in (features_train_1, features_valid_1, target_train_1, target_valid_1)
))

(75000, 3) (25000, 3) (75000,) (25000,)


In [90]:
# Decision-tree depth sweep on region_1: prints RMSE and mean prediction per depth.
decisiontree(
    features_train=features_train_1,
    features_valid=features_valid_1,
    target_train=target_train_1,
    target_valid=target_valid_1,
)

Decision Tree Regressor:
Max_depth = 5 : 0.7234223810500692
Average volume: 69.15169116311066
Max_depth = 6 : 0.7155012207896717
Average volume: 69.15279523815116
Max_depth = 7 : 0.7169296910411338
Average volume: 69.1530554346386
Max_depth = 8 : 0.7234571631887997
Average volume: 69.15171428886366
Max_depth = 9 : 0.7337813559997138
Average volume: 69.15124783456014
Max_depth = 10 : 0.7513219600820122
Average volume: 69.15259478464417


In [91]:
# Random-forest size sweep on region_1: prints RMSE and mean prediction per forest.
randomforest(
    features_train=features_train_1,
    features_valid=features_valid_1,
    target_train=target_train_1,
    target_valid=target_valid_1,
)

Random Forest Regressor:
n_estimators = 80 : 0.7089024882038095
Average volume: 69.15238928896179
n_estimators = 90 : 0.7086979666771084
Average volume: 69.15228150722687
n_estimators = 100 : 0.7087338815839226
Average volume: 69.15209056557245


In [92]:
# Linear-regression baseline on region_1: prints RMSE and mean prediction.
linearreg(
    features_train=features_train_1,
    features_valid=features_valid_1,
    target_train=target_train_1,
    target_valid=target_valid_1,
)

Linear Regression:
0.8872573052219321
Average volume: 69.15162398290752


- `region_2`:

In [93]:
# Split region_2 into training and validation features/targets.
(
    features_train_2,
    features_valid_2,
    target_train_2,
    target_valid_2,
) = data_splitter(region_2)

In [94]:
# Check the split sizes: shapes of the train/validation features and targets.
print(*(
    part.shape
    for part in (features_train_2, features_valid_2, target_train_2, target_valid_2)
))

(75000, 3) (25000, 3) (75000,) (25000,)


In [95]:
# Decision-tree depth sweep on region_2: prints RMSE and mean prediction per depth.
decisiontree(
    features_train=features_train_2,
    features_valid=features_valid_2,
    target_train=target_train_2,
    target_valid=target_valid_2,
)

Decision Tree Regressor:
Max_depth = 5 : 39.258382844928946
Average volume: 94.88567280925298
Max_depth = 6 : 38.602931763812464
Average volume: 94.84114167029453
Max_depth = 7 : 38.384271336669734
Average volume: 94.84932397266535
Max_depth = 8 : 38.40487407183288
Average volume: 94.84316253928151
Max_depth = 9 : 38.46149643569178
Average volume: 94.85223685752071
Max_depth = 10 : 38.60249701701302
Average volume: 94.79669338438578


In [96]:
# Random-forest size sweep on region_2: prints RMSE and mean prediction per forest.
randomforest(
    features_train=features_train_2,
    features_valid=features_valid_2,
    target_train=target_train_2,
    target_valid=target_valid_2,
)

Random Forest Regressor:
n_estimators = 80 : 37.88556336647621
Average volume: 94.80347682301365
n_estimators = 90 : 37.88526122376665
Average volume: 94.80568580378207
n_estimators = 100 : 37.883943464867194
Average volume: 94.79611948193084


In [97]:
# Linear-regression baseline on region_2: prints RMSE and mean prediction.
linearreg(
    features_train=features_train_2,
    features_valid=features_valid_2,
    target_train=target_train_2,
    target_valid=target_valid_2,
)

Linear Regression:
40.31290686044374
Average volume: 94.70753129105672
