In [1]:
import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv


# Improvements from Baseline
- Remove high-cardinality columns
- Adjust the `train_test_split` ratio
- Increase `n_estimators`

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error

In [3]:
# Load the competition's training and test CSVs into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/30-days-of-ml/train.csv',
        '/kaggle/input/30-days-of-ml/test.csv',
    )
)

In [4]:
# Preview the first five rows of the training data
train_data.head(5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [5]:
# Show (rows, columns) for the training and test frames, in that order
tuple(frame.shape for frame in (train_data, test_data))

((300000, 26), (200000, 25))

In [6]:
# Show the column labels of the training and test frames side by side
(
    train_data.columns,
    test_data.columns,
)

(Index(['id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
        'cat8', 'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5',
        'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
        'cont13', 'target'],
       dtype='object'),
 Index(['id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
        'cat8', 'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5',
        'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
        'cont13'],
       dtype='object'))

In [7]:
# Work on copies so the frames loaded from disk stay unmodified
train_full, test_full = train_data.copy(), test_data.copy()

In [8]:
# Count the distinct values of every object-dtype column in the training
# frame, to identify high-cardinality features that are candidates for removal
cardinality = {
    name: len(train_full[name].unique())
    for name in train_full.columns
    if train_full[name].dtypes == 'object'  # only object columns are of interest
}

cardinality



{'cat0': 2,
 'cat1': 2,
 'cat2': 2,
 'cat3': 4,
 'cat4': 4,
 'cat5': 4,
 'cat6': 8,
 'cat7': 8,
 'cat8': 7,
 'cat9': 15}

In [9]:
# Drop cat9 from both frames: with 15 distinct values it has the highest
# cardinality of the categorical columns.
# The frames are reassigned (no inplace=True) and errors='ignore' is passed so
# that this cell is idempotent: running it a second time no longer raises
# KeyError because cat9 is already gone.
train_full = train_full.drop(columns='cat9', errors='ignore')
test_full = test_full.drop(columns='cat9', errors='ignore')

In [10]:
# Separate the feature matrix (everything except 'target') from the target vector
# NOTE(review): the 'id' column stays in X, so the model is trained on the row
# id as a feature; confirm this is intended
X = train_full.drop(columns=['target'])
y = train_full['target']

In [11]:
# Split into a 60% training set and a validation set made of the remaining
# rows; the fixed random_state makes the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=0,
)

In [12]:
# Shapes of the four split outputs, in the order: X_train, X_valid, y_train, y_valid
tuple(part.shape for part in (X_train, X_valid, y_train, y_valid))

((180000, 24), (120000, 24), (180000,), (120000,))

In [13]:
# Collect the names of the object-dtype columns, in their original order
category_cols = []
for name in X_train.columns:
    if X_train[name].dtype == 'object':
        category_cols.append(name)
category_cols

['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8']

In [14]:
# Ordinal-encode the categorical columns. The encoder is fitted on the training
# split only and then reused for the validation and test frames.
enc = OrdinalEncoder()

# Encode into copies so X_train / X_valid keep their original string labels
cat_X_train = X_train.copy()
cat_X_valid = X_valid.copy()

cat_X_train[category_cols] = enc.fit_transform(X_train[category_cols])
cat_X_valid[category_cols] = enc.transform(X_valid[category_cols])

# test_full is overwritten with the encoded values, so read the labels from the
# untouched test_data frame (test_full is a copy of it with the same rows).
# Encoding from test_full itself would make a second run of this cell pass
# already-encoded numbers to the encoder instead of the original labels.
test_full[category_cols] = enc.transform(test_data[category_cols])

In [15]:
# Fit a 500-tree random forest on the encoded training split and evaluate it
# on the validation split.
# n_jobs=-1 builds the trees (and predicts) on all available cores instead of
# one; random_state is fixed so the fitted forest stays reproducible.
model = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)

model.fit(cat_X_train, y_train)
y_predict = model.predict(cat_X_valid)

# Validation score: mean squared error (not its square root)
score = mean_squared_error(y_valid, y_predict)
score

0.5372208599259335

In [16]:
# Predict targets for the encoded test frame with the fitted model
test_predictions = model.predict(X=test_full)

In [17]:
# Build the submission frame: each test id paired with its predicted target
# NOTE(review): the data uses lowercase 'id' / 'target'; confirm that the
# 'Id' / 'Target' headers match what sample_submission.csv expects
submission_columns = {
    'Id': test_full['id'],
    'Target': test_predictions,
}
output = pd.DataFrame(submission_columns)

In [18]:
# Write the submission to the working directory without the DataFrame index
output.to_csv(path_or_buf='my_submission.csv', index=False)