In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectFromModel

In [2]:
# Read the training set and preview its first five rows.
df_train = pd.read_csv('train.csv')
df_train.head(5)

Unnamed: 0,Id,spend,X1,X2,X3,X4,X5,X6,X7,X8,...,X121,X122,X123,X124,X125,X126,X127,X128,X129,X130
0,1294,87.78,0.919559,5,0.767277,0.527667,0.663123,5,0.354308,4,...,0.028279,58,-0.277899,5,-1.951704,5,0.604964,2,0.87,4
1,3330,417.54,0.299191,3,1.50512,0.064536,0.152453,2,0.737194,9,...,-1.246903,143,-1.845731,4,1.996854,3,0.940476,4,0.62,7
2,8437,4.74,0.728089,5,0.09153,0.214224,4.479039,6,0.119507,6,...,-0.737271,357,-0.444177,2,-0.878652,4,0.999782,4,0.94,6
3,965,97.27,0.724195,3,0.303376,0.120158,2.27887,4,1.674182,9,...,0.189367,43,-1.042953,6,0.422634,5,0.286419,5,0.38,6
4,1767,30.84,0.433751,3,1.288729,0.151632,1.103588,4,0.022785,4,...,-1.374196,78,1.362921,1,0.684774,3,0.548782,4,0.5,8


In [3]:
# Count distinct Id values; compare with the row count to check uniqueness.
df_train.Id.nunique()

5204

In [4]:
# (rows, columns) of the raw training frame.
df_train.shape

(5204, 132)

In [5]:
# Target is the 'spend' column; features are everything except it and the 'Id' key.
y = df_train['spend']
X = df_train.drop(columns=['Id', 'spend'])

In [6]:
# Distinct dtypes present among the feature columns (object dtype flags non-numeric columns).
X.dtypes.unique()

array([dtype('float64'), dtype('int64'), dtype('O')], dtype=object)

In [7]:
# Positional indices of the object-dtype (non-numeric) feature columns.
# flatnonzero adapts to however many columns X has instead of hard-coding 130.
np.flatnonzero((X.dtypes == 'O').to_numpy())

array([ 50, 108])

In [8]:
# X51 (position 50) is an object-dtype column holding month names; it needs
# one-hot encoding. Selected by name to match the encoding step.
X['X51']

0         Feb
1         May
2         Dec
3         Feb
4       March
        ...  
5199      Aug
5200      Nov
5201     Sept
5202      Dec
5203      Dec
Name: X51, Length: 5204, dtype: object

In [9]:
# Distinct weather categories in X109 (the object-dtype column at position 108);
# it needs one-hot encoding. Selected by name to match the encoding step.
X['X109'].unique()

array(['clear', 'cloudy/misty', 'light rain/snow', 'heavy rain/snow'],
      dtype=object)

In [14]:
# Distinct month labels in X51 (note the mixed abbreviations, e.g. 'March', 'Sept').
X['X51'].unique()

array(['Feb', 'May', 'Dec', 'March', 'Aug', 'July', 'June', 'Sept',
       'April', 'Jan', 'Oct', 'Nov'], dtype=object)

In [10]:
# One-hot encode the two object-dtype columns (X51, X109) and append the
# indicator columns to the remaining features.
enc = OneHotEncoder()
onehot_train = pd.DataFrame(
    enc.fit_transform(X[['X51', 'X109']]).toarray(),
    # pd.concat(axis=1) aligns on the index, so pin the indicator rows to X's
    # own index rather than relying on X having a default RangeIndex.
    index=X.index,
)
X_encoded = pd.concat([X.drop(['X51', 'X109'], axis=1), onehot_train], axis=1)

In [15]:
# Display the encoded feature frame; the trailing integer-labelled columns are the one-hot indicators.
X_encoded

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,6,7,8,9,10,11,12,13,14,15
0,0.919559,5,0.767277,0.527667,0.663123,5,0.354308,4,0.1940,0.548686,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.299191,3,1.505120,0.064536,0.152453,2,0.737194,9,0.4478,0.425385,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.728089,5,0.091530,0.214224,4.479039,6,0.119507,6,0.2836,0.245031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.724195,3,0.303376,0.120158,2.278870,4,1.674182,9,0.1642,0.900082,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.433751,3,1.288729,0.151632,1.103588,4,0.022785,4,0.3284,0.922927,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5199,0.134553,2,0.907356,0.355256,0.300539,5,1.407920,4,0.1343,0.865887,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5200,0.676594,5,1.379912,0.589060,2.860463,3,1.551198,2,0.4627,0.496456,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
5201,0.853983,6,0.094023,0.696752,0.691726,3,0.188240,6,0.1642,0.881448,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
5202,0.480869,5,0.910970,0.025219,0.900026,6,3.051571,4,0.1642,0.440449,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.3,
    random_state=42,
)

In [20]:
# The one-hot columns carry integer labels while the rest are strings;
# convert every label to str on both splits.
for _frame in (X_train, X_test):
    _frame.columns = _frame.columns.astype(str)

In [21]:
# Fit a random forest on all encoded features and report R^2 on the held-out split.
# The column labels were already converted to str in the previous cell, so the
# duplicate conversion is dropped. random_state makes the fit (and therefore the
# reported score and downstream predictions) reproducible across runs.
RF = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42).fit(X_train, y_train)
RF.score(X_test, y_test)

0.9987861567064764

In [22]:
# Select features whose random-forest importance clears SelectFromModel's
# default threshold. Seeding the forest keeps the selected feature set stable
# between runs.
sel = SelectFromModel(RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))
sel.fit(X_train, y_train)

In [23]:
# Names of the columns the selector kept.
support_mask = sel.get_support()
X_train.columns[support_mask]

Index(['X23', 'X101'], dtype='object')

In [24]:
# Restrict both splits to the two selected features.
X_train_selected = X_train.loc[:, ['X23', 'X101']]
X_test_selected = X_test.loc[:, ['X23', 'X101']]

# Plain linear regression on the selected features; R^2 on the held-out split.
LR = LinearRegression()
LR.fit(X_train_selected, y_train)
LR.score(X_test_selected, y_test)

0.9999999999138333

In [25]:
# Refit the two-feature linear model on every training row for the final submission.
X_encoded_selected = X_encoded.loc[:, ['X23', 'X101']]

LR_full = LinearRegression()
LR_full.fit(X_encoded_selected, y)
# Note: this score is computed on the same rows the model was fit on.
LR_full.score(X_encoded_selected, y)

0.999999999919146

# prediction

In [26]:
# Read the unlabeled test set and preview its first five rows.
df_test = pd.read_csv('test.csv')
df_test.head(5)

Unnamed: 0,Id,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X121,X122,X123,X124,X125,X126,X127,X128,X129,X130
0,6882,0.771529,9,3.866913,0.875137,1.178556,4,1.072548,3,0.4925,...,-0.498453,292,-1.728426,4,0.396087,5,0.32031,2,0.7,6
1,1643,0.019751,6,0.908369,0.998922,3.156672,5,0.04272,3,0.2239,...,0.873936,72,-1.415597,7,-0.204529,5,0.381549,6,0.65,3
2,4417,0.401443,8,0.080149,0.155562,0.801341,4,0.051964,4,0.1642,...,0.414044,188,0.332442,1,-0.549344,4,0.172268,3,0.62,3
3,6940,0.065122,2,4.565464,0.412285,0.359151,6,0.744402,5,0.2537,...,1.598134,294,-0.489875,4,-0.434907,3,0.6865,3,0.54,5
4,7370,0.189363,5,0.203273,0.710065,1.526602,4,0.506877,4,0.1343,...,1.45741,312,0.338231,6,-0.208131,5,0.553829,4,0.46,2


# 1 (random forest with all features)

In [95]:
# Build the submission feature matrix with the same preprocessing as training:
# drop the Id key, then one-hot encode X51 / X109 with the encoder already
# fitted on the training data.
X_subm = df_test.drop('Id', axis=1)
onehot_subm = pd.DataFrame(
    enc.transform(X_subm[['X51', 'X109']]).toarray(),
    # Keep the indicator rows aligned with X_subm when concatenating on axis=1.
    index=X_subm.index,
)
X_subm_encoded = pd.concat([X_subm.drop(['X51', 'X109'], axis=1), onehot_subm], axis=1)
# The training columns were converted to str before fitting RF; without the
# same conversion here the frame has mixed int/str labels, which scikit-learn
# rejects (or cannot match against the fitted feature names) at predict time.
X_subm_encoded.columns = X_subm_encoded.columns.astype(str)

In [96]:
# Predict spend for the submission rows with the random forest fitted on the training split.
test_prediction = RF.predict(X_subm_encoded)
test_prediction

array([ 501.4066   ,   67.162    ,  433.7482381, ...,  363.9725   ,
       1228.1916   ,  418.22719  ])

# 2 (linear regression with selected features)

In [27]:
# Predict spend from the two selected features using the linear model fitted on all training rows.
X_subm2 = df_test.loc[:, ['X23', 'X101']]
test_prediction2 = LR_full.predict(X_subm2)

# submission

In [29]:
# Load the submission template and fill in the linear-model predictions.
submission = pd.read_csv('submission.csv')
# The predictions are in test.csv row order and are assigned positionally, so
# fail loudly if the template's Id column is not in that same order instead of
# silently writing predictions against the wrong Ids.
assert np.array_equal(submission['Id'].to_numpy(), df_test['Id'].to_numpy()), \
    "submission.csv Ids are not in the same order as test.csv"
submission['Expected'] = test_prediction2
submission

Unnamed: 0,Id,Expected
0,6882,498.203893
1,1643,66.427025
2,4417,431.776660
3,6940,941.842735
4,7370,685.623496
...,...,...
3436,6861,241.984633
3437,6435,854.063902
3438,6746,367.721854
3439,4508,1219.413567


In [30]:
# Write the filled-in submission to disk without the DataFrame index column.
submission.to_csv('submission_LR_full.csv', index=False)