# Load Dataset

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Read the tab-separated train and test files from the mounted Drive folder.
train_path = '/content/drive/My Drive/retail_data/train.tsv'
test_path = '/content/drive/My Drive/retail_data/test.tsv'
df = pd.read_csv(train_path, sep='\t')
test = pd.read_csv(test_path, sep='\t')

# Combine datasets & preprocessing

In [0]:
df.shape, test.shape

In [0]:
df.columns, test.columns

In [0]:
# Tag every row with its origin (1 = train, 0 = test) so the two sets can be
# separated again after they are concatenated and encoded together.
for frame, flag in ((df, 1), (test, 0)):
    frame['is_train'] = flag

In [0]:
# Give both frames the same identifier column name ('id') so their columns
# line up when the frames are stacked.
df = df.rename(columns={'train_id': 'id'})
test = test.rename(columns={'test_id': 'id'})

In [0]:
# Remove the target column from the training frame so it has the same columns
# as `test` before the two are concatenated.
# errors='ignore' keeps the cell re-runnable: executing it a second time no
# longer raises KeyError once 'price' has already been dropped.
df = df.drop(columns=['price'], errors='ignore')

In [0]:
df.shape, test.shape

In [0]:
df.columns, test.columns

In [0]:
# Stack train on top of test so the categorical encoding below is derived from
# one shared set of values.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the train and test index labels overlap. Train rows come first, so they keep
# labels 0..len(df)-1.
df_test_combine = pd.concat([df, test], axis=0, ignore_index=True)
df_test_combine.shape

(2175894, 8)

In [0]:
df_test_combine.tail()

In [0]:
df_test_combine.dtypes

In [0]:
# Columns that will be converted to categorical dtype and then integer-coded.
# NOTE(review): 'name' and 'item_description' look like free-text fields;
# confirm that treating them as categories is intended.
cat_vars = [
    'category_name',
    'brand_name',
    'shipping',
    'name',
    'item_description',
    'item_condition_id',
]

In [0]:
# Cast each listed column to an ordered categorical dtype. The categories are
# inferred from the values present in the combined train+test frame.
for col in cat_vars:
    as_category = df_test_combine[col].astype('category')
    df_test_combine[col] = as_category.cat.as_ordered()

In [0]:
df_test_combine.dtypes

In [0]:
df_test_combine.tail()

In [0]:
# Replace each categorical column with its integer category codes
# (pandas encodes missing values as -1).
# The dtype guard makes the cell safe to re-run: once a column has been
# converted to codes it is no longer categorical, and calling `.cat` on it
# again would raise AttributeError.
for col in cat_vars:
    column = df_test_combine[col]
    if column.dtype.name == 'category':
        df_test_combine[col] = column.cat.codes

In [0]:
df_test_combine.shape

In [0]:
df_test_combine.head()

# Split the processed dataset back into train and test

In [0]:
# Split the encoded frame back into its train and test parts using the
# is_train flag added before the concat.
# .copy() makes each part an independent frame: df_train gets a 'price' column
# assigned later, and writing to a slice of df_test_combine would trigger
# SettingWithCopyWarning.
df_train = df_test_combine[df_test_combine['is_train'] == 1].copy()
df_test = df_test_combine[df_test_combine['is_train'] == 0].copy()

In [0]:
df_train.shape, df_test.shape

((1482535, 8), (693359, 8))

In [0]:
df.columns

In [0]:
# Re-read the training file to recover the target, which was dropped from `df`
# before the train/test concat. Only the 'price' column is used afterwards, so
# usecols avoids parsing every other column of the file a second time.
df1 = pd.read_csv(
    '/content/drive/My Drive/retail_data/train.tsv',
    delimiter='\t',
    usecols=['price'],
)

In [0]:
# Attach the target to the training rows. The assignment aligns on index
# labels, so any row of df_train whose label is missing from df1 would silently
# receive NaN. Fail loudly instead if the two indexes ever stop matching.
assert df_train.index.equals(df1.index), 'df_train index does not match train.tsv rows'
df_train['price'] = df1['price']

In [0]:
df_train.columns

In [0]:
# Log-transform the target (the dependent variable), since the outcome is
# scored with RMSLE.
# Only strictly positive prices are transformed; any other value is kept
# unchanged. A price of 0 therefore stays 0, which is also log(1), so after
# the transform it cannot be told apart from a price of 1.
# Alternative: df_train['price'] = np.log(df_train['price'] + 1)
raw_price = df_train['price']
is_positive = raw_price > 0
log_price = np.log(raw_price.where(is_positive))
df_train['price'] = raw_price.where(~is_positive, log_price)

In [0]:
# Target: the log-transformed price. Features: every remaining column.
# NOTE(review): only 'price' is removed, so the 'id' and 'is_train' columns
# are also passed to the model as features - confirm this is intended.
y_train = df_train['price']
x_train = df_train.drop(columns=['price'])

In [0]:
x_train.shape, y_train.shape

((1482535, 8), (1482535,))

# Start training

In [0]:
#Import Lib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Optional: XGBoost import (not used in the cells below)

In [0]:
from xgboost import XGBRegressor

In [0]:
m = RandomForestRegressor(n_jobs=-1, n_estimators=30)
%time m.fit(x_train, y_train)

In [0]:
# NOTE(review): this repeats the fit from the previous cell on the same data;
# one of the two calls looks redundant.
%time m.fit(x_train, y_train)

In [0]:
# Score on the same rows the model was fitted on, so this is an in-sample
# figure (for a regressor, scikit-learn's score() returns R^2), not an
# estimate of performance on unseen data.
%time m.score(x_train,y_train)

In [0]:
# Hyper-parameter candidates for the grid search: every combination of the
# values below is tried before settling on a configuration.
grid = dict(
    min_samples_leaf=[15, 25, 50, 100],
    max_features=['sqrt', 'log2', 0.5],
)

In [0]:
# The search needs its own base estimator: `rf` is only created further down
# the notebook, so referencing it here raises NameError on a fresh
# Restart & Run All. GridSearchCV clones the estimator for every candidate,
# so this instance only supplies the fixed settings.
base_rf = RandomForestRegressor(n_jobs=-1, n_estimators=30, random_state=42)
gd = GridSearchCV(base_rf, grid, cv=3, verbose=50)

In [0]:
# Run the search: each parameter combination in `grid` is fitted once per
# cross-validation fold on the training data. With the full training set this
# is many forest fits, so expect a long run.
gd.fit(x_train, y_train)

In [0]:
gd.best_estimator_

# Random Forest regressor

In [0]:
# Final random forest configuration: 50 trees on all CPU cores, half of the
# features considered at each split, at least 3 samples per leaf, fixed seed.
rf = RandomForestRegressor(
    n_estimators=50,
    max_features=0.5,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1,
)

In [0]:
# Fit the forest defined in the previous cell on the full training set;
# %time reports how long the fit takes.
%time rf.fit(x_train, y_train)

CPU times: user 7min 11s, sys: 1.61 s, total: 7min 13s
Wall time: 3min 39s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=0.5, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                      oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [0]:
# In-sample score: the model is evaluated on the rows it was trained on
# (for a regressor, scikit-learn's score() returns R^2), so this does not
# measure performance on unseen data.
rf.score(x_train, y_train)

0.7801122908097053

In [0]:
df_test.shape

In [0]:
# Predict log-prices for the test rows. Columns are selected through
# x_train.columns so the model receives exactly the features it was fitted on,
# in the same order, even if df_test later gains or reorders columns.
preds = rf.predict(df_test[x_train.columns])

In [0]:
preds

array([2.63327711, 2.24173695, 3.20770073, ..., 2.10997499, 2.76160145,
       2.28134616])

In [0]:
# Undo the natural-log transform applied to the training target so the
# predictions are back on the price scale.
# Caution: `preds` is overwritten, so running this cell twice exponentiates
# the values twice.
price_scale = np.exp(preds)
preds = pd.Series(price_scale)


In [0]:
preds.head()

# Submission

In [0]:
test.id

In [0]:
# Build the submission table with explicit column names. Concatenating the
# unnamed prediction Series produced a column literally called 0, and the id
# column carried the renamed header 'id'.
# 'test_id' is the test file's original identifier header.
# NOTE(review): confirm both headers against the competition's sample
# submission file.
# Values are paired by position; both inputs hold one entry per test row.
submit = pd.DataFrame({
    'test_id': test['id'].values,
    'price': np.asarray(preds),
})

In [0]:
# Write the submission to 'submission_rf.csv' in the current working
# directory; index=False leaves the DataFrame index out of the file.
submit.to_csv('submission_rf.csv', index= False)