In [32]:
import lightgbm as lgb
import xgboost as xgb
import os
import gc
import json
import time
from datetime import datetime
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [33]:
INPUT_TRAIN = "F:/ML PROJECT/data/train_v2.csv"

TRAIN='train-processed.csv'
Y='y.csv'

In [34]:
def load_df(csv_path=INPUT_TRAIN, nrows=90000):
    print("Loading "+csv_path)
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'},
                     nrows=nrows)
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df


def process_dfs(train_df):
    print("Processing dfs...")
    print("Dropping repeated columns...")
    columns = [col for col in train_df.columns if train_df[col].nunique() > 1]
    
    train_df = train_df[columns]
    

    trn_len = train_df.shape[0]
    merged_df = pd.concat([train_df])

    merged_df['diff_visitId_time'] = merged_df['visitId'] - merged_df['visitStartTime']
    merged_df['diff_visitId_time'] = (merged_df['diff_visitId_time'] != 0).astype(int)
    del merged_df['visitId']

    print("Generating date columns...")
    format_str = '%Y%m%d' 
    merged_df['formated_date'] = merged_df['date'].apply(lambda x: datetime.strptime(str(x), format_str))
    merged_df['WoY'] = merged_df['formated_date'].apply(lambda x: x.isocalendar()[1])
    merged_df['month'] = merged_df['formated_date'].apply(lambda x:x.month)
    merged_df['quarter_month'] = merged_df['formated_date'].apply(lambda x:x.day//8)
    merged_df['weekday'] = merged_df['formated_date'].apply(lambda x:x.weekday())

    del merged_df['date']
    del merged_df['formated_date']

    merged_df['formated_visitStartTime'] = merged_df['visitStartTime'].apply(
        lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x)))
    merged_df['formated_visitStartTime'] = pd.to_datetime(merged_df['formated_visitStartTime'])
    merged_df['visit_hour'] = merged_df['formated_visitStartTime'].apply(lambda x: x.hour)

    del merged_df['visitStartTime']
    del merged_df['formated_visitStartTime']

    print("Encoding columns with pd.factorize()")
    for col in merged_df.columns:
        if col in ['fullVisitorId', 'month', 'quarter_month', 'weekday', 'visit_hour', 'WoY']: continue
        if merged_df[col].dtypes == object or merged_df[col].dtypes == bool:
            merged_df[col], indexer = pd.factorize(merged_df[col])

    print("Splitting back...")
    train_df = merged_df[:trn_len]
    return train_df

def preprocess():
    train_df = load_df()


    target = train_df['totals.transactionRevenue'].fillna(0).astype(float)
    target = target.apply(lambda x: np.log1p(x))
    del train_df['totals.transactionRevenue']

    train_df = process_dfs(train_df)
    train_df.to_csv(TRAIN, index=False)
    target.to_csv(Y, index=False)


In [35]:
%%time
preprocess()

Loading F:/ML PROJECT/data/train_v2.csv
Loaded train_v2.csv. Shape: (90000, 59)
Processing dfs...
Dropping repeated columns...
Generating date columns...
Encoding columns with pd.factorize()
Splitting back...
Wall time: 39.6 s


In [36]:
def rmse(y_true, y_pred):
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 5)

def load_preprocessed_dfs(drop_full_visitor_id=True):
    """
    Loads files `TRAIN`, `TEST` and `Y` generated by preprocess() into variables
    """
    X_train = pd.read_csv(TRAIN, converters={'fullVisitorId': str})
    y_train = pd.read_csv(Y, names=['LogRevenue']).T.squeeze()
    if drop_full_visitor_id: 
        X_train = X_train.drop(['fullVisitorId'], axis=1)
    return X_train, y_train

In [37]:
X, y = load_preprocessed_dfs()
X_train,X_test,y_train,y_test= train_test_split(X, y, test_size=0.20, random_state=0)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")
print(f"Test shape: {y_test.shape}")

Train shape: (72000, 37)
Test shape: (18000, 37)
Test shape: (18000,)


In [38]:
from sklearn.tree import DecisionTreeRegressor
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_2.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [39]:
y_preddt=regr_2.predict(X_test)
df=pd.DataFrame({'Actual':y_test, 'Predicted':y_preddt})  
df  
df.to_csv("F:/result.csv")

In [40]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_preddt))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_preddt))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_preddt)))  

Mean Absolute Error: 0.00892677310712549
Mean Squared Error: 0.014276394060909534
Root Mean Squared Error: 0.11948386527439399


In [41]:
regr_3 = DecisionTreeRegressor(max_depth=10)
regr_3.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [42]:
y_predr=regr_3.predict(X_test)
df=pd.DataFrame({'Actual':y_test, 'Predicted':y_predr})  
df  
df.to_csv("F:/result1.csv")

In [43]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predr))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predr))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_predr)))  

Mean Absolute Error: 0.010466840405602041
Mean Squared Error: 0.018642096831211513
Root Mean Squared Error: 0.1365360642145932


In [44]:
xgboost = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

In [45]:
xgboost.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.08, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.75)

In [46]:
from sklearn.metrics import explained_variance_score
predictions = xgboost.predict(X_test)

In [47]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, predictions))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))  

Mean Absolute Error: 0.008319183060689019
Mean Squared Error: 0.012108989174894898
Root Mean Squared Error: 0.11004085229992949


In [48]:
import lightgbm as lgb
d_train = lgb.Dataset(X_train, label=y_train)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression '
params['metric'] = 'mse'
params['sub_feature'] = 0.5
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 10
clf = lgb.train(params, d_train, 100)

In [49]:
y_predl=clf.predict(X_test)
for i in range(0,99):
    if y_predl[i]>=.5:       
       y_predl[i]=1
    else:  
       y_predl[i]=0

In [50]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predl))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predl))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_predl)))  

Mean Absolute Error: 0.2787510722253498
Mean Squared Error: 1.9285947919841557
Root Mean Squared Error: 1.3887385614233356


In [51]:
from sklearn.svm import SVR
clf = SVR(gamma='scale', C=1.0, epsilon=0.2)
clf.fit(X_train, y_train) 

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [52]:
predsvr=clf.predict(X_test)

In [53]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predsvr))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, predsvr))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predsvr)))  

Mean Absolute Error: 0.3729817732611874
Mean Squared Error: 3.120046939821897
Root Mean Squared Error: 1.766365460436174
