<a href="https://github.com/AutoViML/lazytransform"><img src="https://i.ibb.co/KNXJk3g/lazy-logo5.png" alt="lazy-logo5" border="0"></a>

### lazytransform is a new Python library that automatically transforms your entire dataset into numeric format using category encoders, NLP text vectorizers and pandas date-time processing functions, all in a single line of code!

In [2]:
# Common imports used by the data-related cells of this notebook.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)

In [3]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [4]:
!pip install lazytransform --ignore-installed --no-deps

In [7]:
!pip install category-encoders --ignore-installed --no-deps

In [8]:
from lazytransform import LazyTransformer

In [9]:
!pip install featurewiz --ignore-installed --no-deps

In [11]:
### you need xlrd since Kaggle for some reason doesn't have it
!pip install xlrd

In [12]:
#####   Use this for debugging/development version ############
import featurewiz as FW

In [13]:
from lazytransform import LazyTransformer

In [14]:
# Build the path to the training CSV and load it into a DataFrame.
datapath = '/kaggle/input/tabular-playground-series-jan-2022/'
filename = 'train.csv'
trainfile = ''.join((datapath, filename))
df = pd.read_csv(trainfile)

# Show the frame's dimensions and preview its first rows.
print(df.shape)
df.head()

In [16]:
# Replace the 'date' column of the training frame with its datetime-parsed form.
df = df.assign(date=pd.to_datetime(df['date']))

In [17]:
# Run configuration for the cells below.
modeltype = 'Regression'  # selects the regression vs. classification branches
target = ['num_sold']     # column(s) to predict; kept as a list

# NOTE(review): the LazyTransformer(...) calls visible in this notebook pass
# literal encoders/scalers values instead of these two variables - confirm
# whether they are still needed.
encoders = 'auto'
scalers = ''

In [18]:
# Load the competition's test CSV from the same dataset directory as train.
filename = 'test.csv'
testfile = datapath+filename
test = pd.read_csv(testfile)

# The training frame's 'date' column is converted with pd.to_datetime before
# modelling; apply the same conversion here so the fitted pipeline receives
# the same dtype for this column at prediction time.
test['date'] = pd.to_datetime(test['date'])

# Show the frame's dimensions and preview its first rows.
print(test.shape)
test.head()

# LightGBM is better than XGBoost for multi-class problems, but XGBoost is better for binary classification. Both perform similarly on regression problems.

In [19]:
import lightgbm as lgbm
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier

# Pick the LightGBM estimator matching the problem type, together with the
# multi-output wrapper that belongs to it.
if modeltype == 'Regression':
    lgb, multi_wrapper = lgbm.LGBMRegressor(device="cpu"), MultiOutputRegressor
else:
    lgb, multi_wrapper = lgbm.LGBMClassifier(device="cpu"), MultiOutputClassifier

# Wrap the estimator only when `target` is a list naming more than one column.
if isinstance(target, list) and len(target) > 1:
    lgb = multi_wrapper(lgb)

In [20]:
import featurewiz as FW

In [22]:
# Every column of the training frame that is not a target is a predictor.
preds = [col for col in df.columns if col not in target]
len(preds)

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, using a fixed random_state.
X_train, X_valid, y_train, y_valid = train_test_split(
    df[preds],
    df[target],
    test_size=0.2,
    random_state=99,
)
print(X_train.shape, X_valid.shape)

In [26]:
# Display the list of predictor column names.
preds

In [27]:
# Restrict the test frame to the predictor columns.
X_test = test.loc[:, preds]
print(X_test.shape)

# Baseline model with all features - let's see performance

In [28]:
# Report how many predictor columns the training split has.
print('all features = %d' % len(X_train.columns))

In [29]:
# LazyTransformer configured with the LightGBM estimator as its model.
sim = LazyTransformer(
    model=lgb,
    encoders='auto',
    scalers='max',
    imbalanced=False,
)
sim

In [30]:
### If using a model in pipeline, use fit and predict only ###
# Fit on the training split (all predictor columns).
sim.fit(X_train, y_train)

In [31]:
### If using a model in pipeline, use fit and predict only ###
# Predict on the held-out validation split.
predictions = sim.predict(X_valid)

In [32]:
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that `sklearn.metrics` is available as an attribute.
import sklearn.metrics

# Score the validation predictions: R-squared and RMSE for regression,
# a classification report otherwise.
if modeltype == 'Regression':
    print('R-squared = %0.0f%%' %(100*sklearn.metrics.r2_score(y_valid, predictions)))
    print('RMSE = %0.2f' %np.sqrt(sklearn.metrics.mean_squared_error(y_valid, predictions)))
else:
    print(sklearn.metrics.classification_report(y_valid, predictions))

# Let's compare it with a model that uses features selected by featurewiz

In [33]:
# Fresh LazyTransformer; the cells that follow call its fit_transform/transform.
sim = LazyTransformer(
    model=lgb,
    encoders='auto',
    scalers='max',
    imbalanced=False,
)
sim

In [34]:
# Fit on the training split and keep the transformed features and target
# that fit_transform returns.
X_train_trans, y_train_trans = sim.fit_transform(X_train, y_train)
X_train_trans.shape

In [35]:
# NOTE: despite its name, X_test_trans holds the transformed *validation*
# split (X_valid), not the competition test set.
X_test_trans = sim.transform(X_valid)
X_test_trans.shape

# If you have NLP variables, featurewiz will automatically drop them, so use lazytransform to create word vectors from them. Hence it is better to run featurewiz first and then lazytransform, even on NLP datasets.

In [36]:
# Configure FeatureWiz, fit it on the transformed training data, then apply
# the same selection to the transformed validation data.
features = FW.FeatureWiz(
    corr_limit=0.70,
    feature_engg='',
    category_encoders='',
    dask_xgboost_flag=False,
    nrows=None,
    verbose=2,
)
X_train_selected = features.fit_transform(X_train_trans, y_train_trans)
X_test_selected = features.transform(X_test_trans)

In [37]:
# `features.features` provides the list of selected features.
select = features.features
# Print the count and then the list itself, each on its own line.
print('Select features = %d' % len(select), select, sep='\n')

In [38]:
# Fresh LazyTransformer to be fitted on the selected features.
sim = LazyTransformer(
    model=lgb,
    encoders='auto',
    scalers='max',
    imbalanced=False,
)
sim

In [39]:
### If using a model in pipeline, use fit and predict only ###
# NOTE(review): the features are the FeatureWiz-selected training columns, but
# the target passed here is the original y_train rather than the y_train_trans
# returned by fit_transform - confirm this is intended.
sim.fit(X_train_selected, y_train)

In [40]:
### If using a model in pipeline, use fit and predict only ###
# X_test_selected is derived from X_valid, so these predictions correspond to
# the validation rows and are compared against y_valid.
predictions = sim.predict(X_test_selected)
predictions

In [42]:
# Display the first three rows of the validation target.
y_valid[:3]

In [43]:
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that `sklearn.metrics` is available as an attribute.
import sklearn.metrics

# Score the validation predictions: R-squared and RMSE for regression,
# a classification report otherwise.
if modeltype == 'Regression':
    print('R-squared = %0.0f%%' %(100*sklearn.metrics.r2_score(y_valid, predictions)))
    print('RMSE = %0.2f' %np.sqrt(sklearn.metrics.mean_squared_error(y_valid, predictions)))
else:
    # Predictions are passed through this mapping before being compared with
    # y_valid; as written it maps 0 and 1 to themselves.
    #dicto = {0:'f',1:'t'}
    dicto = {0:0,1:1}
    print(sklearn.metrics.classification_report(y_valid, pd.Series(predictions).map(dicto)))

# So it is actually better to select features than to use all features. The performance is about the same but features are fewer.

In [44]:
# Fresh LazyTransformer to be fitted on the full training frame.
sim = LazyTransformer(
    model=lgb,
    encoders='auto',
    scalers='max',
    imbalanced=False,
)
sim

In [46]:
### If using a model in pipeline, use fit and predict only ###
# Fit on the whole training frame (all predictor columns, no hold-out).
sim.fit(df[preds], df[target])

In [47]:
### If using a model in pipeline, use fit and predict only ###
# Predict on the predictor columns of the competition test frame.
predictions = sim.predict(test[preds])
predictions.shape

In [48]:
# Read the sample submission from the same dataset directory as the train and
# test files, reusing `datapath` instead of repeating the hard-coded path.
subm = pd.read_csv(datapath + 'sample_submission.csv')
print(subm.shape)
subm.head()

In [52]:
# Store the predictions in the submission column named after the first target.
subm[target[0]] = predictions
subm.head()

In [51]:
# Write the submission file without the DataFrame index.
subm.to_csv('submission.csv', index=False)