In [None]:
import pandas as pd
from pathlib import Path

In [None]:
from fastai.tabular.all import * 
from fastai.test_utils import show_install
from IPython.display import display, clear_output
import holidays
import seaborn as sns

# Report the installed environment (helper imported from fastai.test_utils) so
# results can be tied to specific library versions.
show_install()


In [None]:
# Select the compute device: CUDA when it is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device


In [None]:
def set_seed_value(seed=718):
    """Seed every random number generator the notebook can draw from.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), and configures cuDNN for deterministic behaviour so repeated
    runs produce the same random numbers.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the notebook never imports `random` explicitly.
    import random

    # Libraries that draw from the stdlib generator are otherwise left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all GPUs rather than only the current one; ignored when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Kernel auto-tuning can select different algorithms from run to run.
    torch.backends.cudnn.benchmark = False

set_seed_value()


In [None]:
# Folder holding the competition CSV files, relative to the notebook's working directory.
path = Path('../input/tabular-playground-series-may-2022/')
# NOTE(review): presumably read by fastcore to display paths relative to this folder -- confirm.
Path.BASE_PATH = path
# `ls` is not a stdlib Path method; presumably the fastcore patch brought in by the
# fastai star import -- confirm. Lists the files available in the data folder.
path.ls()


In [None]:
# Load the competition tables. `path` is already a Path, so files are joined with
# the `/` operator instead of os.path.join (`os` is never imported explicitly here).
# `id` becomes the index so it is not treated as a feature column.
train_df = pd.read_csv(path / 'train.csv').set_index('id')
test_df = pd.read_csv(path / 'test.csv').set_index('id')
# Left with its default index: the frame is written back out later with index=False,
# so every column read here has to stay a column.
sample_submission = pd.read_csv(path / 'sample_submission.csv')

# String-valued feature (accessed through `.str` further down).
string_var = 'f_27'
# Column the model is trained to predict.
dep_var = 'target'


In [None]:
# Total number of missing cells in train and test. `isnull` is a pandas alias of
# `isna`, so the last two values repeat the first two.
train_df.isna().sum().sum(), test_df.isna().sum().sum(), train_df.isnull().sum().sum(), test_df.isnull().sum().sum()


In [None]:
# Preview the first rows of the raw training table.
train_df.head()


In [None]:
# Distribution of the dependent variable, to check how balanced the target is.
train_df.hist(column=dep_var)


In [None]:
# Correlate the numeric columns only: at this point the frame still contains the
# string feature, which DataFrame.corr() refuses to process on pandas >= 2.0.
corr = train_df.select_dtypes(include='number').corr()

fig, axes = plt.subplots(figsize=(30, 15))
# Boolean mask over the upper triangle (diagonal included); it mirrors the lower
# triangle, so hiding it keeps the annotated heatmap readable.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Draw on the axes created above so the requested figure size is actually used.
sns.heatmap(corr, mask=mask, linewidths=.5, annot=True, cmap='rainbow', ax=axes)

plt.show()


In [None]:
# Column dtypes and non-null counts of the training table.
train_df.info()


In [None]:
# Every column whose dtype is not float64 is treated as a candidate categorical
# (the selection is purely dtype-based, so it is not limited to feature columns).
is_float64 = train_df.dtypes.values == np.dtype('float64')
cat_columns = train_df.columns[~is_float64]
cat_columns


In [None]:
# Cardinality of each candidate categorical column, computed in a single pass.
unique_counts = train_df[cat_columns].nunique()
for col, n_unique in unique_counts.items():
    print('column ', col, ' number of unique values ', n_unique)


In [None]:
# Shortest and longest string in the string feature, for train and then test.
train_lengths = train_df[string_var].str.len()
test_lengths = test_df[string_var].str.len()
train_lengths.min(), train_lengths.max(), test_lengths.min(), test_lengths.max()


In [None]:
def convert_feature_27(df, do_convert=True, column='f_27', n_chars=10):
    """Replace a fixed-width string feature with one column per character.

    The input frame is never modified; a new frame is returned. Calling the
    function on a frame that no longer has ``column`` (for example when the
    notebook cell is re-run) returns that frame unchanged instead of raising.

    Args:
        df: DataFrame that holds the string feature.
        do_convert: When True, add ``{column}_0`` .. ``{column}_{n_chars - 1}``
            containing the character found at each position. When False the
            string column is dropped without being expanded.
        column: Name of the string feature to replace.
        n_chars: Number of leading character positions to expand.

    Returns:
        A DataFrame without ``column`` and, when ``do_convert`` is True, with
        the per-character columns appended.
    """
    if column not in df.columns:
        # Already converted: nothing left to expand or drop.
        return df
    if do_convert:
        chars = df[column].str
        # assign() builds a new frame, so the caller's frame keeps its columns.
        df = df.assign(**{f'{column}_{i}': chars.get(i) for i in range(n_chars)})
    return df.drop(columns=[column])


In [None]:
# do_convert=False: the string feature is dropped without being expanded into
# per-character columns, so none of the later steps see it.
train_df = convert_feature_27(train_df, do_convert=False)
test_df = convert_feature_27(test_df, do_convert=False)


In [None]:
# Row/column counts after the string feature was removed.
train_df.shape, test_df.shape


In [None]:
# Preview the training table after the conversion step.
train_df.head()


In [None]:
# Let fastai sort the feature names into continuous and categorical lists; the
# dependent variable is excluded through dep_var.
# NOTE(review): max_card=25 is presumably the cardinality cut-off below which an
# integer column counts as categorical -- confirm against cont_cat_split.
cont_vars, cat_vars = cont_cat_split(train_df, dep_var=dep_var, max_card=25)
len(cont_vars), len(cat_vars),cont_vars,cat_vars


In [None]:
def getData(df, batchSize=1024):
    """Wrap ``df`` in fastai tabular DataLoaders for a classification target.

    Reads the notebook-level ``cat_vars``, ``cont_vars``, ``dep_var`` and
    ``device``. Rows are split at random, 20% of them going to validation.

    Args:
        df: Training DataFrame containing the feature columns and ``dep_var``.
        batchSize: Batch size handed to the DataLoaders.

    Returns:
        The DataLoaders built from the processed table.
    """
    split_idxs = RandomSplitter(valid_pct=0.2)(df)
    procs = [Normalize, Categorify, FillMissing]
    tabular = TabularPandas(
        df,
        procs,
        cat_names=cat_vars,
        cont_names=cont_vars,
        splits=split_idxs,
        device=device,
        y_block=CategoryBlock(),
        y_names=dep_var,
    )
    return tabular.dataloaders(bs=batchSize)


In [None]:
# Build the DataLoaders with a batch size of 2048 (overriding the helper's default of 1024)
# and show how many batches the training and validation loaders hold.
dls = getData(train_df, batchSize=2048)
len(dls.train), len(dls.valid)


In [None]:
# Sanity check: display a sample of rows as the DataLoaders serve them.
dls.show_batch()


In [None]:
# No y_range here: that option appends a sigmoid-range layer intended for regression
# targets. The DataLoaders use a CategoryBlock target, so it would squash the class
# logits into (0, 1) and cap how confident the softmax output can become.
my_config = tabular_config()
learn = tabular_learner(dls,
                       config = my_config,
                       metrics=[accuracy])

learn.summary()


In [None]:
# Run the learning-rate finder to help choose the rate used for training.
learn.lr_find()


In [None]:
# Train for 50 epochs with a maximum learning rate of 3e-3. SaveModelCallback writes the
# checkpoint 'kaggle_tps2022_may', optimizer state included (with_opt=True).
# NOTE(review): with its defaults the callback presumably keeps the epoch with the
# best validation loss -- confirm.
learn.fit_one_cycle(50, 3e-3, cbs=SaveModelCallback(fname='kaggle_tps2022_may', with_opt=True))


In [None]:
# Show a sample of rows alongside the model's predictions.
learn.show_results()


In [None]:
# Confusion matrix for the trained learner, normalized and printed with 3 decimals.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix(normalize=True, norm_dec=3)


In [None]:
# Reload the checkpoint written under the same name by SaveModelCallback during training.
learn.load('kaggle_tps2022_may')


In [None]:
# Build a DataLoader over the test table from the training DataLoaders
# (presumably so the same preprocessing is applied -- confirm), then predict.
dlt = learn.dls.test_dl(test_df, bs=1024) 
# with_decoded=True adds a third return value. NOTE(review): nn_preds presumably holds
# per-class probabilities and preds the decoded class index -- confirm.
nn_preds,_ ,preds = learn.get_preds(dl=dlt , with_decoded=True) 

nn_preds, preds


In [None]:
# Hard label per row: the index of the highest-scoring class.
# NOTE(review): assumes the rows of test_df are in the same order as sample_submission -- confirm.
# NOTE(review): if the competition is scored on ROC AUC, submitting the positive-class
# score (nn_preds[:, 1]) instead of hard labels would likely rank better -- confirm the metric.
sample_submission[dep_var] = np.argmax(nn_preds, axis=-1)
# index=False keeps the pandas row index out of the file.
sample_submission.to_csv("submission.csv", index=False)
sample_submission.head(10)


In [None]:
# Distribution of the predicted target, for comparison with the training distribution plotted earlier.
sample_submission.hist(column=dep_var)


In [None]:
# List the working directory to check that the output files were written.
!ls -al
