In [25]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [26]:
def merge_by(ind, train, test, new):
    """Left-join ``new`` onto both ``train`` and ``test``.

    Parameters
    ----------
    ind : label or list of labels
        Column name(s) passed as ``on=`` to ``pd.merge``; must be present in
        ``train``, ``test`` and ``new``.
    train, test : DataFrame
        Left-hand frames.  Every row of each is kept (``how='left'``).
    new : DataFrame
        Right-hand frame whose columns are appended to both inputs.

    Returns
    -------
    tuple
        ``(train_merged, test_merged)`` in that order.
    """
    # Same join applied to each frame in turn; the inputs are not modified.
    return tuple(
        pd.merge(frame, new, how='left', on=ind)
        for frame in (train, test)
    )

### Loading up base templates for train and test

In [27]:
# Base frames that every later feature table is left-joined onto via `msno`.
# The test frame is read from the sample-submission file.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("sample_submission_zero.csv")

### Members => train, test

In [28]:
# Member table; the next cell derives date-part columns from it and joins it
# onto train/test on `msno`.
df_members = pd.read_csv("members.csv")

In [29]:
# Derive year / month / day-of-month columns from each date column by slicing
# its decimal string form: first four characters, characters 4-5, last two.
# (Presumably the values are YYYYMMDD integers -- confirm against the data.)
for source_col, prefix in (('registration_init_time', 'registration_init'),
                           ('expiration_date', 'expiration_date')):
    digits = df_members[source_col].astype(str)
    df_members[prefix + '_year'] = digits.str[:4].astype('int64')
    df_members[prefix + '_month'] = digits.str[4:6].astype('int64')
    df_members[prefix + '_date'] = digits.str[-2:].astype('int64')

# Attach all member columns (raw + derived) to train and test.
df_train, df_test = merge_by("msno", df_train, df_test, df_members)

# Encode gender numerically; any value that is not a key of the mapping
# (including missing values) becomes NaN under Series.map.
gender = {'male':1, 'female':2}
for frame in (df_train, df_test):
    frame['gender'] = frame['gender'].map(gender)

### Transactions => train, test

In [30]:
df_transactions = pd.read_csv("transactions.csv")

# Count of transactions for a certain user
# value_counts() yields one row per distinct msno; the two columns are renamed
# explicitly so the result does not depend on pandas' default column labels.
trans_count = df_transactions['msno'].value_counts().reset_index()
trans_count.columns = ['msno','trans_count']

df_train, df_test = merge_by("msno", df_train, df_test, trans_count)

# Info from latest transaction for user
# Sort newest-first, then keep the first row per msno.  The de-duplication
# must run on the *sorted* frame; running it on df_transactions would keep
# whichever row happens to come first in the file instead of the latest one.
# A stable sort keeps file order among rows that share a transaction_date,
# so the row chosen for each user is deterministic.
trans_latest = df_transactions.sort_values(
    by=['transaction_date'], ascending=[False], kind='mergesort'
).reset_index(drop=True)
trans_latest = trans_latest.drop_duplicates(subset=['msno'], keep='first')

df_train, df_test = merge_by("msno", df_train, df_test, trans_latest)

In [16]:
# With `chunksize` set, read_csv returns an iterator that yields DataFrames of
# up to 2,000,000 rows each instead of loading the whole file at once.
df_userlogs_chunks = pd.read_csv("user_logs.csv", chunksize = 2000000)

In [None]:
# Stitch the chunks from the chunked reader into a single frame with a fresh
# 0..n-1 index.  The input must be `df_userlogs_chunks` (the reader);
# `df_userlogs` itself does not exist until this line has run.
# NOTE: this materialises the entire file in memory and exhausts the reader,
# so the reader has to be re-created before this line can be run again.
df_userlogs = pd.concat(df_userlogs_chunks, ignore_index=True)

In [31]:
# Replace every missing value (e.g. rows with no match in the left joins
# above) with the sentinel -999 in both frames.
df_train = df_train.fillna(-999)
df_test = df_test.fillna(-999)

# Plain-text preview of the first rows of the assembled training frame.
print(df_train.head())

                                           msno  is_churn  city    bd  gender  \
0  waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=         1  18.0  36.0     2.0   
1  QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=         1  10.0  38.0     1.0   
2  fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=         1  11.0  27.0     2.0   
3  mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=         1  13.0  23.0     2.0   
4  XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=         1   3.0  27.0     1.0   

   registered_via  registration_init_time  expiration_date  \
0             9.0              20050406.0       20170907.0   
1             9.0              20050407.0       20170321.0   
2             9.0              20051016.0       20170203.0   
3             9.0              20051102.0       20170926.0   
4             9.0              20051228.0       20170927.0   

   registration_init_year  registration_init_month    ...      \
0                  2005.0                      4.0    ...       
1         

In [32]:
# Persist the assembled feature frames; index=False omits the row index
# column from the written files.
df_train.to_csv("_train.csv", index = False)
df_test.to_csv("_test.csv", index = False)