In [None]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import warnings
import time
from datetime import datetime, timedelta
from scipy.stats import mode
warnings.filterwarnings('ignore')

## Logs

In [None]:
# Member list the log features are built for. The variable is called `train`
# but it is loaded from the test split; the log cell below joins on its
# `msno` column and drops its `is_churn` column.
train = pd.read_csv(filepath_or_buffer='data/test.csv')

In [None]:
# Aggregate the raw listening logs per member over three look-back windows.
# Every window ends on `end` (inclusive) and starts on the matching entry of
# `ref` (inclusive). The log files are streamed in chunks; each chunk yields a
# partial per-member aggregate, and the partials are combined in a later cell.
end = "20170401"
ref = ['20170301','20170101','20161001']
log_paths = ['data/user_logs.csv', 'data/user_logs_v2.csv']

# The bounds are constants, so parse them once instead of once per chunk and
# per window.
end_date = pd.to_datetime(end, format='%Y%m%d')
ref_dates = [pd.to_datetime(start, format='%Y%m%d') for start in ref]

# One list of per-chunk partial aggregates per window. DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated frame on every call, so
# the partials are collected in lists and concatenated once at the end.
partials = [[] for _ in ref_dates]

for path in log_paths:
    logs_iter = pd.read_csv(path, low_memory=False, iterator=True, chunksize=1000000, parse_dates=["date"], infer_datetime_format = True)
    for logs in tqdm(logs_iter):
        # Right join keeps only rows whose msno is in `train`; members with no
        # log in this chunk get a NaT date and are removed by the window filter.
        sel_logs = logs.merge(train, on='msno', how='right').drop(["is_churn"],axis=1)
        for i, start_date in enumerate(ref_dates):
            in_window = sel_logs["date"].between(start_date, end_date)
            # `date` is only needed for filtering. Summing datetimes is
            # undefined (older pandas silently dropped the column, pandas 2
            # raises), so it is removed before aggregating.
            new_logs = sel_logs.loc[in_window].drop(columns=["date"])
            new_logs = new_logs.groupby(by=["msno"]).agg(["sum","count"]).reset_index()
            partials[i].append(new_logs)

# outputs[i] stacks the per-chunk aggregates for window i; a member can appear
# once per chunk, so these rows still need to be re-aggregated per msno.
outputs = [pd.concat(chunks) if chunks else pd.DataFrame() for chunks in partials]

In [None]:
labels=["1mo","3mo","6mo"]


def _window_features(partial, label):
    """Collapse stacked per-chunk aggregates into one feature row per member.

    Parameters
    ----------
    partial : DataFrame
        Stacked chunk aggregates with two-level columns: ("msno", "") plus a
        (column, "sum") and (column, "count") pair for every aggregated column.
    label : str
        Window suffix appended to every generated feature name.

    Returns
    -------
    DataFrame
        Indexed by ``msno`` with ``<column>_avg_<label>`` (total sum divided by
        total count) and ``<column>_count_<label>`` (total count) columns.
    """
    if partial.empty:
        return pd.DataFrame(index=pd.Index([], name="msno"))

    # Group on the raw key values so the result does not depend on how pandas
    # resolves the string "msno" against two-level column labels.
    keys = partial[("msno", "")].to_numpy()
    totals = partial.drop(columns="msno", level=0).groupby(keys).sum()
    totals.index.name = "msno"

    features = {}
    for col in totals.columns.get_level_values(0).unique():
        total_sum = totals[(col, "sum")]
        total_count = totals[(col, "count")]
        # Average over all log rows in the window, not an average of chunk means.
        features[col + "_avg_" + label] = total_sum / total_count
        features[col + "_count_" + label] = total_count
    # Only the derived features are returned; the raw sum/count columns are
    # intentionally left out.
    return pd.DataFrame(features, index=totals.index)


# Each window gets its own suffix: outputs[0] -> "1mo", outputs[1] -> "3mo", ...
for k in range(3):
    outputs[k] = _window_features(outputs[k], labels[k])

In [None]:
# Join the three per-window feature tables into one row per member.
tr_1mo = outputs[0]
tr_3mo = outputs[1]
tr_6mo = outputs[2]

union = tr_1mo.merge(tr_3mo,how="outer",on="msno")
union = union.merge(tr_6mo,how="outer",on="msno")

# fillna returns a new frame; the result has to be assigned to take effect.
# Members missing from a window get 0 for that window's features.
union = union.fillna(0)

# When the join key lives in the index, writing with index=False would drop
# it from the file, so move it into a regular column first.
if "msno" in union.index.names and "msno" not in union.columns:
    union = union.reset_index()
union.to_csv("data/tst_logs.csv",index=False)

## Transactions

In [None]:
# Only the member ids are needed to select the relevant transactions.
df_train = pd.read_csv(filepath_or_buffer='data/test.csv', usecols=['msno'])

In [None]:
# Stream both transaction files in chunks and keep only rows that belong to
# the members in `df_train`. The per-chunk results are collected in a list and
# concatenated once: calling pd.concat inside the loop re-copies everything
# accumulated so far on every chunk.
transaction_paths = ['data/transactions.csv', 'data/transactions_v2.csv']

selected_chunks = []
for path in transaction_paths:
    df_iter = pd.read_csv(path, low_memory=False, iterator=True, chunksize=10000000)
    for df in df_iter:
        # Right join: members without a transaction in this chunk appear as a
        # single all-NaN row.
        new_df = pd.merge(df, df_train, on='msno', how='right')
        if not new_df.empty:
            selected_chunks.append(new_df)

if selected_chunks:
    df_transactions = pd.concat(selected_chunks, ignore_index=True)
else:
    df_transactions = pd.DataFrame()

In [None]:
# Order transactions by date, then keep those dated on or before the
# 20170131 cutoff. Rows with a missing date fail the comparison and are
# dropped here.
df_transactions = df_transactions.sort_values(by='transaction_date')
new_df_transactions_v1 = df_transactions.loc[df_transactions['transaction_date'] <= 20170131]

def last(ser):
    """Return the final element of ``ser`` in its current order.

    Parameters
    ----------
    ser : pandas.Series
        Values of one column for one group.

    Returns
    -------
    The last element by position, or NaN when ``ser`` is empty. The empty
    branch previously referenced an undefined name and would have raised
    NameError.
    """
    if len(ser) > 0:
        return ser.iloc[-1]
    return np.nan

# Collapse each member's pre-cutoff transactions into one row of summary
# statistics. The aggregation callables are kept as they are because pandas
# derives the second level of the output column labels from their names.
# scipy >= 1.11 returns the mode as a scalar (keepdims=False), while older
# releases returned a length-1 array; np.atleast_1d makes the lookup work on
# both.
new_df_transactions_v1 = new_df_transactions_v1.groupby('msno').agg({'payment_method_id':[lambda l : np.atleast_1d(mode(l)[0])[0], last],
       'payment_plan_days':[np.mean, last], 'plan_list_price':[np.mean, last], 'actual_amount_paid':[np.mean, last],
       'is_auto_renew':[np.median, last],'is_cancel': [sum, last], 'transaction_date':[last], 'membership_expire_date':[last]})

In [None]:
# Persist the per-member transaction summary, with msno as the index column.
# NOTE(review): this writes under "input/" while the other cells write under
# "data/", and the frame still has two-level column labels here - confirm that
# downstream readers expect this location and header layout.
new_df_transactions_v1.to_csv(path_or_buf="input/transactions_test.csv")

## Members

In [None]:
# Attach the member profile to every member id; the left join keeps members
# that are absent from members_v3.csv (their profile columns are NaN).
members = pd.read_csv('data/members_v3.csv')
member_ids = pd.read_csv('data/test.csv', usecols=['msno'])
train = member_ids.merge(members, how='left', on='msno')


def purifieGender(df):
    """Encode the ``gender`` column of ``df`` numerically, in place.

    'male' becomes 1, 'female' becomes 2, and every other value (including
    missing ones) becomes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``gender`` column; it is modified in place.

    Returns
    -------
    None
    """
    gender = {'male':1, 'female':2}
    # Assign the filled result back to the column. Filling through the
    # attribute accessor with inplace=True is a chained operation that is not
    # guaranteed to write through to `df` (it does not under Copy-on-Write).
    df['gender'] = df['gender'].map(gender).fillna(0)

purifieGender(train)

# Membership age: time elapsed between registration and the fixed reference
# date 2017-02-28. Registration dates that do not parse as YYYYMMDD become NaT.
reference_day = datetime(2017, 2, 28)
registered_on = pd.to_datetime(train['registration_init_time'], format='%Y%m%d', errors='coerce')
train["TimeSinceReg"] = reference_day - registered_on

# The raw registration date is no longer needed; rows with any missing value
# (including an unparseable registration date) are discarded.
train = train.drop(columns=['registration_init_time']).dropna()

# Duration columns that still have to be converted to whole days.
duree = ["TimeSinceReg"]

def splitDate(t):
    """Extract the leading whole-day count from a duration value.

    Parameters
    ----------
    t : object
        A value whose string form starts with the number of days followed by
        a space (e.g. "1234 days 00:00:00"), or a float.

    Returns
    -------
    int or float
        The leading integer of ``str(t)``; floats (such as NaN placeholders)
        are returned unchanged.

    Raises
    ------
    ValueError
        If the first space-separated token of ``str(t)`` is not an integer.
    """
    # Test the value itself: the previous check was applied to str(t), which
    # is never a float, so the pass-through branch could not be reached.
    if isinstance(t, float):
        return t
    return int(str(t).split(" ")[0])
        
# Replace every duration column by its whole-day count.
for column_name in duree:
    day_counts = train[column_name].apply(splitDate)
    train[column_name] = day_counts

# The index is written as well, so the file gets an extra unnamed first column.
train.to_csv(path_or_buf='data/member_test.csv')

## Merge

In [None]:
# Combine the three feature tables (members, transactions, logs) into the
# final test set, one row per msno. The tr_* names are historical: they hold
# the member, transaction and log tables respectively.
# NOTE(review): these files are read from "data/wasted/", which is not where
# the cells above write them, and the transaction file is expected to have
# flat "<column>_last" headers - confirm the files were moved/flattened first.
tr_1mo = pd.read_csv("data/wasted/member_test.csv")
tr_3mo = pd.read_csv("data/wasted/transactions_test.csv",parse_dates=["membership_expire_date_last", "transaction_date_last"], infer_datetime_format = True)
tr_6mo = pd.read_csv("data/wasted/tst_logs.csv")

union = tr_1mo.merge(tr_3mo,how="outer",on="msno")
union = union.merge(tr_6mo,how="outer",on="msno")

# Remove the index columns that earlier to_csv calls wrote ("Unnamed: 0", ...).
# Pass the column labels explicitly rather than a DataFrame.
union = union.drop(columns=union.filter(like="Unnamed").columns)
# fillna returns a new frame; the result has to be assigned to take effect.
# NOTE(review): this also fills missing values in the parsed date columns
# with 0 - confirm that is acceptable for the downstream model.
union = union.fillna(0)
union.to_csv("data/testing_set.csv")