In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor

from util_data import DataSet
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

In [None]:
# Columns removed from both the train and test frames before the data is saved.
deleted_features = [
    "membership_expire_date_last",
    "transaction_date_last",
    "payment_plan_days_mean",
    "payment_plan_days_last",
    "num_100_avg_1mo",
    "num_100_avg_3mo",
    "num_100_avg_6mo",
]

In [None]:
data = DataSet()

# Stack the training and validation sets into a single training frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with default arguments is the direct equivalent (row labels are kept as-is).
# NOTE(review): assumes both getters return DataFrames — DataSet is defined in
# util_data and is not visible here.
train = pd.concat([data.get_training_set(), data.get_validation_set()])
test = data.get_testing_set()

In [None]:
# Expose "payment_method_id_<lambda>" under a plain identifier and discard the
# original column (presumably named by a lambda aggregation upstream — TODO confirm).
# The copy-then-drop form places the new column last in each frame.
train["payment_method_id_lambda"] = train["payment_method_id_<lambda>"]
train = train.drop(columns="payment_method_id_<lambda>")

test["payment_method_id_lambda"] = test["payment_method_id_<lambda>"]
test = test.drop(columns="payment_method_id_<lambda>")

In [None]:
def _first_token_as_int(chaine):
    """Return the text before the first space of `chaine`, parsed as an int."""
    return int(chaine.split(" ")[0])


# The literal string "nan" is mapped to "0" first so every entry can be parsed.
# NOTE(review): only the string "nan" is handled here; a real float NaN would
# fail on .split — confirm the column is always textual at this point.
train["TimeSinceReg"] = train["TimeSinceReg"].replace("nan", "0").apply(_first_token_as_int)
test["TimeSinceReg"] = test["TimeSinceReg"].replace("nan", "0").apply(_first_token_as_int)

# Only the test frame drops these two columns; the equivalent drop on train is disabled.
test = test.drop(columns=["date_avg_6mo", "date_avg_1mo"])

# Give test exactly the train columns (minus the target), in the same order.
cols = list(train.columns)
cols.remove("is_churn")
test = test[cols]

# Any remaining missing value becomes 0 in both frames.
train, test = train.fillna(0), test.fillna(0)

In [None]:
# Median age over the rows whose "bd" lies strictly between 13 and 80.
# A single combined mask is used instead of chaining two .loc calls: the
# chained form applied a mask built on the full frame to an already filtered
# frame, which forces an index realignment and breaks when row labels repeat
# (possible after stacking two sets without resetting the index).
valid_age = (train["bd"] > 13) & (train["bd"] < 80)
med_age = np.median(train.loc[valid_age, "bd"])


def age_correction(x):
    """Return `x` unless it is below 13 or above 80, in which case return `med_age`.

    Values of exactly 13 or 80 are returned unchanged, although they are not
    part of the sample `med_age` was computed from.
    """
    if x < 13 or x > 80:
        return med_age
    return x


train["bd"] = train["bd"].apply(age_correction)
test["bd"] = test["bd"].apply(age_correction)

In [None]:
# Pairs of columns that are merged into one label-encoded feature each.
combs = [
    ("payment_method_id_lambda", "payment_method_id_last"),
    ("actual_amount_paid_last", "actual_amount_paid_mean"),
    ("num_25_avg_6mo", "num_25_avg_3mo"),
    ("num_50_avg_6mo", "num_50_avg_3mo"),
    ("num_985_avg_6mo", "num_985_avg_3mo"),
    ("num_unq_avg_6mo", "num_unq_avg_3mo"),
    ("count_6mo", "count_3mo"),
    ("total_secs_avg_6mo", "total_secs_avg_3mo"),
    ("plan_list_price_mean", "plan_list_price_last"),
    ("num_unq_avg_1mo", "total_secs_avg_1mo"),
    ("is_auto_renew_median", "is_auto_renew_last"),
]

for f1, f2 in combs:
    name1 = "_plus_".join((f1, f2))

    # Build the combined key "<f1 value>_<f2 value>" as text in both frames.
    train[name1] = train[f1].apply(str) + "_" + train[f2].apply(str)
    test[name1] = test[f1].apply(str) + "_" + test[f2].apply(str)

    # Fit one encoder on the keys of both frames so each key gets the same
    # integer code in train and in test.
    lbl = LabelEncoder()
    lbl.fit(train[name1].tolist() + test[name1].tolist())
    train[name1] = lbl.transform(train[name1].tolist())
    test[name1] = lbl.transform(test[name1].tolist())

    # The two source columns are replaced by the combined feature.
    train = train.drop(columns=[f1, f2])
    test = test.drop(columns=[f1, f2])

In [None]:
# Remove the explicitly listed features from both frames.
train = train.drop(columns=deleted_features)
test = test.drop(columns=deleted_features)

# Resolve the "num_75" column names from train once, while train still has
# them, then remove those columns from both frames.
num_75_cols = list(train.filter(like="num_75").columns)
test = test.drop(columns=num_75_cols)
train = train.drop(columns=num_75_cols)

In [None]:
# Seaborn style
sns.set_style("whitegrid")

# Correlation matrix of the training features, rounded for readable annotations.
# Since pandas 2.0, DataFrame.corr() no longer skips non-numeric columns by
# default and raises on them, so the numeric/bool columns are selected
# explicitly; this matches the older implicit behaviour on every pandas version.
# NOTE(review): whether train still holds non-numeric columns at this point is
# not visible here — if it does not, the selection is a no-op.
cor_matrix = train.select_dtypes(include=["number", "bool"]).corr().round(2)

# Annotated heatmap on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    cor_matrix,
    annot=True,
    center=0,
    cmap=sns.diverging_palette(250, 10, as_cmap=True),
    ax=ax,
)
ax.set_title("Feature correlation matrix (train)")
plt.show()

In [None]:
# Write both engineered frames to CSV, without the row index.
for frame, path in ((train, "data/better_train.csv"), (test, "data/better_test.csv")):
    frame.to_csv(path, index=False)