# CREATE TRAIN DATASET

In [None]:
import pandas as pd 
import numpy as np

In [None]:

# Load the first 10 monthly snapshots (2015-01 through 2015-10), one
# DataFrame per month, in calendar order.
(
    month1, month2, month3, month4, month5,
    month6, month7, month8, month9, month10,
) = (
    pd.read_csv(f'data/batch/2015-{month_number:02d}-28.csv')
    for month_number in range(1, 11)
)


## CUT OFF DATE 7/2015

- Only customers who do not hold the account (e.g. `savings_account = 0`) at the cut-off date can be labelled 
- Label_1: the account converts from 0 -> 1 within the next 3 months (8, 9, 10/2015) 
- Label_0: the account stays 0 -> 0 over the next 3 months 


- Concatenate the first 7 months and set every row's account holdings to the customer's holdings at 7/2015 

In [None]:
# Print a summary (column dtypes, non-null counts, memory usage) of the first monthly snapshot.
month1.info()

In [None]:
# Product-holding columns; each one becomes a separate prediction target.
account_columns = [
    'savings_account',
    'guarantees',
    'current_accounts',
    'deriv_investments',
    'payroll_accounts',
    'junior_accounts',
    'more_particular_accounts',
    'particular_accounts',
    'particular_plus_accounts',
    'short_term_deposits',
    'medium_term_deposits',
    'long_term_deposits',
    'e_account',
    'funds',
    'mortgage',
    'pensions',
    'loans',
    'taxes',
    'credit_card',
    'securities',
    'home_account',
    'payroll',
    'pensions_2',
    'direct_debit',
]

# Stack the seven training months (1..7) row-wise.
training_frames = [month1, month2, month3, month4, month5, month6, month7]
df = pd.concat(training_frames)

# Holdings as of the cut-off month (7/2015), keyed by customer.
month7_accounts = month7[['customer_id', *account_columns]]

# Replace each row's own monthly holdings with the customer's cut-off
# holdings: drop the per-month columns, then left-join the month-7 values.
df_no_accounts = df.drop(columns=account_columns)
df_final = pd.merge(
    df_no_accounts,
    month7_accounts,
    how='left',
    on='customer_id',
)



## Create labels 

Starting from the cut-off date, look ahead over the next 3 months (8, 9, 10). 

- Label_1: 0->1 
- Label_0: 0->0

In [None]:
# Stack the three look-ahead months (8, 9, 10) into a single frame.
future_months = pd.concat([month8, month9, month10])

# Every product column gets a `_label` suffix so it cannot collide with the
# cut-off holdings when the two tables are merged later.
label_columns = {name: name + '_label' for name in account_columns}

# Group by customer and take the max over the 3 months: with 0/1 flags this
# is 1 when the account was held in at least one of them.
labels = (
    future_months
    .groupby('customer_id')[account_columns]
    .max()
    .rename(columns=label_columns)
    .reset_index()
)

print(f"Shape của future_months: {future_months.shape}")
print(f"Shape của labels: {labels.shape}")
print(f"unique In labels: {labels['customer_id'].nunique()}")

labels.head()

In [None]:
# Display the per-customer label table (one row per customer_id, one `<account>_label` column per product).
labels

In [None]:
# Free the raw monthly snapshots and the intermediate frames.
# df_final, labels and account_columns are kept for the labelling step.
del (
    month1, month2, month3, month4, month5, month6, month7,
    month8, month9, month10,
    df, df_no_accounts, month7_accounts, future_months,
)


In [None]:
#  logic labels 
def create_final_labels(df_final, labels, account_columns):
    """Attach one `<account>_final_label` column per account to the training rows.

    Parameters:
    - df_final: DataFrame with `customer_id` and one column per account
      holding the customer's value at the cut-off month
    - labels: DataFrame with `customer_id` and one `<account>_label` column
      per account (holding observed in the look-ahead months)
    - account_columns: list of account column names

    Returns:
    - DataFrame with the account and `<account>_label` columns removed and an
      int8 `<account>_final_label` column per account:
        -1 -> account already held at the cut-off month
         1 -> not held at cut-off, held in the look-ahead months (0 -> 1)
         0 -> everything else (0 -> 0, or customer missing from `labels`)
    """
    joined = df_final.merge(labels, on='customer_id', how='left')

    future_cols = [f'{name}_label' for name in account_columns]

    for name, future_col in zip(account_columns, future_cols):
        already_held = joined[name].eq(1).to_numpy()
        # NaN (no match in `labels`) compares unequal to 1, so it falls to 0.
        newly_acquired = joined[future_col].eq(1).to_numpy() & ~already_held

        joined[f'{name}_final_label'] = np.select(
            [already_held, newly_acquired], [-1, 1], default=0
        ).astype(np.int8)

    # Only the derived targets are kept; the raw holdings are dropped.
    return joined.drop(columns=[*account_columns, *future_cols])


# Build the labelled training set from the cut-off features and future labels.
final_dataset = create_final_labels(df_final, labels, account_columns)

# Sanity check: distribution of the three label codes for one product.
savings_labels = final_dataset['savings_account_final_label']
print("Ví dụ cho savings_account:")
for caption, label_code in (
    ("Có account tại tháng 7 (label=-1):", -1),
    ("Convert 0->1 (label=1):", 1),
    ("Không convert 0->0 (label=0):", 0),
):
    print(caption, (savings_labels == label_code).sum())

print(f"\nShape của final_dataset: {final_dataset.shape}")
final_dataset.head(10)


In [None]:
# Print a summary (column dtypes, non-null counts, memory usage) of the labelled training set.
final_dataset.info()

In [None]:
# Save the labelled training set as CSV; index=False keeps the DataFrame
# index out of the file.
final_dataset.to_csv('data/processed/train.csv', index=False)


In [None]:

import gc
import psutil
import os

# Drop every large notebook variable that is still alive.
#
# Each name is removed independently. The monthly frames and intermediates
# are already deleted by an earlier cell, so a single `del a, b, ...` guarded
# by a bare `except` would stop at the first missing name and silently leave
# df_final / labels / final_dataset (the biggest objects) in memory.
_cleanup_targets = (
    'month1', 'month2', 'month3', 'month4', 'month5', 'month6', 'month7',
    'month8', 'month9', 'month10',
    'df', 'df_no_accounts', 'month7_accounts', 'future_months',
    'df_final', 'labels', 'final_dataset',
    'account_columns', 'label_columns',
)
_already_gone = [name for name in _cleanup_targets if name not in globals()]

for _target in _cleanup_targets:
    # pop() with a default never raises, so one missing name cannot abort
    # the rest of the cleanup.
    globals().pop(_target, None)

if _already_gone:
    print("Some variables already deleted")
else:
    print("Deleted all major variables")

# Run the collector three times and report how many objects each pass found.
for pass_number in range(1, 4):
    collected = gc.collect()
    print(f"Garbage collection {pass_number}: {collected} objects collected")

# Report the resident set size of this Python process in MB.
process = psutil.Process(os.getpid())
rss_bytes = process.memory_info().rss
memory_mb = rss_bytes / 1024 / 1024
print(f"Current memory usage: {memory_mb:.2f} MB")

print("Memory cleanup completed!")

# Create Test set

In [None]:
def _batch_month_file(batch_dir, month_index):
    """Map an absolute month index (year * 12 + month - 1) to its snapshot file.

    Returns a (path, year, month) tuple; the path follows the
    `<batch_dir>/YYYY-MM-28.csv` naming of the monthly batch files.
    """
    year, month_offset = divmod(month_index, 12)
    month = month_offset + 1
    return f'{batch_dir}/{year}-{month:02d}-28.csv', year, month


def _label_conversions(features, future_holdings, account_columns):
    """Turn cut-off holdings plus future holdings into `<account>_final_label` columns.

    For every account the int8 label is:
      -1 -> already held at the cut-off month
       1 -> not held at cut-off but held in the future months (0 -> 1)
       0 -> everything else (0 -> 0, or customer absent from the future months)
    The raw account and `<account>_label` columns are dropped from the result.
    """
    merged = features.merge(future_holdings, on='customer_id', how='left')
    future_cols = [f'{account}_label' for account in account_columns]

    for account, future_col in zip(account_columns, future_cols):
        held_at_cutoff = (merged[account] == 1).to_numpy()
        # NaN (no row in the future months) compares unequal to 1 -> label 0.
        acquired_later = ~held_at_cutoff & (merged[future_col] == 1).to_numpy()
        merged[f'{account}_final_label'] = np.select(
            [held_at_cutoff, acquired_later], [-1, 1], default=0
        ).astype(np.int8)

    return merged.drop(columns=account_columns + future_cols)


def create_dataset(cutoff_month, cutoff_year=2015, training_months=7, future_months_count=3, dataset_name="dataset",
                   batch_dir="data/batch", output_dir="data/processed"):
    """
    Create dataset with cutoff date and future months for labeling

    The training window is the `training_months` monthly snapshots ending at
    (and including) the cutoff month. Every training row gets the customer's
    account holdings at the cutoff month, and one `<account>_final_label`
    column per account (-1 already held at cutoff, 1 converts 0->1 in the
    future months, 0 otherwise). The result is also written to
    `<output_dir>/<dataset_name>.csv`.

    Parameters:
    - cutoff_month: int (1-12), cutoff month
    - cutoff_year: int, cutoff year (2015, 2016, etc.)
    - training_months: int >= 1, number of months in the training window
      (the window ends at the cutoff month)
    - future_months_count: int >= 1, number of months after the cutoff used
      to create labels
    - dataset_name: str, dataset name to save
    - batch_dir: str, directory holding the `YYYY-MM-28.csv` monthly files
    - output_dir: str, directory the CSV is written to (created if missing)

    Returns:
    - final_dataset: DataFrame

    Raises:
    - ValueError: if cutoff_month is outside 1-12, or training_months /
      future_months_count is smaller than 1
    - FileNotFoundError: if a required monthly file does not exist
    """
    import gc
    import os

    # Fail fast on bad arguments, before any large file is read.
    if not 1 <= cutoff_month <= 12:
        raise ValueError(f"cutoff_month must be in 1..12, got {cutoff_month}")
    if training_months < 1:
        raise ValueError(f"training_months must be >= 1, got {training_months}")
    if future_months_count < 1:
        raise ValueError(f"future_months_count must be >= 1, got {future_months_count}")

    # Define account columns
    account_columns = [
        'savings_account', 'guarantees', 'current_accounts', 'deriv_investments',
        'payroll_accounts', 'junior_accounts', 'more_particular_accounts',
        'particular_accounts', 'particular_plus_accounts', 'short_term_deposits',
        'medium_term_deposits', 'long_term_deposits', 'e_account', 'funds',
        'mortgage', 'pensions', 'loans', 'taxes', 'credit_card', 'securities',
        'home_account', 'payroll', 'pensions_2', 'direct_debit'
    ]

    print(f"Creating {dataset_name} with cutoff {cutoff_month}/{cutoff_year}")
    print(f"Training period: {training_months} months before cutoff")

    # Absolute month counter so windows can cross year boundaries.
    cutoff_index = cutoff_year * 12 + (cutoff_month - 1)

    # 1. Load the training months (window ends at the cutoff month)
    months_data = []
    for month_index in range(cutoff_index - training_months + 1, cutoff_index + 1):
        month_file, year, month = _batch_month_file(batch_dir, month_index)
        months_data.append(pd.read_csv(month_file))
        print(f"Loaded training month {month}/{year}")

    # 2. Account holdings at the cutoff month. The cutoff month is always the
    #    last training month, so its frame is reused instead of read again.
    cutoff_accounts = months_data[-1][['customer_id'] + account_columns]
    print(f"Loaded cutoff month {cutoff_month}/{cutoff_year}")

    # 3. Concatenate training months and swap in the cutoff holdings
    df_no_accounts = pd.concat(months_data, ignore_index=True).drop(columns=account_columns)
    df_final = df_no_accounts.merge(cutoff_accounts, on='customer_id', how='left')
    # Release the raw frames before the next large allocation.
    del months_data, df_no_accounts, cutoff_accounts

    print(f"Training data shape: {df_final.shape}")

    # 4. Load future months to create labels; only the key and the account
    #    columns are needed, so nothing else is read.
    label_source_columns = ['customer_id'] + account_columns
    future_data = []
    for month_index in range(cutoff_index + 1, cutoff_index + future_months_count + 1):
        future_file, year, month = _batch_month_file(batch_dir, month_index)
        future_data.append(pd.read_csv(future_file, usecols=label_source_columns))
        print(f"Loaded future month {month}/{year}")

    # 5. Create labels from future months: per customer, the max of each
    #    account column across the future months
    labels = (
        pd.concat(future_data, ignore_index=True)
        .groupby('customer_id')[account_columns]
        .max()
        .rename(columns={col: f'{col}_label' for col in account_columns})
        .reset_index()
    )
    del future_data

    print(f"Labels shape: {labels.shape}")

    # 6. Create final labels with conversion logic
    final_dataset = _label_conversions(df_final, labels, account_columns)

    # Memory cleanup: the inputs are no longer needed once labelled.
    del df_final, labels
    gc.collect()

    # 7. Print statistics
    print(f"\n{dataset_name} Statistics:")
    print(f"Final dataset shape: {final_dataset.shape}")
    print("Example for savings_account:")
    print(f"Already has account at cutoff (label=-1): {(final_dataset['savings_account_final_label'] == -1).sum()}")
    print(f"Convert 0->1 (label=1): {(final_dataset['savings_account_final_label'] == 1).sum()}")
    print(f"No conversion 0->0 (label=0): {(final_dataset['savings_account_final_label'] == 0).sum()}")

    # 8. Save dataset (create the target directory so the write cannot fail
    #    on a missing folder after all the work above)
    os.makedirs(output_dir, exist_ok=True)
    output_file = f'{output_dir}/{dataset_name}.csv'
    final_dataset.to_csv(output_file, index=False)
    print(f"Dataset saved to {output_file}")

    return final_dataset


In [None]:


# First test set: cutoff at 8/2015, 7 training months ending at the cutoff,
# labels taken from the 3 months that follow.
test1 = create_dataset(
    cutoff_month=8,
    cutoff_year=2015,
    training_months=7,
    future_months_count=3,
    dataset_name="test_8_2015",
)



In [None]:
# Display the dataset returned for the 8/2015 cutoff.
test1 

In [None]:
# Build one test set per monthly cutoff from 9/2015 through 2/2016, each with
# a 7-month training window and a 3-month look-ahead for labels.
#
# The dataset name is derived from the cutoff so it cannot drift out of sync
# with the arguments. The returned frames are deliberately neither kept nor
# echoed as cell output: create_dataset already writes every dataset to disk.
test_cutoffs = [
    (9, 2015),
    (10, 2015),
    (11, 2015),
    (12, 2015),
    (1, 2016),
    (2, 2016),
]

for cutoff_month, cutoff_year in test_cutoffs:
    create_dataset(
        cutoff_month=cutoff_month,
        cutoff_year=cutoff_year,
        training_months=7,
        future_months_count=3,
        dataset_name=f"test_{cutoff_month}_{cutoff_year}",
    )

