## Problem Statement
When the performance of your model is not good, revisit your data. This is what we will do in this notebook. 
We will have a look at our data and try to engineer some features that the model can use to improve its performance and stability.


In [None]:
# function
import os
import sys

# Project root the notebook switches into (machine-specific absolute path).
new_directory = "E:/airflow/airflow"
# Directory the kernel is in right now, captured before any chdir happens.
current_directory = os.getcwd()
# The "scripts" folder next to that directory, resolved to an absolute path.
scripts_path = os.path.abspath(os.path.join(current_directory, os.pardir, 'scripts'))

def change_directory(current_directory, new_directory, scripts_path):
    """Switch the working directory and make the scripts folder importable.

    The directory change is best-effort: a failure is reported with a printed
    message and does not raise, so the rest of the notebook can still run.
    The scripts folder is registered on ``sys.path`` whether or not the
    change succeeded.

    Args:
        current_directory: Directory to report as the starting point (only
            used for the first message).
        new_directory: Directory to switch into.
        scripts_path: Folder to add to ``sys.path``.

    Returns:
        None.
    """
    print(f'Current directory: {current_directory}')

    try:
        os.chdir(new_directory)
        # Re-read the cwd instead of echoing the argument so the message
        # confirms what actually happened.
        current_directory = os.getcwd()
        print(f'Current directory changed to: {current_directory}')
    except FileNotFoundError:
        print(f'Error: The directory "{new_directory}" does not exist.')
    except PermissionError:
        print(f'Error: Permission denied to change to "{new_directory}".')
    except Exception as e:
        print(f'An unexpected error occurred: {e}')

    # Notebook cells get re-run; only register the folder once so sys.path
    # does not accumulate duplicate entries.
    if scripts_path not in sys.path:
        sys.path.append(scripts_path)

    
# Switch into the project root and put the scripts folder on sys.path.
change_directory(current_directory, new_directory, scripts_path)

In [None]:
# Directory layout -- every location hangs off the project root
root_folder = new_directory
database_path = f"{root_folder}/database/"
data_directory = f"{root_folder}/data/raw/"
data_profile_path = f"{root_folder}/data/profile_report/"
intermediate_data_path = f"{root_folder}/data/interim/"
final_processed_data_path = f"{root_folder}/data/processed/"

# Second set of names; two of them point at the same locations as above
old_data_directory = data_directory
new_data_directory = f"{root_folder}/data/new/"
intermediate_path = intermediate_data_path


# Database
db_path = database_path
db_file_name = "feature_store_v01.db"
drift_db_name = "drift_db_name.db"
# Misspelled twin of drift_db_name, kept so any existing reference still resolves
drfit_db_name = drift_db_name
date_columns = [
    'registration_init_time',
    'transaction_date_min',
    'transaction_date_max',
    'membership_expire_date_max',
    'last_login',
]

# Mlflow
mlflow_tracking_uri = "http://Localhost:6006"
ml_flow_path = f"{root_folder}/mlruns/2/cb66e22bcbf74ded99dc219eb29e7609"
ml_flow_model_path = f"{ml_flow_path}/artifacts/models/"

# Run switches
run_on = "old" #"old"
append = False
date_transformation = False
start_date = '2017-03-01'
end_date = '2017-03-31'
mlflow_experiment_name = "Model_Building_Pipeline_Drift"

### 1.1 Importing packages

In [None]:
import warnings
# Suppress every warning for the rest of the session (keeps cell output short,
# but also hides deprecation and dtype warnings).
warnings.filterwarnings("ignore")

In [None]:
# Imported Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scripts.utils import *
from pycaret.classification import *
# Other Libraries
import mlflow

 ### 1.2 Reading Data

We will be using the raw data for our analysis instead of the sampled one so that we can better judge the features that we create. But before moving on with our analysis, it is advised that you revisit the EDA that we performed previously.
 
* Recall that we had 4 categories of data, User Profile data, user logs, transactions, and historic data.
* Here we will try to create features that better represent the user’s engagement and the transactions that the user made.
* But before that let’s load and clean the raw data.
* Recall that during our preliminary analysis we found that the merging of the data was simply done as a common aggregation. This is the first thing that needs to be improved.

In [None]:
%%time

# Data pipeline: read the four raw CSV extracts in a single call.
# The order of the file names fixes the order of the returned frames.
raw_csv_names = (
    "members_profile.csv",
    "userlogs.csv",
    "transactions_logs.csv",
    "churn_logs.csv",
)
members, user_logs, transactions, train = load_data(
    [f"{data_directory}{csv_name}" for csv_name in raw_csv_names]
)

In [None]:
# (rows, columns) of each raw frame, one per line
print(members.shape, transactions.shape, user_logs.shape, train.shape, sep="\n")

 ### 1.3 Data cleaning
    
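Compressing the dataframes to reduce their memory footprint; the date columns are converted to date-time in the pre-processing steps that follow.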

In [None]:
%%time
# Each result is indexable: [0] is the compressed frame, [1] and [2] are the
# sizes reported in the next cell. Keep the results around for that report.
members_c, transactions_c, user_logs_c = compress_dataframes([members, transactions, user_logs])
members, transactions, user_logs = (
    compressed[0] for compressed in (members_c, transactions_c, user_logs_c)
)

In [None]:
# Same report line for each frame: size before and after compression
for frame_label, compression_result in (
    ("members", members_c),
    ("transactions", transactions_c),
    ("user_logs", user_logs_c),
):
    print(f"{frame_label} DF before compress was in MB ,", compression_result[1], "and after compress , ", compression_result[2])

### 1.4 Data pre-processing

##### 1.4.1 Members data

In [None]:
# Preview the first rows of the compressed members frame
members.head()

In [None]:
# #these functions are also available in utils.py
# def get_label_encoding_dataframe(dataframe, column_name, mapping_dict):
#     return dataframe[column_name].map(mapping_dict) 
# # #average_age if (x <=0 or x >100) else x
# def get_apply_condiiton_on_column(dataframe, column_name, condition):
#     return dataframe[column_name].apply(lambda x :eval(condition))

In [None]:
%%time

# Missing gender becomes its own "others" class, then the classes are label-encoded
members['gender'] = get_fill_na_dataframe(members, 'gender', value="others")

gender_mapping = {'male':0,'female':1,'others':2}
members['gender'] = get_label_encoding_dataframe(members, 'gender',gender_mapping)


# Cast the code columns to strings and convert the registration date
members['registered_via'] = get_convert_column_dtype(members, 'registered_via', data_type='str')
members['city'] = get_convert_column_dtype(members, 'city', data_type='str')
members['registration_init_time'] = fix_time_in_df(members, 'registration_init_time', expand=False)

# Ages outside (0, 100] are treated as invalid and replaced by the average age.
# The average is taken over the valid ages only: including the invalid entries
# (zeros, negatives, values above 100) would bias the very value used to fill them.
valid_age_mask = (members['bd'] > 0) & (members['bd'] <= 100)
average_age = round(members.loc[valid_age_mask, 'bd'].mean(),2)
condition = f"{average_age} if (x <=0 or x >100) else x"
members['bd'] = get_apply_condiiton_on_column(members, 'bd', condition)

members.head()

In [None]:
# Observe the distribution of the columns of the cleaned members frame
get_data_describe(members)

##### 1.4.2 Transactions data

In [None]:
%%time
# Date conversion for the two date columns of the transactions frame
for date_column in ('transaction_date', 'membership_expire_date'):
    transactions[date_column] = fix_time_in_df(transactions, date_column, expand=False)
transactions.head()

### 2 Feature Engineering

#### 2.1 Generating features from transactions data


* **is_discount**
Recall that in our dataset there are 2 columns named “plan_list_price” and “actual_amount_paid”. From here we can figure out if a user bought the plan at a discounted price or not by checking whether the amount paid by the user is smaller than the actual plan’s price or not. This feature is stored in “is_discount” where
	1 represents that the plan was bought at a discounted price
	0 represents that the plan was bought at the original price
We will also store the discount that the user received in “discount”
 
* **amt_per_day**
We will now create a feature that calculates the per-day cost of a user’s subscription. It is expected that if the per-day cost of the subscription is high then the propensity of the user to churn increases. We will store this information in a column called “amt_per_day”.
 
* **membership_duration**
We also expect the older customer to have a lower probability to churn, thus we will create a feature “membership_duration” which will hold the number of days between the transaction date and the membership expiry date of each transaction.
 
After creating and storing the above-mentioned features in “transactions.csv” we will generate a profile report for the same


In [None]:
#these functions are also present in utils.py
# def get_two_column_operations(dataframe, columns_1, columns_2, operator):
#     if operator == "+":
#         return dataframe[columns_1]+dataframe[columns_2]
#     elif operator == "-":
#         return dataframe[columns_1]-dataframe[columns_2]
#     elif operator == "/":
#         return dataframe[columns_1]/dataframe[columns_2]
#     elif operator == "*":
#         return dataframe[columns_1]*dataframe[columns_2]
    
# def get_timedelta_division(dataframe, column, td_type='D'):
#     return dataframe[column] /np.timedelta64(1,td_type)

# def get_replace_value_in_df(dataframe, column, value, replace_with):
#     return dataframe[column].replace(value,replace_with) 

In [None]:
%%time

# Discount on each transaction: list price minus the amount actually paid
transactions['discount'] =  get_two_column_operations(transactions, 'plan_list_price', 'actual_amount_paid', "-")

# 1 when the amount paid was below the list price, otherwise 0
condition = "1 if x > 0 else 0"
transactions['is_discount'] = get_apply_condiiton_on_column(transactions, 'discount', condition)


# Amount paid per plan day. A zero-day plan breaks the division:
# amount/0 gives +/-inf and 0/0 gives NaN. All three are mapped to 0 so the
# 0/0 rows are treated the same way as the other zero-day rows.
transactions['amt_per_day'] = get_two_column_operations(transactions, 'actual_amount_paid', 'payment_plan_days', "/")
transactions['amt_per_day'] = get_replace_value_in_df(transactions, 'amt_per_day', [np.inf, -np.inf, np.nan], replace_with=0)


# Days between the transaction date and the membership expiry date, as an integer.
# Each step reads the column written by the previous one, so the order matters.
transactions['membership_duration'] = get_two_column_operations(transactions, 'membership_expire_date', 'transaction_date', "-")
transactions['membership_duration'] = get_timedelta_division(transactions, "membership_duration", td_type='D')
transactions['membership_duration'] = get_convert_column_dtype(transactions, 'membership_duration', data_type='int')

# 1 when that duration is longer than 30 days, otherwise 0
condition = "1 if x>30 else 0"
transactions['more_than_30'] = get_apply_condiiton_on_column(transactions, 'membership_duration', condition)

In [None]:
# Preview the transactions frame with the newly added feature columns
transactions.head()

We will apply different aggregation techniques on each column to derive additional features that better map the relationship between the independent and dependent variables.

In [None]:
# Per-column aggregation recipes used when grouping the transactions by user.
# Columns given a list produce one output column per listed aggregation.
agg = {'payment_method_id':['count','nunique'], # Number of transactions of the user, and number of distinct payment methods used
       'payment_plan_days':['mean', 'nunique'] , # Average plan length in days, and number of distinct plan lengths
       'plan_list_price':'mean', # Average list price charged to the user
       'actual_amount_paid':'mean', # Average amount paid by the user
       'is_auto_renew':['mean','max'], # Average of the auto-renew flag and its maximum (assuming a 0/1 flag: whether it was ever set)
       'transaction_date':['min','max','count'], # First and last transaction date of the user, and number of transactions
       'membership_expire_date':'max' , # Latest membership expiry date across the user's transactions
       'is_cancel':['mean','max'], # Average of is_cancel and its maximum (assuming a 0/1 flag: whether the user ever cancelled)
       'discount' : 'mean', # Average discount given to the user
       'is_discount':['mean','max'], # Average of is_discount and its maximum (whether the user ever received a discount)
       'amt_per_day' : 'mean', # Average amount the user pays per plan day
       'membership_duration' : 'mean' ,# Average membership duration in days
       'more_than_30' : 'sum' # Number of transactions whose membership duration exceeds 30 days
        }

In [None]:
# Aggregate the transactions per user (msno) with the recipes in `agg`
transactions_features = get_groupby(transactions, by_column='msno', agg_dict=agg, agg_func = 'mean', simple_agg_flag=False, reset_index=True)

# Collapse the two-level labels into "<column>_<aggregation>"; the key column,
# whose second level is empty, comes out as "msno_" and is renamed below.
transactions_features.columns = [
    f"{source_column}_{aggregation}"
    for source_column, aggregation in transactions_features.columns
]

# NOTE: 'total_payment_channels' is the row count of payment_method_id, i.e. it
# counts transactions, not distinct channels.
readable_names = {
    'msno_': 'msno',
    'payment_plan_days_nunique': 'change_in_plan',
    'payment_method_id_count': 'total_payment_channels',
    'payment_method_id_nunique': 'change_in_payment_methods',
    'is_cancel_max': 'is_cancel_change_flag',
    'is_auto_renew_max': 'is_autorenew_change_flag',
    'transaction_date_count': 'total_transactions',
}
transactions_features = transactions_features.rename(columns=readable_names)
transactions_features.head()

In [None]:
# (rows, columns) of the per-user transaction feature table
transactions_features.shape

#### 2.2 Generating features from user logs

Here we will engineer features that better represent a user’s behavior. We will try to measure the user’s engagement with the platform.

* **login_frequency**
A decent way to quantify a user’s engagement is to simply count the number of times the user has used the platform in a given period of time. We create this feature and store it in the “login_freq” column. We expect that a user who is engaged with the platform will have less propensity to churn.
 
* **last_login**
A user who has not been active recently has a higher propensity to churn. We create a feature that holds the date of the user’s last login and store it in the "last_login" column.

In [None]:
# Preview the first rows of the compressed user logs
user_logs.head()

In [None]:
# Describe the raw user-log columns before any transformation
get_data_describe(user_logs)

In [None]:
# Date conversion for the log date column
user_logs['date'] = fix_time_in_df(user_logs, column_name='date', expand=False)

# Numeric activity columns handed to the skew fix; infinities produced by it
# are replaced with 0 (presumably a log transform -- see get_fix_skew_with_log).
skewed_columns = ['num_25','num_50','num_75','num_985','num_100','num_unq','total_secs']
user_logs_transformed = get_fix_skew_with_log(
    user_logs,
    skewed_columns,
    replace_inf = True,
    replace_inf_with = 0,
)
user_logs_transformed.head()

In [None]:
# Describe the same columns after the skew fix, for comparison with the raw logs
get_data_describe(user_logs_transformed)

In [None]:
# Per-user (msno) aggregation of the transformed logs with the 'mean' function.
# NOTE(review): which columns are averaged is decided inside get_groupby --
# confirm how the 'date' column is treated there.
user_logs_transformed_base = get_groupby(user_logs_transformed,'msno', agg_dict=None, agg_func = 'mean', simple_agg_flag=True, reset_index=True)
user_logs_transformed_base.head()

In [None]:
# Per-user login statistics: number of log rows and the most recent log date
agg_dict = { 'date':['count','max'] }
user_logs_transformed_dates = get_groupby(user_logs_transformed,'msno', agg_dict=agg_dict, agg_func = 'mean', simple_agg_flag=False, reset_index=True)

# Normalise the row index and give the three columns (key, count, max) their
# final flat names in one step. This replaces the earlier droplevel / rename /
# reset_index / drop('index') round trip, which was overridden by the final
# column assignment anyway.
user_logs_transformed_dates = user_logs_transformed_dates.reset_index(drop=True)
user_logs_transformed_dates.columns = ['msno','login_freq','last_login']
user_logs_transformed_dates.head()

In [None]:
# Combine the averaged log features with the login statistics, matched on the user id (msno).
# NOTE(review): the join type is get_merge's default here -- confirm it keeps the intended users.
user_logs_final = get_merge(user_logs_transformed_base, user_logs_transformed_dates, on = 'msno') 
user_logs_final.head()

### Joining the dataset

In [None]:
# (rows, columns) of each table going into the join, one per line
print(members.shape, train.shape, transactions_features.shape, user_logs_final.shape, sep="\n")

In [None]:
%%time
# All three joins use the same settings: inner join on the user id.
# Presumably only users present in every table survive -- verify against get_merge.
inner_on_user = {'on': 'msno', 'axis': 1, 'how': 'inner'}
train_df_v01 = get_merge(members, train, **inner_on_user)
train_df_v02 = get_merge(train_df_v01, transactions_features, **inner_on_user)
train_df_final = get_merge(train_df_v02, user_logs_final, **inner_on_user)
train_df_final.head()

#### Registration Duration
* It is important to understand how long the customer has been part of the system. We can calculate it using the columns 'membership_expire_date_max' &  'registration_init_time'

In [None]:
# Registration duration = latest membership expiry date minus registration date.
# The three steps each read the column written by the previous one:
# 1) date difference, 2) division by a one-day timedelta, 3) cast to int.
# NOTE(review): the int cast presumably fails if either date is missing -- confirm the dates are complete.
train_df_final['registration_duration'] = get_two_column_operations(train_df_final, 'membership_expire_date_max', 'registration_init_time', "-")
train_df_final['registration_duration'] = get_timedelta_division(train_df_final, "registration_duration", td_type='D')
train_df_final['registration_duration'] = get_convert_column_dtype(train_df_final, 'registration_duration', data_type='int')

In [None]:
# Preview the merged frame including the new registration_duration column
train_df_final.head()

In [None]:
%%time
# Profile the merged frame: no HTML save path, embedded in the cell, sampling switched off
get_data_profile(
    train_df_final,
    html_save_path=None,
    embed_in_cell=True,
    take_sample=False,
    sample_frac=0.01,
    dataframe_name='train_df_final',
)

### Saving the dataset

In [None]:
%%time
# Persist the engineered training frame under the processed-data folder
get_save_intermediate_data(train_df_final, path=final_processed_data_path, filename="final_train_data_process")