# Feature Engineering (App Data)

This notebook consolidates the feature engineering workflow and prepares a
model-ready dataset for churn prediction (`enrolled`).

In [1]:
import sys
from pathlib import Path

# Add project root to Python path
ROOT_DIR = Path.cwd().parent
sys.path.append(str(ROOT_DIR))

import pandas as pd
import numpy as np

from src.preprocessing import AppDataConfig, basic_feature_engineering, create_behavioral_features, train_test_split_df

# Paths
NOTEBOOK_DIR = Path.cwd()
BASE_DIR = NOTEBOOK_DIR.parent
DATA_DIR = BASE_DIR / "Dataset"

APPDATA_PATH = DATA_DIR / "appdata10.csv"
TOP_SCREENS_PATH = DATA_DIR / "top_screens.csv"

APPDATA_PATH, TOP_SCREENS_PATH

(PosixPath('/Users/fjo/Desktop/Dissertation/Dataset/appdata10.csv'),
 PosixPath('/Users/fjo/Desktop/Dissertation/Dataset/top_screens.csv'))

In [2]:
df = pd.read_csv(APPDATA_PATH)
config = AppDataConfig(target_col="enrolled")
df_processed = basic_feature_engineering(df, config)


In [3]:
df_processed = create_behavioral_features(df_processed)

df_processed[["engagement_intensity", "is_active_user"]].describe()

Unnamed: 0,engagement_intensity,is_active_user
count,50000.0,50000.0
mean,0.726736,0.24856
std,0.621155,0.432182
min,0.013158,0.0
25%,0.297297,0.0
50%,0.566667,0.0
75%,0.964286,0.0
max,15.47619,1.0


## 1. Load raw data

In [4]:
df = pd.read_csv(APPDATA_PATH)
top_screens = pd.read_csv(TOP_SCREENS_PATH)

print("Main dataset shape:", df.shape)
print("Top screens shape:", top_screens.shape)

df.head(), top_screens.head()

Main dataset shape: (50000, 12)
Top screens shape: (58, 2)


(     user               first_open  dayofweek       hour  age  \
 0  235136  2012-12-27 02:14:51.273          3   02:00:00   23   
 1  333588  2012-12-02 01:16:00.905          6   01:00:00   24   
 2  254414  2013-03-19 19:19:09.157          1   19:00:00   23   
 3  234192  2013-07-05 16:08:46.354          4   16:00:00   28   
 4   51549  2013-02-26 18:50:48.661          1   18:00:00   31   
 
                                          screen_list  numscreens  minigame  \
 0  idscreen,joinscreen,Cycle,product_review,ScanP...          15         0   
 1  joinscreen,product_review,product_review2,Scan...          13         0   
 2                                  Splash,Cycle,Loan           3         0   
 3  product_review,Home,product_review,Loan3,Finan...          40         0   
 4  idscreen,joinscreen,Cycle,Credit3Container,Sca...          32         0   
 
    used_premium_feature  enrolled            enrolled_date  liked  
 0                     0         0                      N

## 2. Screen list features

In [5]:
TOP_SCREENS_COL = "top_screens"

top_screens_list = top_screens[TOP_SCREENS_COL].astype(str).tolist()
len(top_screens_list), top_screens_list[:10]

(58,
 ['Loan2',
  'location',
  'Institutions',
  'Credit3Container',
  'VerifyPhone',
  'BankVerification',
  'VerifyDateOfBirth',
  'ProfilePage',
  'VerifyCountry',
  'Cycle'])

In [6]:
df["screen_list"] = df["screen_list"].astype(str)

for screen in top_screens_list:
    col_name = screen.strip()
    if not col_name:
        continue
    df[col_name] = df["screen_list"].str.contains(col_name, na=False).astype(int)

df.shape

(50000, 70)

In [7]:
top_screen_cols = [s for s in top_screens_list if s in df.columns]
df["num_top_screens_visited"] = df[top_screen_cols].sum(axis=1)

# Screens not in the top list
df["num_other_screens"] = df["numscreens"] - df["num_top_screens_visited"]
df["num_other_screens"] = df["num_other_screens"].clip(lower=0)

In [8]:
df[["numscreens", "num_top_screens_visited", "num_other_screens"]].describe()

Unnamed: 0,numscreens,num_top_screens_visited,num_other_screens
count,50000.0,50000.0,50000.0
mean,21.0959,7.83002,13.27234
std,15.728812,5.097565,11.975931
min,1.0,0.0,0.0
25%,10.0,4.0,5.0
50%,18.0,7.0,10.0
75%,28.0,10.0,18.0
max,325.0,45.0,280.0


## 3. Advanced screen behavior features

In [9]:
def parse_screen_list(x):
    if not isinstance(x, str):
        return []
    return [s.strip() for s in x.split(",") if s.strip()]

screen_tokens = df["screen_list"].apply(parse_screen_list)

Do longer and more repetitive onboarding navigation sequences reflect higher user engagement, or do they indicate friction and difficulty progressing through the onboarding process?

In [10]:
# total screens visited, number of unique screens, and repetition ratio
df["screen_list_length"] = screen_tokens.apply(len)
df["unique_screens_count"] = screen_tokens.apply(lambda x: len(set(x)))
df["repeat_screen_ratio"] = df["screen_list_length"] / df["unique_screens_count"].replace(0, 1)

Does the screen where users begin and end their onboarding session reveal systematic entry points and drop-off locations associated with early churn?

In [11]:
# Extract the first and last screen visited in each user session
df["first_screen"] = screen_tokens.apply(lambda x: x[0] if x else None)
df["last_screen"] = screen_tokens.apply(lambda x: x[-1] if x else None)

Do interactions with different functional screen categories during onboarding correspond to distinct behavioural patterns linked to completion or abandonment?

In [14]:
# Define screen categories based on application functionality
loan_screens = ["Loan2", "Loan3", "Loan4", "Loan5", "Cycle", "Cycle1", "Credit3Container"]
financial_screens = ["Institutions", "BankVerification", "BankVerify", "VerifyPhone"]
profile_screens = ["ProfilePage", "VerifyDateOfBirth", "VerifyCountry"]
onboarding_screens = ["Location", "Splash", "Home"]

Does the presence and availability of different screen categories in a user’s interaction data reflect meaningful behavioural pathways during onboarding?

In [15]:
# Select available columns corresponding to each screen category
loan_cols = [c for c in loan_screens if c in df.columns]
financial_cols = [c for c in financial_screens if c in df.columns]
profile_cols = [c for c in profile_screens if c in df.columns]
onboarding_cols = [c for c in onboarding_screens if c in df.columns]

Do the frequencies of interactions with different screen categories distinguish users who complete onboarding from those who churn early?

In [16]:
# Count how many times each screen category was visited per user
df["loan_screen_count"] = df[loan_cols].sum(axis=1) if loan_cols else 0
df["financial_screen_count"] = df[financial_cols].sum(axis=1) if financial_cols else 0
df["profile_screen_count"] = df[profile_cols].sum(axis=1) if profile_cols else 0
df["onboarding_screen_count"] = df[onboarding_cols].sum(axis=1) if onboarding_cols else 0

Does the relative distribution of user attention across screen categories reveal behavioural focus patterns associated with onboarding success or abandonment?

In [16]:
# Compute relative ratios of screen category interactions
# Normalised by total number of screens to capture behavioural focus
denom = df["numscreens"].replace(0, 1)
df["loan_screen_ratio"] = df["loan_screen_count"] / denom
df["financial_screen_ratio"] = df["financial_screen_count"] / denom
df["profile_screen_ratio"] = df["profile_screen_count"] / denom
df["onboarding_screen_ratio"] = df["onboarding_screen_count"] / denom
df["top_screen_ratio"] = df["num_top_screens_visited"] / denom

Does the category of the first and last screen visited during onboarding reveal systematic entry points and exit patterns associated with early churn or successful progression?

In [17]:
# Create binary indicators for the category of the first and last visited screen
df["first_screen_is_loan"] = df["first_screen"].isin(loan_screens).astype(int)
df["first_screen_is_financial"] = df["first_screen"].isin(financial_screens).astype(int)
df["first_screen_is_profile"] = df["first_screen"].isin(profile_screens).astype(int)

df["last_screen_is_loan"] = df["last_screen"].isin(loan_screens).astype(int)
df["last_screen_is_profile"] = df["last_screen"].isin(profile_screens).astype(int)

Do coarse-grained indicators of engagement and feature interaction effectively distinguish between users who complete onboarding and those who disengage early?

In [18]:
# Create high-level behavioural flags capturing engagement and interaction presence
df["is_heavy_user"] = (df["numscreens"] > df["numscreens"].median()).astype(int)
df["any_topscreen_interaction"] = (df["num_top_screens_visited"] > 0).astype(int)
df["any_loan_interaction"] = (df["loan_screen_count"] > 0).astype(int)
df["any_financial_interaction"] = (df["financial_screen_count"] > 0).astype(int)
# Preview selected engineered behavioural features
df[[
    "screen_list_length",
    "unique_screens_count",
    "repeat_screen_ratio",
    "loan_screen_count",
    "financial_screen_count",
    "top_screen_ratio",
    "is_heavy_user",
]].head()

Unnamed: 0,screen_list_length,unique_screens_count,repeat_screen_ratio,loan_screen_count,financial_screen_count,top_screen_ratio,is_heavy_user
0,13,13,1.0,2,1,0.466667,0
1,11,11,1.0,1,2,0.538462,0
2,3,3,1.0,1,0,1.0,0
3,17,16,1.0625,1,1,0.3,1
4,19,18,1.055556,3,3,0.34375,1


## 3.1 Advanced screen behavior features created

Below we list the engineered feature names and show a preview of the new columns.

In [20]:
advanced_feature_cols = [
    "screen_list_length",
    "unique_screens_count",
    "repeat_screen_ratio",
    "first_screen",
    "last_screen",
    "loan_screen_count",
    "financial_screen_count",
    "profile_screen_count",
    "onboarding_screen_count",
    "loan_screen_ratio",
    "financial_screen_ratio",
    "profile_screen_ratio",
    "onboarding_screen_ratio",
    "top_screen_ratio",
    "first_screen_is_loan",
    "first_screen_is_financial",
    "first_screen_is_profile",
    "last_screen_is_loan",
    "last_screen_is_profile",
    "is_heavy_user",
    "any_topscreen_interaction",
    "any_loan_interaction",
    "any_financial_interaction",
]

print("Advanced screen behavior features created:")
print("- " + "- ".join(advanced_feature_cols))

df[advanced_feature_cols].head()

Advanced screen behavior features created:
- screen_list_length- unique_screens_count- repeat_screen_ratio- first_screen- last_screen- loan_screen_count- financial_screen_count- profile_screen_count- onboarding_screen_count- loan_screen_ratio- financial_screen_ratio- profile_screen_ratio- onboarding_screen_ratio- top_screen_ratio- first_screen_is_loan- first_screen_is_financial- first_screen_is_profile- last_screen_is_loan- last_screen_is_profile- is_heavy_user- any_topscreen_interaction- any_loan_interaction- any_financial_interaction


Unnamed: 0,screen_list_length,unique_screens_count,repeat_screen_ratio,first_screen,last_screen,loan_screen_count,financial_screen_count,profile_screen_count,onboarding_screen_count,loan_screen_ratio,...,top_screen_ratio,first_screen_is_loan,first_screen_is_financial,first_screen_is_profile,last_screen_is_loan,last_screen_is_profile,is_heavy_user,any_topscreen_interaction,any_loan_interaction,any_financial_interaction
0,13,13,1.0,idscreen,Login,2,1,1,0,0.133333,...,0.466667,0,0,0,0,0,0,1,1,1
1,11,11,1.0,joinscreen,Loan2,1,2,2,0,0.076923,...,0.538462,0,0,0,1,0,0,1,1,1
2,3,3,1.0,Splash,Loan,1,0,0,1,0.333333,...,1.0,0,0,0,0,0,0,1,1,0
3,17,16,1.0625,product_review,Credit2,1,1,1,0,0.025,...,0.3,0,0,0,0,0,1,1,1,1
4,19,18,1.055556,idscreen,product_review,3,3,1,0,0.09375,...,0.34375,0,0,0,0,0,1,1,1,1


## 4. Apply baseline feature engineering

In [21]:
config = AppDataConfig(target_col="enrolled")

df_processed = basic_feature_engineering(df, config)

print("Processed shape:", df_processed.shape)
df_processed.head()

Processed shape: (50000, 91)


Unnamed: 0,dayofweek,hour,age,numscreens,minigame,used_premium_feature,enrolled,liked,Loan2,location,...,top_screen_ratio,first_screen_is_loan,first_screen_is_financial,first_screen_is_profile,last_screen_is_loan,last_screen_is_profile,is_heavy_user,any_topscreen_interaction,any_loan_interaction,any_financial_interaction
0,3,2,23,15,0,0,0,0,1,0,...,0.466667,0,0,0,0,0,0,1,1,1
1,6,1,24,13,0,0,0,0,1,1,...,0.538462,0,0,0,1,0,0,1,1,1
2,1,19,23,3,0,1,0,1,0,0,...,1.0,0,0,0,0,0,0,1,1,0
3,4,16,28,40,0,0,1,0,0,1,...,0.3,0,0,0,0,0,1,1,1,1
4,1,18,31,32,0,0,1,1,1,0,...,0.34375,0,0,0,0,0,1,1,1,1


## 5. Time-of-day and age-group features

In [22]:
def map_time_of_day(h):
    if 5 <= h < 12:
        return "morning"
    if 12 <= h < 17:
        return "afternoon"
    if 17 <= h < 21:
        return "evening"
    return "night"

df_processed["time_of_day"] = df_processed["hour"].apply(map_time_of_day)

df_processed = pd.get_dummies(df_processed, columns=["time_of_day"], drop_first=True)

df_processed["is_young"] = (df_processed["age"] < 25).astype(int)
df_processed["is_middle_age"] = (
    (df_processed["age"] >= 25) & (df_processed["age"] <= 40)
).astype(int)
df_processed["is_senior"] = (df_processed["age"] > 40).astype(int)

df_processed.head()

Unnamed: 0,dayofweek,hour,age,numscreens,minigame,used_premium_feature,enrolled,liked,Loan2,location,...,is_heavy_user,any_topscreen_interaction,any_loan_interaction,any_financial_interaction,time_of_day_evening,time_of_day_morning,time_of_day_night,is_young,is_middle_age,is_senior
0,3,2,23,15,0,0,0,0,1,0,...,0,1,1,1,False,False,True,1,0,0
1,6,1,24,13,0,0,0,0,1,1,...,0,1,1,1,False,False,True,1,0,0
2,1,19,23,3,0,1,0,1,0,0,...,0,1,1,0,True,False,False,1,0,0
3,4,16,28,40,0,0,1,0,0,1,...,1,1,1,1,False,False,False,0,1,0
4,1,18,31,32,0,0,1,1,1,0,...,1,1,1,1,True,False,False,0,1,0


## 6. Train / test split

In [23]:
X_train, X_test, y_train, y_test = train_test_split_df(
    df_processed,
    config=config,
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40000, 96), (10000, 96), (40000,), (10000,))

## 7. Save processed data

In [24]:
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(exist_ok=True)

X_train.to_csv(PROCESSED_DIR / "X_train.csv", index=False)
X_test.to_csv(PROCESSED_DIR / "X_test.csv", index=False)
y_train.to_csv(PROCESSED_DIR / "y_train.csv", index=False)
y_test.to_csv(PROCESSED_DIR / "y_test.csv", index=False)

print("Saved to:", PROCESSED_DIR)

Saved to: /Users/loriksfishta/Dissertation/Dataset/processed
