# Import necessary libraries

In [1]:
import pandas as pd

# Load The MTN Upsell dataset

In [2]:
# df = pd.read_excel('../data/raw/mtn_upsell_uncleaned.xlsx')
df = pd.read_csv('../data/raw/mtn_upsell_generated_mod.csv', low_memory=False)

In [3]:
df.isnull().sum()

dates                      0
Phone Number               0
total_reloads              0
total_reload_amount    21031
imei                       0
brand_name                 0
model_name                 0
device_category            0
data_kb                    0
dtype: int64

# Data Cleaning

As you see we have 21k+ nan rows in 'total_reload_amount' column. Common causes are

1. System recording error – Reload was counted, but the amount wasn’t saved.

2. Free bonus/airdrop – A “reload” without money involved (rare).

3. Incomplete data – Some recharge transactions didn’t sync fully.

So i think it's an incomplete data problem, i decide to impute those nan values by the mean of people who reloaded 1 time

In [4]:
mean_total_reloads_amount = df[(df['total_reloads'] == 1) & (df['total_reload_amount'].notna())]['total_reload_amount'].mean()
float(mean_total_reloads_amount)

304.66826066455485

In [5]:
df['total_reload_amount'] = df['total_reload_amount'].fillna(mean_total_reloads_amount)

In [6]:
print(f"Imputed missing values using mean: {mean_total_reloads_amount:.2f}")

Imputed missing values using mean: 304.67


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3004191 entries, 0 to 3004190
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   dates                int64  
 1   Phone Number         int64  
 2   total_reloads        int64  
 3   total_reload_amount  float64
 4   imei                 int64  
 5   brand_name           object 
 6   model_name           object 
 7   device_category      object 
 8   data_kb              float64
dtypes: float64(2), int64(4), object(3)
memory usage: 206.3+ MB


**Columns types**

column device_category have different types and it was supposed to be int() let's find out why

Caused by char (-) , let me fill all rows including - with 6 and change the type to int

In [8]:
df['device_category'].value_counts()
df['device_category'] = df['device_category'].replace('-','6')
df['device_category'] = df['device_category'].astype(int)

#### Let's find upgraded date for each user

In [9]:
# convert dates columns to datetime
df['dates'] = pd.to_datetime(df['dates'], format='%Y%m%d')

# Sort the DataFrame by 'Phone Number' and 'dates'
df = df.sort_values(by=['Phone Number', 'dates'])


Identify smartphones and those that aren't

In [10]:
df['isSmartphone'] = df['device_category'] == 5


For each user, find the first date they used a smartphone

In [11]:
dates_upgraded = df[df['isSmartphone']].groupby('Phone Number')['dates'].min().reset_index()
dates_upgraded = dates_upgraded.rename(columns={'dates': 'dates_upgraded', 'Phone Number': 'user'})

Let's get users who used feature/basic phone before

In [12]:
feature_or_basic = df[df['device_category'].isin([4,7])]['Phone Number'].unique()

feature_or_basic.shape

(20042,)

Filter dates_upgraded to include only those who had a basic/feature phone first

In [13]:
dates_upgraded =  dates_upgraded[dates_upgraded['user'].isin(feature_or_basic)]

Store the dates_upgraded DataFrame in a CSV file

In [14]:
dates_upgraded.to_csv('../data/cleaned/users_and_date_upgraded.csv', index=False)

#### Build Usage & Recharge Features

For each user in dates_upgraded determine

1. Data used before upgrade => dt_before
2. Data used after upgrade => dt_after
3. Standard deviation of data usage before & after => std_before & std_after
4. Days active after the upgrade where data_kb > 0
5. Recharge growth determined by total_reload_amount (after)-(before)
6. Days passed to first data use = Days between upgrade and first data_kb>0

why standard deviation before & after ?:  This will tell us how consistent or unstable user's data usage behaviour is

> low std : User has a steady behaviour

> high std :  User has unpredicatble behaviour

In short:

I'm not just measuring how much data someone used, but how stable or risky their behavior was.

##### Merge date_upgraded with the original df

In [15]:
#Firstly, we need to rename the 'Phone Number' column to 'user' in the original DataFrame
df = df.rename(columns={'Phone Number': 'user'})

df = df.merge(dates_upgraded, on='user', how='left')

Create columns to label rows as "before" or "after" upgrade

In [16]:
df['Days_to_Upgrade'] = (df['dates']- df['dates_upgraded']).dt.days

df['isBefore'] = df['Days_to_Upgrade'].between(-30, -1) #30 days before upgrade
df['isAfter'] = df['Days_to_Upgrade'].between(0, 60) #60 days after upgrade

df[(df['isBefore'] | df['isAfter'])].head()

Unnamed: 0,dates,user,total_reloads,total_reload_amount,imei,brand_name,model_name,device_category,data_kb,isSmartphone,dates_upgraded,Days_to_Upgrade,isBefore,isAfter
0,2024-09-09,225798700001,2,113.47,2499020533,Brand_11,Brand_11_Model_2,7,156802.99,False,2024-09-10,-1,True,False
1,2024-09-10,225798700001,1,137.61,1356105166,Brand_25,Brand_25_Model_2,5,32615.8,True,2024-09-10,0,False,True
2,2024-09-10,225798700001,1,171.49,1082651869,Brand_29,Brand_29_Model_2,5,1837504.58,True,2024-09-10,0,False,True
3,2024-09-17,225798700001,1,902.91,1137344381,Brand_49,Brand_49_Model_1,5,0.0,True,2024-09-10,7,False,True
4,2024-09-17,225798700001,1,41.15,1980489975,Brand_28,Brand_28_Model_6,5,67296.78,True,2024-09-10,7,False,True


Aggregate usage stats per user for both periods

1. before upgrade

In [17]:
before = df[df['isBefore']].groupby('user').agg(
    avg_data_before_upgrade=('data_kb', 'mean'),
    std_before=('data_kb', 'std'),
    total_recharge_before=('total_reload_amount', 'sum')
).reset_index()

before.head()


Unnamed: 0,user,avg_data_before_upgrade,std_before,total_recharge_before
0,225798700001,156802.99,,113.47
1,225798700012,88923.625,125756.99649,124.37
2,225798700016,152250.54,141042.183882,1885.47
3,225798700036,297134.18,,33.72
4,225798700046,816601.31,,279.67


2. after upgrade

In [18]:
after = df[df['isAfter']].groupby('user').agg(
    avg_data_after_upgrade=('data_kb', 'mean'),
    std_after=('data_kb', 'std'),
    total_recharge_after=('total_reload_amount', 'sum'),
    days_active_after=('data_kb', lambda x: (x > 0).sum()),
    time_to_first_data_use=('Days_to_Upgrade', lambda x: x[df.loc[x.index, 'data_kb'] > 0].min())
).reset_index()

after.head()

Unnamed: 0,user,avg_data_after_upgrade,std_after,total_recharge_after,days_active_after,time_to_first_data_use
0,225798700001,406223.96,645813.6,7574.878261,27,0.0
1,225798700012,471228.741111,612422.6,14863.328261,53,0.0
2,225798700014,669976.774043,1080963.0,15855.988261,45,0.0
3,225798700016,783516.914167,1818749.0,8418.27,23,0.0
4,225798700032,458873.072281,579248.6,10687.68,56,0.0


3. Merge them together

In [19]:
features = before.merge(after, on='user', how='outer')
features.head()

Unnamed: 0,user,avg_data_before_upgrade,std_before,total_recharge_before,avg_data_after_upgrade,std_after,total_recharge_after,days_active_after,time_to_first_data_use
0,225798700001,156802.99,,113.47,406223.96,645813.6,7574.878261,27,0.0
1,225798700012,88923.625,125756.99649,124.37,471228.741111,612422.6,14863.328261,53,0.0
2,225798700014,,,,669976.774043,1080963.0,15855.988261,45,0.0
3,225798700016,152250.54,141042.183882,1885.47,783516.914167,1818749.0,8418.27,23,0.0
4,225798700032,,,,458873.072281,579248.6,10687.68,56,0.0


Add Recharge Growth & Finalize

In [20]:
features['recharge_growth'] = features['total_recharge_after'] - features['total_recharge_before']

## Analysis new features dataframe

In [21]:
features.isna().sum()

user                           0
avg_data_before_upgrade    10229
std_before                 14441
total_recharge_before      10229
avg_data_after_upgrade         0
std_after                      0
total_recharge_after           0
days_active_after              0
time_to_first_data_use       508
recharge_growth            10229
dtype: int64

**We have nan values which explain some behaviours of users**

**avg_data_before_upgrade:**  10229 user didn’t use mobile data at all before upgrading

**std_before:** no variation because no data

**total_recharge_before:** no top-ups before upgrade , new users or they were inactive

**recharge_growth:** makes sense because it needs both before & after

> Note: 10229 users have no data or recharge history before upgrading. Their missing values reflect real user inactivity

***Question:*** why 14441 nan values in std_before?

> std_before needs at least 2 data points to calculate a standard deviation. Those extra 4212 users (14441 NaNs in std_before vs. 10229 in avg_data_before_upgrade)  had only one day of data in the 30-day window before their upgrade.

# save the cleaned / engineered dataframe as smartphone_users_features.csv

In [22]:
features.to_csv('../data/cleaned/smartphone_users_features.csv', index=False)