# Import necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, auc
import sys
import os

# Add the parent directory's 'scripts' folder to the Python path
sys.path.append(os.path.abspath('../scripts'))

# Import the function
from class_metrics import display_metrics

# Load the the users with segments dataset

In [2]:
df = pd.read_csv('../data/cleaned/users_usage_segmented.csv')

In [3]:
sorted(df.columns)

['avg_data_after_upgrade',
 'avg_data_before_upgrade',
 'days_active_after',
 'increased_usage',
 'recharge_growth',
 'std_after',
 'std_before',
 'time_to_first_data_use',
 'total_recharge_after',
 'total_recharge_before',
 'user',
 'user_segment']

### we need to drop the columns that are not needed for the analysis

> 'avg_data_after_upgrade','days_active_after','std_after','time_to_first_data_use','user_segment', 'total_recharge_after',

so we only need the following columns

> 'avg_data_before_upgrade','increased_usage'(our target),'std_before','total_recharge_before','user',

In [4]:
df =  df[['user', 'avg_data_before_upgrade', 'std_before', 'total_recharge_before', 'increased_usage']]
df.sample(10)

Unnamed: 0,user,avg_data_before_upgrade,std_before,total_recharge_before,increased_usage
2,225798765434,,,,0
31,225798765464,0.0,,100.0,0
52,225798765485,,,,0
33,225798765466,0.0,,200.0,0
53,225798765486,4796.330187,19727.86841,4054.0,1
1,225798765433,,,,0
12,225798765444,0.0,,100.0,0
0,225798765432,0.130697,0.452746,3091.482637,1
48,225798765481,,,,0
17,225798765450,0.0,0.0,250.0,0


Logistic, RandomForest or any other tree-based model from sklearn library don't support nan values and we have them

In [5]:
df.isna().sum()

user                        0
avg_data_before_upgrade    25
std_before                 34
total_recharge_before      25
increased_usage             0
dtype: int64

### Handle Missing values

we have missing values in 3 columns, we can't impute them bcs we might introduce false informations about users. 

**Solution:**

For each column with missing values let's create a binary label/flag indicating whether it's missing or not, then after let's fill nan with 0 which indicates we don't have values for that column

In [6]:
df['data_flag'] = df['avg_data_before_upgrade'].isna().astype(int)
df['std_flag'] = df['std_before'].isna().astype(int)
df['recharge_flag'] = df['total_recharge_before'].isna().astype(int)

# Fill missing values with 0

df['avg_data_before_upgrade'] = df['avg_data_before_upgrade'].fillna(0)
df['std_before'] = df['std_before'].fillna(0)
df['total_recharge_before'] = df['total_recharge_before'].fillna(0)

df.isna().sum()


user                       0
avg_data_before_upgrade    0
std_before                 0
total_recharge_before      0
increased_usage            0
data_flag                  0
std_flag                   0
recharge_flag              0
dtype: int64

In [7]:
df.to_csv('../data/cleaned/users_segmented_zero_nan.csv', index=False) 

# Train & Test split

In [13]:
X = df.drop(columns=['user', 'increased_usage'])
y = df['increased_usage']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 ,shuffle=True)
X_train.shape

(45, 6)

# Baseline Model : RandomForest Classifier

Let's train & predict using a RandomForest regressor

In [27]:
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_scores = rf.predict_proba(X_test)[:, 1]

overfitness check

In [24]:
y_train_pred = rf.predict(X_train)
y_train_score = rf.predict_proba(X_train)[:, 1]

#### Baseline Metrics of the RandomForest model

In [28]:
print('************************Test metrics***********************')
display_metrics(y_true=y_test, y_pred=y_pred, y_scores=y_scores)

print('************************Train metrics***********************')
display_metrics(y_true=y_train, y_pred=y_train_pred , y_scores=y_train_score)

print('************************Feature importance***********************')
importances = rf.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

************************Test metrics***********************
Classification Metrics:
Accuracy: 0.8333
Precision: 1.0000
Recall: 0.5000
AUC-ROC: 0.8750
************************Train metrics***********************
Classification Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
AUC-ROC: 1.0000
************************Feature importance***********************


Unnamed: 0,Feature,Importance
0,avg_data_before_upgrade,0.557369
1,std_before,0.174368
2,total_recharge_before,0.144223
3,data_flag,0.057102
5,recharge_flag,0.049357
4,std_flag,0.01758
