### Models for predicting check-in in the next month (no resampling of the imbalanced dataset; class weight adjustment is used in the models instead)
### Training data covers up to the end of Jun 2024, to see how the models predict Jul 2024 check-ins

In [1]:
### Load the three input tables: user segmentation, user features, and open-app recency
import pandas as pd

usr_segment, features, openapp_recency = (
    pd.read_csv(csv_name)
    for csv_name in ("Jun_segmentation.csv", "features.csv", "openapp_recency.csv")
)

In [2]:
# Inner joins on user_sn: only users present in all three tables are kept.
merged_1 = usr_segment.merge(features, how='inner', on='user_sn')
merged_2 = merged_1.merge(openapp_recency, how='inner', on='user_sn')

In [3]:
# Show the dtype of every column in the merged frame.
merged_2.dtypes

user_sn                        int64
segmentation                   int64
hourly_ci_number               int64
ovn_ci_number                  int64
day_ci_number                  int64
total_paid_from_user           int64
usr_cancel_num_3_months        int64
hotel_cancel_num_3_months      int64
g2j_cancel_num_3_months        int64
no_show_num_3_months           int64
recency                        int64
average_ci_time_gap          float64
std_ci_time_gap              float64
open_app_num_3_months        float64
mileage_used_num             float64
current_mileage_point        float64
search_num_3_months          float64
review_num_3_months          float64
avg_mark                     float64
time_since_join                int64
user_province                 object
ci_num                         int64
openapp_recency                int64
dtype: object

In [4]:
# Count missing values per column of the merged frame.
merged_2.isna().sum()

user_sn                           0
segmentation                      0
hourly_ci_number                  0
ovn_ci_number                     0
day_ci_number                     0
total_paid_from_user              0
usr_cancel_num_3_months           0
hotel_cancel_num_3_months         0
g2j_cancel_num_3_months           0
no_show_num_3_months              0
recency                           0
average_ci_time_gap               0
std_ci_time_gap                   0
open_app_num_3_months        117067
mileage_used_num                239
current_mileage_point         50134
search_num_3_months          156746
review_num_3_months          103449
avg_mark                     103449
time_since_join                   0
user_province                     2
ci_num                            0
openapp_recency                   0
dtype: int64

### feature explanation
- segmentation: user segmentation by the month of train period

- hourly_ci_number: user's number of hourly check-ins

- ovn_ci_number: user's number of ovn check-ins 

- day_ci_number: user's number of day check-ins 

- total_paid_from_user: user's paid amount 

- usr_cancel_num_3_months: number of user cancels in the last 3 months in train period month

- hotel_cancel_num_3_months: number of hotel cancels in the last 3 months in train period month

- g2j_cancel_num_3_months: number of g2j cancels in the last 3 months in train period month

- no_show_num_3_months: number of noshow in the last 3 months in train period month

- recency: the day difference between last check-in date before end of train period month and end of train period month

- average_ci_time_gap: average day gap between check-ins

- std_ci_time_gap: standard deviation of day gaps between check-ins

- open_app_num_3_months: number of times user open app in the last 3 months in the train period month

- mileage_used_num: number of times users used mileage points until end of train period month

- current_mileage_point: mileage points of users by the end of train period month

- search_num_3_months: number of times user search in the last 3 months in the train period month

- review_num_3_months: number of times user give review in the last 3 months in the train period month

- time_since_join: day gap between user's register date and end of train period month

- user_province: Province of user

- openapp_recency: the day difference between last open-app date before end of train period month and end of train period month

- ci_num: number of user's check-ins in next month period (used to create target variable)

## Cleaning data

In [7]:
## Replace missing values with 0 in the count-like columns (stored as int64 afterwards)
## and in avg_mark (stored as float32 afterwards)
zero_fill_int_cols = [
    'open_app_num_3_months',
    'mileage_used_num',
    'current_mileage_point',
    'review_num_3_months',
    'search_num_3_months',
]
merged_2[zero_fill_int_cols] = merged_2[zero_fill_int_cols].fillna(0).astype('int64')

avg_mark_filled = merged_2['avg_mark'].fillna(0)
merged_2['avg_mark'] = avg_mark_filled.astype('float32')

In [8]:
# Keep only rows whose time_since_join and average_ci_time_gap are non-negative
# and whose user_province is not null.
valid_rows = (
    (merged_2['time_since_join'] >= 0)
    & (merged_2['average_ci_time_gap'] >= 0)
    & merged_2['user_province'].notna()
)
merged_2 = merged_2[valid_rows]

In [9]:
# Lookup table from the numeric segmentation code (1-7) to its text label.
segment_data = {
    'segmentation': list(range(1, 8)),
    'segmentation_text': [
        'New',
        'Existing',
        'Retention',
        'Win-back',
        'Churn',
        'Drop',
        'Dormant',
    ],
}
segment_df = pd.DataFrame.from_dict(segment_data)

# Add the segmentation text; a left join keeps every row of merged_2.
cleaned_df = merged_2.merge(segment_df, how='left', on='segmentation')
cleaned_df.shape

(265085, 24)

### Create target/dependent variable and train test split

In [10]:
# Binary target: 1 when the user has at least one check-in in the next-month period, else 0.
# The comparison is vectorized rather than applied row by row through a Python lambda;
# the explicit int64 cast keeps the same dtype the lambda version produced.
cleaned_df['ci_or_not'] = (cleaned_df['ci_num'] > 0).astype('int64')

# Class balance of the target.
cleaned_df['ci_or_not'].value_counts()

ci_or_not
0    242262
1     22823
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# The user id, the numeric segment code, the target and the raw next-month check-in count
# are excluded from the feature matrix.
non_feature_cols = ['user_sn', 'segmentation', 'ci_or_not', 'ci_num']
y = cleaned_df['ci_or_not']
X = cleaned_df.drop(columns=non_feature_cols)

# 80/20 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=32,
)

In [12]:
# Split the feature columns by dtype: object columns are categorical, all others numerical.
cat_features = X_train.select_dtypes(include="object").columns
num_features = X_train.select_dtypes(exclude="object").columns

print(f'We have {len(num_features)} numerical features : {num_features}')
print(f'\nWe have {len(cat_features)} categorical features : {cat_features}')

We have 19 numerical features : Index(['hourly_ci_number', 'ovn_ci_number', 'day_ci_number',
       'total_paid_from_user', 'usr_cancel_num_3_months',
       'hotel_cancel_num_3_months', 'g2j_cancel_num_3_months',
       'no_show_num_3_months', 'recency', 'average_ci_time_gap',
       'std_ci_time_gap', 'open_app_num_3_months', 'mileage_used_num',
       'current_mileage_point', 'search_num_3_months', 'review_num_3_months',
       'avg_mark', 'time_since_join', 'openapp_recency'],
      dtype='object')

We have 2 categorical features : Index(['user_province', 'segmentation_text'], dtype='object')


### Preprocessing and train for Tree-based models

In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
# One-hot encode the categorical columns. handle_unknown="ignore" makes a category that was
# not seen during fit encode as an all-zero row instead of raising at transform time
# (e.g. a rare user_province that ends up only in the test split or in later scoring data).
oh_transformer = OneHotEncoder(handle_unknown="ignore")
# Not used by the preprocessor in this cell: numerical columns are passed through unscaled.
numeric_transformer = StandardScaler()

# Preprocessing for tree-based models: encoded categoricals followed by raw numerical columns.
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ('pass', 'passthrough', num_features)
    ]
)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Errors on class 1 (check-in next month) are weighted 10x relative to class 0.
class_weight = {0: 1, 1: 10}

# Fit the preprocessing on the training split only, then apply it to the test split.
processed_X_train = preprocessor.fit_transform(X_train)
processed_X_test = preprocessor.transform(X_test)

rf = RandomForestClassifier(
    n_estimators=150,
    criterion='gini',
    max_features='sqrt',
    min_samples_split=7,
    min_samples_leaf=5,
    bootstrap=True,
    max_samples=0.8,
    class_weight=class_weight,
    random_state=2,
)
rf.fit(processed_X_train, y_train)

# Evaluate on the held-out test split.
y_test_pred = rf.predict(processed_X_test)
report = classification_report(y_test, y_test_pred)
print("Test Classification Report:", report, sep="\n")


Test Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.91      0.94     48452
           1       0.42      0.71      0.53      4565

    accuracy                           0.89     53017
   macro avg       0.70      0.81      0.73     53017
weighted avg       0.92      0.89      0.90     53017



##### Good precision and recall for class 0 (no check-in next month)
##### Precision of 0.42 for class 1 (check-in next month) -> of all users predicted to check in next month, 42% actually did
##### Recall of 0.71 for class 1 (check-in next month) -> of all users who actually checked in next month, the model identified 71%

In [16]:
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import classification_report


# Fit the preprocessing on the training split only, then apply it to the test split.
processed_X_train = preprocessor.fit_transform(X_train)
processed_X_test = preprocessor.transform(X_test)

# Build the EasyEnsemble model and train it on the processed training data.
eec = EasyEnsembleClassifier(
    n_estimators=150,
    random_state=42,
)
eec.fit(processed_X_train, y_train)

# Predict on the test split and print the per-class metrics.
y_pred = eec.predict(processed_X_test)
eec_report = classification_report(y_test, y_pred)
print(eec_report)

              precision    recall  f1-score   support

           0       0.98      0.83      0.90     48452
           1       0.31      0.83      0.46      4565

    accuracy                           0.83     53017
   macro avg       0.65      0.83      0.68     53017
weighted avg       0.92      0.83      0.86     53017



##### Good precision and recall for class 0 (no check-in next month)
##### Precision of 0.31 for class 1 (check-in next month) -> of all users predicted to check in next month, 31% actually did
##### Recall of 0.83 for class 1 (check-in next month) -> of all users who actually checked in next month, the model identified 83%

### Preprocessing and train for non-tree-based models

In [18]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns. handle_unknown="ignore" makes a category that was
# not seen during fit encode as an all-zero row instead of raising at transform time
# (e.g. a rare user_province that ends up only in the test split or in later scoring data).
oh_transformer = OneHotEncoder(handle_unknown="ignore")
# Standardize the numerical columns for the non-tree-based models.
numeric_transformer = StandardScaler()


# Preprocessing for non-tree-based models: encoded categoricals followed by scaled numericals.
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [27]:
from sklearn.linear_model import LogisticRegression

# Errors on class 1 (check-in next month) are weighted 10x relative to class 0.
class_weight = {0: 1, 1: 10}

# Fit the preprocessing on the training split only, then apply it to the test split.
processed_X_train = preprocessor.fit_transform(X_train)
processed_X_test = preprocessor.transform(X_test)

model = LogisticRegression(
    random_state=42,
    class_weight=class_weight,
)
model.fit(processed_X_train, y_train)

# Evaluate on the held-out test split.
y_test_pred = model.predict(processed_X_test)
report = classification_report(y_test, y_test_pred)
print("Test Classification Report:", report, sep="\n")


Test Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.83      0.90     48452
           1       0.31      0.81      0.45      4565

    accuracy                           0.83     53017
   macro avg       0.65      0.82      0.67     53017
weighted avg       0.92      0.83      0.86     53017



##### Good precision and recall for class 0 (no check-in next month)
##### Precision of 0.31 for class 1 (check-in next month) -> of all users predicted to check in next month, 31% actually did
##### Recall of 0.81 for class 1 (check-in next month) -> of all users who actually checked in next month, the model identified 81%

In [24]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout


# Errors on class 1 (check-in next month) are weighted 5x relative to class 0 in the loss.
class_weight = {0: 1.0, 1: 5.0}

# Seed the Python, NumPy and TensorFlow random generators so weight initialization,
# dropout and shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# NOTE(review): assumes `preprocessor` is the one-hot + StandardScaler version defined in the
# non-tree-based section; re-run that cell first if the tree-based one was executed last.
processed_X_train = preprocessor.fit_transform(X_train)
processed_X_test = preprocessor.transform(X_test)

# Stratified split into training and validation sets
nn_X_train, nn_X_val, nn_y_train, nn_y_val = train_test_split(processed_X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Convert the data to TensorFlow datasets (the preprocessor output is sparse, so densify it first)
train_dataset = tf.data.Dataset.from_tensor_slices((nn_X_train.toarray(), nn_y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((nn_X_val.toarray(), nn_y_val))


batch_size = 32
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

# Define a simple ANN model. The input shape is declared with an explicit Input object
# rather than `input_shape=` on the first Dense layer, which Keras warns against.
model = Sequential([
    tf.keras.Input(shape=(nn_X_train.shape[1],)),
    Dense(64, activation='relu', kernel_initializer='glorot_uniform'),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_initializer='glorot_uniform'),
    Dropout(0.3),
    Dense(1, activation='sigmoid', kernel_initializer='glorot_uniform')
])


# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using the stratified validation set
model.fit(train_dataset, validation_data=val_dataset, epochs=15, class_weight=class_weight)

# Make predictions on a dense test matrix, matching the dense arrays used for training
y_pred_probs = model.predict(processed_X_test.toarray())
# Convert probabilities to class labels and flatten (n, 1) -> (n,) for the report
y_pred = (y_pred_probs >= 0.5).astype(int).ravel()

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Epoch 1/15


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5302/5302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 492us/step - accuracy: 0.8649 - loss: 0.5628 - val_accuracy: 0.8764 - val_loss: 0.2712
Epoch 2/15
[1m5302/5302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 466us/step - accuracy: 0.8793 - loss: 0.5127 - val_accuracy: 0.8888 - val_loss: 0.2537
Epoch 3/15
[1m5302/5302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 468us/step - accuracy: 0.8827 - loss: 0.5048 - val_accuracy: 0.8915 - val_loss: 0.2445
Epoch 4/15
[1m5302/5302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 470us/step - accuracy: 0.8827 - loss: 0.5026 - val_accuracy: 0.8866 - val_loss: 0.2522
Epoch 5/15
[1m5302/5302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 473us/step - accuracy: 0.8839 - loss: 0.4982 - val_accuracy: 0.8919 - val_loss: 0.2458
Epoch 6/15
[1m5302/5302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 470us/step - accuracy: 0.8840 - loss: 0.5009 - val_accuracy: 0.8913 - val_loss: 0.2482
Epoch 7/15
[1m

##### Good precision and recall for class 0 (no check-in next month)
##### Precision of 0.42 for class 1 (check-in next month) -> of all users predicted to check in next month, 42% actually did
##### Recall of 0.71 for class 1 (check-in next month) -> of all users who actually checked in next month, the model identified 71%