<a href="https://www.kaggle.com/code/amitanshjoshi/adtracking?scriptVersionId=134425738" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for filename in filenames:
        print(next(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing all the necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from collections import Counter
from imblearn.combine import SMOTETomek

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn import metrics

In [None]:
train_data = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train_sample.csv')
test_data = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test_supplement.csv')

In [None]:
test_data.head()

In [None]:
train_data.shape

In [None]:
train_data.isna().sum()/len(train_data) * 100

In [None]:
# attributed_time has more than 80% missing values, so we drop this column
train_data.drop(columns = ['attributed_time'], inplace = True)

In [None]:
train_data.head()

# 1. Performing EDA

In [None]:
sns.countplot(data = train_data, x='is_attributed')

In [None]:
train_data['is_attributed'].value_counts()

### Clearly the data is highly imbalanced; therefore, we may need a resampling technique to balance the minority class

In [None]:
train_data.describe()

### The quartiles are increasing and thus, there might be no outliers in the data.

In [None]:
train_data.info()

In [None]:
train_data.os.value_counts()

In [None]:
def fix_dataframe(df):
    """Turn a raw click-log frame into the feature layout used for modelling.

    The ``click_time`` column is parsed and expanded into ``month``, ``day``,
    ``hour``, ``dayOfWeek``, ``dayOfYear`` and ``seconds``. An ``ip_count``
    column is added holding the number of rows in ``df`` that share the row's
    ``ip``. The raw ``click_time`` and ``ip`` columns are then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``click_time`` (anything accepted by
        ``pd.to_datetime``) and ``ip``. The derived columns are written onto
        this frame and ``click_time`` is dropped from it in place.

    Returns
    -------
    pandas.DataFrame
        The merged frame with the date features and ``ip_count`` appended and
        without ``click_time`` / ``ip``. All other columns are kept as they are.
    """
    # Parse once; every calendar feature below is read from the same accessor.
    df['click_time'] = pd.to_datetime(df['click_time'])
    click_time = df['click_time'].dt
    df['month'] = click_time.month
    df['day'] = click_time.day
    df['hour'] = click_time.hour
    df['dayOfWeek'] = click_time.dayofweek
    df['dayOfYear'] = click_time.dayofyear
    df['seconds'] = click_time.second
    # Drop by column *label*. Passing the column's values here (as the code
    # did before) makes pandas look up timestamps as labels and raise KeyError.
    df.drop(columns=['click_time'], inplace=True)
    # Number of clicks per ip, joined back onto every row of that ip.
    ip_count = df.groupby('ip').size().reset_index(name='ip_count').astype('int64')
    df = pd.merge(df, ip_count, on='ip', how='left', sort=False)
    df.drop(columns=['ip'], inplace=True)
    return df

In [None]:
train_data['click_time'] = pd.to_datetime(train_data['click_time'])

In [None]:
train_data.info()

In [None]:
# Expand the parsed click timestamp into its calendar components
train_data = train_data.assign(
    month=train_data['click_time'].dt.month,
    day=train_data['click_time'].dt.day,
    hour=train_data['click_time'].dt.hour,
    dayOfWeek=train_data['click_time'].dt.dayofweek,
    dayOfYear=train_data['click_time'].dt.dayofyear,
    seconds=train_data['click_time'].dt.second,
)

# Clicks per ip, attached to every row; the raw ip is not kept as a feature
ip_count = (
    train_data.groupby('ip')
    .size()
    .reset_index(name='ip_count')
    .astype('int64')
)
train_data = train_data.merge(ip_count, on='ip', how='left', sort=False).drop(columns=['ip'])

In [None]:
train_data.head()

In [None]:
sns.countplot(data = train_data, x='hour')

### It can be seen that most clicks are happening at the 4th hour of the day

In [None]:
sns.countplot(data = train_data, x='dayOfWeek')

In [None]:
train_data.device.value_counts()

In [None]:
most_click_from_device = train_data.device.value_counts()[:5]

In [None]:
most_click_from_device

### Device 1 has the most clicks in the distribution

In [None]:
train_data.drop(columns=['click_time'], inplace = True)

# 2. Data Preparation for Modelling

In [None]:
X = train_data.drop(['is_attributed'], axis = 1 )
y = train_data['is_attributed']

In [None]:
X.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 42)

In [None]:
X_test.shape

In [None]:
# Balance the classes on the training split only; X_test / y_test are left untouched.
# The resampler is seeded so repeated runs produce the same resampled training set,
# matching the seeded train/test split.
smotetk = SMOTETomek(random_state=42)
counter = Counter(y_train)
X_train, y_train = smotetk.fit_resample(X_train, y_train)
new_counter = Counter(y_train)
print('Before count:', counter)
print('After count:', new_counter)

In [None]:
# Now that the training data is balanced, we can proceed with modelling.
# Misclassifications are still likely, so we want models that are updated
# after every misclassification. We will therefore use boosting techniques to counter this.

In [None]:
# Final check of the X_train
X_train.info()

# 3. Model Creation

In [None]:
# Model 1 Creating a baseline model to compare other models results with.
X_train.columns

In [None]:
X_train.columns

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
y_pred = lr.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the thresholded predictions
print(metrics.classification_report(y_test, y_pred))

# ROC AUC is a ranking metric: score it on the positive-class probabilities.
# Feeding it hard 0/1 labels collapses the curve to a single operating point
# and gives a different number from the ROC curve plotted for this model.
print(metrics.roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]))

In [None]:
lr_probs = lr.predict_proba(X_test)
preds = lr_probs[:,1]
fpr_lr, tpr_lr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr_lr, tpr_lr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr_lr, tpr_lr, label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Model 2 XGBoost Classifier

folds = 3

param_grid = {"learning_rate":[0.5, 0.6],
            "subsample":[0.6, 0.8],
            "n_estimators":[200, 300],
            "max_depth":[2]}          


xgb_clf = XGBClassifier()

xgb_cv = GridSearchCV(estimator = xgb_clf, 
                        param_grid = param_grid, 
                        scoring= 'roc_auc', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True) 

xgb_cv.fit(X_train, y_train)

In [None]:
xgb_cv.best_estimator_, xgb_cv.best_params_, xgb_cv.best_score_

In [None]:
pd.DataFrame(xgb_cv.cv_results_)

In [None]:
xgb_final = xgb_cv.best_estimator_

In [None]:
xgb_final.fit(X_train, y_train)

In [None]:
y_pred_xgb = xgb_final.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the thresholded predictions
print(metrics.classification_report(y_test, y_pred_xgb))

# ROC AUC is a ranking metric: score it on the positive-class probabilities.
# Feeding it hard 0/1 labels collapses the curve to a single operating point
# and gives a different number from the ROC curve plotted for this model.
print(metrics.roc_auc_score(y_test, xgb_final.predict_proba(X_test)[:, 1]))

In [None]:
# ROC curve for the tuned XGBoost model, drawn next to the logistic-regression baseline
xg_probs = xgb_final.predict_proba(X_test)
xg_preds = xg_probs[:,1]
fpr_xg, tpr_xg, threshold = metrics.roc_curve(y_test, xg_preds)
roc_auc_xg = metrics.auc(fpr_xg, tpr_xg)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr_lr, tpr_lr, label = 'AUC_LR = %0.2f' % roc_auc)
# Colour is given once, via the keyword. The former 'b' format string also set
# a colour, which made matplotlib warn about a redundant definition.
plt.plot(fpr_xg, tpr_xg, label = 'AUC_XG = %0.2f' % roc_auc_xg, color='orange')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Model 3 : AdaBoost Classifier
import inspect

folds = 3

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
# and later releases removed the old name. Pick whichever keyword the installed
# version accepts so the cell runs on both old and new environments.
estimator_kw = ("estimator"
                if "estimator" in inspect.signature(AdaBoostClassifier).parameters
                else "base_estimator")

# Tune the depth of the weak learner and the number of boosting rounds
param_grid = {estimator_kw + "__max_depth" : [2, 5],
              "n_estimators": [200, 400, 600]
             }


tree = DecisionTreeClassifier()

ada_clf = AdaBoostClassifier(**{estimator_kw: tree}, learning_rate=0.6, algorithm="SAMME")

ada_cv = GridSearchCV(estimator = ada_clf, 
                        param_grid = param_grid, 
                        scoring= 'roc_auc', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True) 

ada_cv.fit(X_train, y_train)


In [None]:
ada_cv.best_estimator_, ada_cv.best_params_, ada_cv.best_score_

In [None]:
# Take the best configuration found by the grid search and fit it on the training data
ada_final = ada_cv.best_estimator_

ada_final.fit(X_train, y_train)

y_pred_ada = ada_final.predict(X_test)
print(metrics.classification_report(y_test, y_pred_ada))

# ROC AUC is a ranking metric: score it on the positive-class probabilities.
# Feeding it hard 0/1 labels collapses the curve to a single operating point
# and gives a different number from the ROC curve plotted for this model.
print(metrics.roc_auc_score(y_test, ada_final.predict_proba(X_test)[:, 1]))

In [None]:
# ROC curve for the tuned AdaBoost model, drawn next to the two earlier models
ada_probs = ada_final.predict_proba(X_test)
ada_preds = ada_probs[:,1]
fpr_ada, tpr_ada, threshold = metrics.roc_curve(y_test, ada_preds)
roc_auc_ada = metrics.auc(fpr_ada, tpr_ada)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr_lr, tpr_lr, label = 'AUC_LR = %0.2f' % roc_auc)
# Colour is given once per curve, via the keyword. The former 'b' format string
# also set a colour, which made matplotlib warn about a redundant definition.
plt.plot(fpr_xg, tpr_xg, label = 'AUC_XG = %0.2f' % roc_auc_xg, color='orange')
plt.plot(fpr_ada, tpr_ada, label = 'AUC_ADA = %0.2f' % roc_auc_ada, color='green')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## The AdaBoost classifier gave the best result on this problem; however, XGBoost is relatively faster.

In [None]:
test_data = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test.csv')


In [None]:
test_data.head()
test_data = fix_dataframe(test_data)
test_data.head()

In [None]:
test_data.head()

In [None]:
ip_count = test_data.groupby('ip').size().reset_index(name='ip_count').astype('int64')
df = pd.merge(test_data, ip_count, on='ip', how='left', sort=False)

In [None]:
df.head()

In [None]:
df.drop(columns=['click_id','ip','click_time'], inplace=True)

In [None]:
df.head()

In [None]:
# Score the prepared test features and pair each click_id with its
# predicted probability of the positive class (column 1)
preds_ada = ada_final.predict_proba(df)
submission = pd.DataFrame({
    'click_id': test_data['click_id'],
    'is_attributed': preds_ada[:, 1],
})
submission.head()

In [None]:
submission.shape

In [None]:
# submission.to_csv('submission.csv', index = False)

In [None]:
# 18790469 