In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [5]:
df = pd.read_csv("../../Data/wa/Data gede masked.csv", parse_dates=['Date','Last Update Time'])
df.head()

Unnamed: 0,Email User,Nomor Telepon Tujuan,Status,Date,Last Update Time
0,Email user 1,0821xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:14:00
1,Email user 1,0812xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:17:00
2,Email user 1,0838xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:15:00
3,Email user 1,0858xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:28:00
4,Email user 1,0858xxxxxxxx,delivered,2023-02-14 11:13:00,2023-02-14 11:13:00


In [6]:
dateLU = df['Last Update Time'].dt

df["jam last update"] = dateLU.hour;
df["tanggal last update"] = dateLU.day;
df["hari last update"] = dateLU.weekday + 1;
df["isWeekend last update"] = (df['hari last update'] >= 6).astype(int);

date = df.Date.dt

df["jam"] = date.hour
df["tanggal"] = date.day
df["hari"] = date.weekday + 1
df["isWeekend last update"] = (df['hari'] >= 6).astype(int);


mapping_hari = {
    1: "Senin",
    2: "Selasa",
    3: "Rabu",
    4: "Kamis",
    5: "Jumat",
    6: "Sabtu",
    7: "Minggu",
}

df['hari last update'] = df['hari last update'].map(mapping_hari);
df['hari'] = df['hari'].map(mapping_hari);
df['hari last update'].value_counts()

Rabu      3944624
Sabtu     2898298
Kamis     2315193
Selasa    2042504
Jumat     1869014
Senin     1688471
Minggu    1306800
Name: hari last update, dtype: int64

In [7]:
df['Nomor Telepon Tujuan Prefix'] = df['Nomor Telepon Tujuan'].str.slice(0, 4);
df

Unnamed: 0,Email User,Nomor Telepon Tujuan,Status,Date,Last Update Time,jam last update,tanggal last update,hari last update,isWeekend last update,jam,tanggal,hari,Nomor Telepon Tujuan Prefix
0,Email user 1,0821xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:14:00,11,14,Selasa,0,11,14,Selasa,0821
1,Email user 1,0812xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:17:00,11,14,Selasa,0,11,14,Selasa,0812
2,Email user 1,0838xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:15:00,11,14,Selasa,0,11,14,Selasa,0838
3,Email user 1,0858xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:28:00,11,14,Selasa,0,11,14,Selasa,0858
4,Email user 1,0858xxxxxxxx,delivered,2023-02-14 11:13:00,2023-02-14 11:13:00,11,14,Selasa,0,11,14,Selasa,0858
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16064899,Email user 238,0811xxxxxxxx,submitted,2023-03-28 09:06:00,2023-05-10 13:38:00,13,10,Rabu,0,9,28,Selasa,0811
16064900,Email user 238,0813xxxxxxxx,submitted,2023-03-28 09:06:00,2023-05-10 13:38:00,13,10,Rabu,0,9,28,Selasa,0813
16064901,Email user 238,0812xxxxxxxx,submitted,2023-03-28 08:57:00,2023-05-10 13:38:00,13,10,Rabu,0,8,28,Selasa,0812
16064902,Email user 238,91902xxxxxxxx,submitted,2023-03-28 09:06:00,2023-05-10 13:38:00,13,10,Rabu,0,9,28,Selasa,9190


In [8]:
def provider(df):
    
    if (df['Nomor Telepon Tujuan Prefix'] == '0813' 
        or df['Nomor Telepon Tujuan Prefix'] == '0812'
        or df['Nomor Telepon Tujuan Prefix'] == '0821'
        or df['Nomor Telepon Tujuan Prefix'] == '0852'
        or df['Nomor Telepon Tujuan Prefix'] == '0822'
        or df['Nomor Telepon Tujuan Prefix'] == '0823'
        or df['Nomor Telepon Tujuan Prefix'] == '0853'
        or df['Nomor Telepon Tujuan Prefix'] == '0811'
        or df['Nomor Telepon Tujuan Prefix'] == '0851'
        ):
        return 'Telkomsel'
    elif(df['Nomor Telepon Tujuan Prefix'] == '0857' 
        or df['Nomor Telepon Tujuan Prefix'] == '0858'
        or df['Nomor Telepon Tujuan Prefix'] == '0856'
        or df['Nomor Telepon Tujuan Prefix'] == '0815'
        or df['Nomor Telepon Tujuan Prefix'] == '0816'
        or df['Nomor Telepon Tujuan Prefix'] == '0855'
        # or df['Nomor Telepon Tujuan Prefix'] == '0850'
        or df['Nomor Telepon Tujuan Prefix'] == '0814'
        ):
        return 'Indosat'
    elif(df['Nomor Telepon Tujuan Prefix'] == '0895' 
        or df['Nomor Telepon Tujuan Prefix'] == '0896'
        or df['Nomor Telepon Tujuan Prefix'] == '0899'
        or df['Nomor Telepon Tujuan Prefix'] == '0898'
        or df['Nomor Telepon Tujuan Prefix'] == '0897'
        or df['Nomor Telepon Tujuan Prefix'] == '0890'
        ):
        return 'Tri'
    elif(df['Nomor Telepon Tujuan Prefix'] == '0878' 
        or df['Nomor Telepon Tujuan Prefix'] == '0877'
        or df['Nomor Telepon Tujuan Prefix'] == '0838'
        or df['Nomor Telepon Tujuan Prefix'] == '0819'
        or df['Nomor Telepon Tujuan Prefix'] == '0818'
        or df['Nomor Telepon Tujuan Prefix'] == '0859'
        or df['Nomor Telepon Tujuan Prefix'] == '0817'
        ):
        return 'XL'
    elif(df['Nomor Telepon Tujuan Prefix'] == '0838' 
        or df['Nomor Telepon Tujuan Prefix'] == '0831'
        or df['Nomor Telepon Tujuan Prefix'] == '0832'
        ):
        return 'Axis'
    elif(df['Nomor Telepon Tujuan Prefix'] == '0882' 
        or df['Nomor Telepon Tujuan Prefix'] == '0831'
        or df['Nomor Telepon Tujuan Prefix'] == '0881'
        or df['Nomor Telepon Tujuan Prefix'] == '0888'
        or df['Nomor Telepon Tujuan Prefix'] == '0889'
        or df['Nomor Telepon Tujuan Prefix'] == '0887'
        ):
        return 'Smartfren'
    else:
        return 'other'

df['provider'] = df.apply(provider, axis = 1)
df.head()

Unnamed: 0,Email User,Nomor Telepon Tujuan,Status,Date,Last Update Time,jam last update,tanggal last update,hari last update,isWeekend last update,jam,tanggal,hari,Nomor Telepon Tujuan Prefix,provider
0,Email user 1,0821xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:14:00,11,14,Selasa,0,11,14,Selasa,821,Telkomsel
1,Email user 1,0812xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:17:00,11,14,Selasa,0,11,14,Selasa,812,Telkomsel
2,Email user 1,0838xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:15:00,11,14,Selasa,0,11,14,Selasa,838,XL
3,Email user 1,0858xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:28:00,11,14,Selasa,0,11,14,Selasa,858,Indosat
4,Email user 1,0858xxxxxxxx,delivered,2023-02-14 11:13:00,2023-02-14 11:13:00,11,14,Selasa,0,11,14,Selasa,858,Indosat


In [9]:
df['selisih'] = df['Last Update Time'] - df['Date']
df

Unnamed: 0,Email User,Nomor Telepon Tujuan,Status,Date,Last Update Time,jam last update,tanggal last update,hari last update,isWeekend last update,jam,tanggal,hari,Nomor Telepon Tujuan Prefix,provider,selisih
0,Email user 1,0821xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:14:00,11,14,Selasa,0,11,14,Selasa,0821,Telkomsel,0 days 00:01:00
1,Email user 1,0812xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:17:00,11,14,Selasa,0,11,14,Selasa,0812,Telkomsel,0 days 00:04:00
2,Email user 1,0838xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:15:00,11,14,Selasa,0,11,14,Selasa,0838,XL,0 days 00:02:00
3,Email user 1,0858xxxxxxxx,read,2023-02-14 11:13:00,2023-02-14 11:28:00,11,14,Selasa,0,11,14,Selasa,0858,Indosat,0 days 00:15:00
4,Email user 1,0858xxxxxxxx,delivered,2023-02-14 11:13:00,2023-02-14 11:13:00,11,14,Selasa,0,11,14,Selasa,0858,Indosat,0 days 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16064899,Email user 238,0811xxxxxxxx,submitted,2023-03-28 09:06:00,2023-05-10 13:38:00,13,10,Rabu,0,9,28,Selasa,0811,Telkomsel,43 days 04:32:00
16064900,Email user 238,0813xxxxxxxx,submitted,2023-03-28 09:06:00,2023-05-10 13:38:00,13,10,Rabu,0,9,28,Selasa,0813,Telkomsel,43 days 04:32:00
16064901,Email user 238,0812xxxxxxxx,submitted,2023-03-28 08:57:00,2023-05-10 13:38:00,13,10,Rabu,0,8,28,Selasa,0812,Telkomsel,43 days 04:41:00
16064902,Email user 238,91902xxxxxxxx,submitted,2023-03-28 09:06:00,2023-05-10 13:38:00,13,10,Rabu,0,9,28,Selasa,9190,other,43 days 04:32:00


In [10]:
df.to_csv('../../Data/wa/Data gede masked olahan.csv', index=False)