In [2]:
import xml.etree.ElementTree as ET
from tqdm import tqdm 
import pandas as pd

# XML file parsed for both the <sms> and the <mms> elements (path is relative
# to the notebook's working directory).
SMS_DATA = 'data/sms_conversation.xml'
# XML file parsed for the call log entries.
CALL_DATA = 'data/nathans_calls.xml'

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [33]:
# Get sms messages from xml and export the conversation to CSV.
tree = ET.parse(SMS_DATA)
root = tree.getroot()

# Columns that must exist in the frame even when the file has no <sms> element.
sms_columns = ['protocol', 'address', 'date', 'type', 'subject', 'body', 'toa', 'sc_toa', 'service_center', 'read', 'status', 'locked', 'date_sent', 'sub_id', 'readable_date', 'contact_name']

# Collect the attribute dicts first and build the DataFrame once: calling
# pd.concat inside the loop copies every accumulated row on each iteration.
sms_records = [child.attrib for child in tqdm(root) if child.tag == 'sms']
df = pd.DataFrame(sms_records)
# Put the expected columns first (creating any that are missing) and keep any
# additional attributes found in the file after them.
df = df.reindex(columns=sms_columns + [c for c in df.columns if c not in sms_columns])

# `type` holds strings; '2' is attributed to Nathan, every other value to Bernice.
df['author'] = df['type'].apply(lambda x: 'Nathan' if x == '2' else 'Bernice')

df[['date', 'author', 'body', 'readable_date']].to_csv('parsed_data/sms_conversation.csv', index=False)

100%|██████████| 21819/21819 [00:23<00:00, 935.23it/s]


In [59]:
# Get mms messages from xml and export date/author/readable_date to CSV.
tree = ET.parse(SMS_DATA)
root = tree.getroot()

# Columns that must exist in the frame even when the file has no <mms> element.
mms_columns = ['date', 'spam_report', 'ct_t', 'msg_box', 'address', 'sub_cs', 're_type', 'retr_st', 're_original_body', 'd_tm', 'exp', 'locked', 'msg_id', 'app_id', 'from_address', 'm_id', 'retr_txt', 'date_sent', 'read', 'rpt_a', 'ct_cls', 'bin_info', 'pri', 'sub_id', 're_content_type', 'object_id', 'resp_txt', 're_content_uri', 'ct_l', 're_original_key', 'd_rpt', 'reserved', 'using_mode', '_id', 'rr_st', 'm_type', 'favorite', 'rr', 'sub', 'hidden', 'deletable', 'read_status', 'd_rpt_st', 'callback_set', 'seen', 're_recipient_address', 'device_name', 'cmc_prop', 'resp_st', 'text_only', 'sim_slot', 'st', 'retr_txt_cs', 'creator', 'm_size', 'sim_imsi', 'correlation_tag', 're_body', 'safe_message', 'tr_id', 'm_cls', 'v', 'secret_mode', 're_file_name', 're_count_info', 'readable_date', 'contact_name']

# Collect the attribute dicts first and build the DataFrame once: calling
# pd.concat inside the loop copies every accumulated row on each iteration.
mms_records = [child.attrib for child in tqdm(root) if child.tag == 'mms']
df = pd.DataFrame(mms_records)
# Put the expected columns first (creating any that are missing) and keep any
# additional attributes found in the file after them.
df = df.reindex(columns=mms_columns + [c for c in df.columns if c not in mms_columns])

# `msg_box` holds strings; '2' is attributed to Nathan, every other value to Bernice.
df['author'] = df['msg_box'].apply(lambda x: 'Nathan' if x == '2' else 'Bernice')
df[['date', 'author', 'readable_date']].to_csv('parsed_data/mms_conversation.csv', index=False)


100%|██████████| 21819/21819 [00:00<00:00, 30056.04it/s] 


In [61]:
# Summarise every column of `df`: list the distinct values when there are
# fewer than 10 of them, otherwise report only how many there are.
for name in df.columns:
    values = df[name]
    distinct = values.nunique()
    summary = f'{name}: {values.unique()}' if distinct < 10 else f'{name}: {distinct} unique values'
    print(summary)

date: 890 unique values
spam_report: ['0']
ct_t: ['application/vnd.wap.multipart.related'
 'application/vnd.wap.multipart.mixed']
msg_box: ['1' '2']
address: ['+14159198023']
sub_cs: ['null' '3' '106']
re_type: ['0']
retr_st: ['128' 'null']
re_original_body: ['null']
d_tm: ['null' '0']
exp: ['null' '604800']
locked: ['0']
msg_id: ['0']
app_id: ['0']
from_address: ['null']
m_id: 892 unique values
retr_txt: ['null']
date_sent: 584 unique values
read: ['1']
rpt_a: ['null']
ct_cls: ['null']
bin_info: ['0']
pri: ['null' '129']
sub_id: ['1' '-1']
re_content_type: ['null']
object_id: ['null']
resp_txt: ['null']
re_content_uri: ['null']
ct_l: 587 unique values
re_original_key: ['null']
d_rpt: ['null' '129' '128']
reserved: ['0']
using_mode: ['0']
_id: 892 unique values
rr_st: ['0']
m_type: ['132' '128']
favorite: ['0']
rr: ['null' '129']
sub: ['null' 'NoSubject' 'Fwd:']
hidden: ['0']
deletable: ['0']
read_status: ['null']
d_rpt_st: ['0']
callback_set: ['0']
seen: ['1']
re_recipient_address: ['

In [26]:
# Get call log entries from xml, keep the calls with one contact and export
# them to CSV.
tree = ET.parse(CALL_DATA)
root = tree.getroot()

# Columns that must exist in the frame even when the log is empty.
call_columns = ['number', 'duration', 'date', 'type', 'presentation', 'subscription_id', 'post_dial_digits', 'subscription_component_name', 'readable_date', 'contact_name']

# Collect the attribute dicts first and build the DataFrame once: calling
# pd.concat inside the loop copies every accumulated row on each iteration.
call_records = [child.attrib for child in tqdm(root)]
df = pd.DataFrame(call_records)
# Put the expected columns first (creating any that are missing) and keep any
# additional attributes found in the file after them.
df = df.reindex(columns=call_columns + [c for c in df.columns if c not in call_columns])

types = {   # just for reference
    '1': 'Incoming',
    '2': 'Outgoing',
    '3': 'Missed',
    '5': 'Voicemail'
}

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered data (avoids SettingWithCopyWarning).
# The original row index is kept on purpose so the preview shows log positions.
df = df[df['contact_name'] == 'Bernice Lau'].copy()
# Types '2' and '5' are attributed to Nathan, every other type to Bernice.
df['caller'] = df['type'].isin(['2', '5']).map({True: 'Nathan', False: 'Bernice'})
# Types '3' and '5' are flagged as missed.
df['missed'] = df['type'].isin(['3', '5'])

df[['date', 'caller', 'duration', 'missed', 'readable_date']].to_csv('parsed_data/call_log.csv', index=False)
df[['date', 'caller', 'duration', 'missed', 'readable_date']].head(50)

100%|██████████| 791/791 [00:00<00:00, 4503.22it/s]


Unnamed: 0,date,caller,duration,missed,readable_date
464,1661713803824,Nathan,159,False,"Aug 28, 2022 12:10:03 PM"
466,1662691102852,Bernice,11,False,"Sep 8, 2022 7:38:22 PM"
468,1662954191491,Nathan,61,False,"Sep 11, 2022 8:43:11 PM"
475,1664249799488,Nathan,52,False,"Sep 26, 2022 8:36:39 PM"
488,1666239565201,Nathan,27,False,"Oct 19, 2022 9:19:25 PM"
489,1666239627231,Nathan,6,False,"Oct 19, 2022 9:20:27 PM"
504,1669175557695,Nathan,3233,False,"Nov 22, 2022 7:52:37 PM"
507,1669704002042,Nathan,13,False,"Nov 28, 2022 10:40:02 PM"
508,1669704104213,Nathan,8,False,"Nov 28, 2022 10:41:44 PM"
516,1670047572111,Nathan,93,False,"Dec 2, 2022 10:06:12 PM"
