## Imports

In [1]:
import os
import json

from ast import literal_eval

import pandas as pd
import numpy as np

## Constants

In [2]:
# Directories, relative to the notebook's working directory.
RAW_DIR = '../data/raw'
PROCESSED_DIR = '../data/processed'

# Input file read by this notebook and the CSV it writes at the end.
RAW_PATH = os.path.join(RAW_DIR, 'wiki_edits.json')
PROCESSED_PATH = os.path.join(PROCESSED_DIR, 'wiki_edits.csv')


## Load data

In [3]:
# Read the raw dump. The encoding is given explicitly because open() would
# otherwise use the platform's default encoding, which is not UTF-8 everywhere
# and can mis-decode or fail on non-ASCII text.
with open(RAW_PATH, encoding='utf-8') as json_file:
    data = json.load(json_file)

In [4]:
def _parse_record(raw):
    """Turn one serialized record into a Python dict.

    The record is parsed as JSON first. That keeps `null` / `true` / `false`
    handling confined to real JSON literals, so text such as ": true" inside
    a string value is left untouched, and it applies JSON string-escape rules
    (e.g. surrogate-pair escapes are combined into a single character).

    If the record is not valid JSON, fall back to the previous behaviour:
    rewrite the JSON literals to their Python spelling and evaluate the
    result as a Python literal.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        python_literal = (
            raw
            .replace(": null", ": None")
            .replace(": false", ": False")
            .replace(": true", ": True")
        )
        return literal_eval(python_literal)


# Each element of `data` is one serialized record; parse them all, then build
# a single DataFrame in one go.
data_dict = [_parse_record(i) for i in data]

data = pd.DataFrame(data_dict)
print(len(data))
data.head()

186430


Unnamed: 0,action,change_size,flags,hashtags,is_anon,is_bot,is_minor,is_new,is_unpatrolled,mentions,ns,page_title,parent_rev_id,parsed_summary,rev_id,section,summary,url,user,geo_ip
0,edit,1441.0,,[],False,False,False,False,False,[],User,User:Bluesymamal417/sandbox,1077811974,,1077292551,,,https://en.wikipedia.org/w/index.php?diff=1077...,Bluesymamal417,
1,edit,2.0,,[],False,False,False,False,False,[],Main,List of NAIA conferences,1077811976,,1077811715,,,https://en.wikipedia.org/w/index.php?diff=1077...,Santiago Claudio,
2,edit,11.0,MB,[],False,True,True,False,False,[],User talk,User talk:76.64.60.162,1077811977,Fixed [[WP:LINT|Lint errors]]. ([[User:Malnada...,427366561,,Fixed [[WP:LINT|Lint errors]]. ([[User:Malnada...,https://en.wikipedia.org/w/index.php?diff=1077...,MalnadachBot,
3,edit,-6.0,M,[],False,False,True,False,False,[],Main,Conrad Kennedy III,1077811978,v2.04b - Fix errors for [[WP:WCW|CW project]] ...,1073851861,,v2.04b - Fix errors for [[WP:WCW|CW project]] ...,https://en.wikipedia.org/w/index.php?diff=1077...,ZI Jony,
4,edit,4.0,M,[],False,False,True,False,False,[],Template,Template:2011–12 in Greek football,1077811979,,1077811868,,,https://en.wikipedia.org/w/index.php?diff=1077...,BEN917,


In [5]:
# Treat an empty `section` string as missing, then fill every missing value
# pandas recognises with NaN so all columns share one missing-value marker.
data = (
    data
    .assign(section=lambda frame: frame['section'].replace('', np.nan))
    .fillna(value=np.nan)
)

## EDA

In [6]:
# Fraction of missing values in each column, shown from least to most missing.
missing_share = data.isnull().mean()
missing_share.sort_values(ascending=True)

action            0.000000
page_title        0.000000
ns                0.000000
user              0.000000
is_unpatrolled    0.000000
is_new            0.000000
mentions          0.000000
is_bot            0.000000
is_anon           0.000000
hashtags          0.000000
is_minor          0.000000
change_size       0.100274
rev_id            0.100274
url               0.100274
parent_rev_id     0.125334
parsed_summary    0.144097
summary           0.144097
flags             0.641517
section           0.725425
geo_ip            0.867747
dtype: float64

In [7]:
# `mentions` and `hashtags` are never null, so measure how often they are
# empty (length 0) instead.
n_rows = len(data)

empty_mentions = (data["mentions"].str.len() == 0).sum()
print(f'mentions: {empty_mentions/n_rows}')

empty_hashtags = (data["hashtags"].str.len() == 0).sum()
print(f'hashtags: {empty_hashtags/n_rows}')

mentions: 0.9916000643673228
hashtags: 0.9855119884138819


In [8]:
# Bucket every column by dtype:
#   * object dtype            -> categorical_cols
#   * boolean dtype           -> bool_cols
#   * any other numeric dtype -> numerical_cols
# The numeric test uses the pandas dtype predicate rather than an exact
# 'float64' match, so integer or float32 columns are not silently left out of
# all three lists. Bool is tested before numeric because pandas classifies
# bool as a numeric dtype.
categorical_cols = []
numerical_cols = []
bool_cols = []
for col_name, col_dtype in data.dtypes.items():
    if col_dtype == 'object':
        categorical_cols.append(col_name)
    elif pd.api.types.is_bool_dtype(col_dtype):
        bool_cols.append(col_name)
    elif pd.api.types.is_numeric_dtype(col_dtype):
        numerical_cols.append(col_name)

# Report each bucket: its size, its members, then a blank spacer.
for label, cols in (
    ('categorical_cols', categorical_cols),
    ('numerical_cols', numerical_cols),
    ('bool_cols', bool_cols),
):
    print(f'Number of {label}: {len(cols)}')
    print(f'{cols}')
    print('\n')

Number of categorical_cols: 14
['action', 'flags', 'hashtags', 'mentions', 'ns', 'page_title', 'parent_rev_id', 'parsed_summary', 'rev_id', 'section', 'summary', 'url', 'user', 'geo_ip']


Number of numerical_cols: 1
['change_size']


Number of bool_cols: 5
['is_anon', 'is_bot', 'is_minor', 'is_new', 'is_unpatrolled']




In [9]:
# Print the frequency table of every column, one after another, each preceded
# by the column name and followed by a blank spacer.
for col in data.columns.tolist():
    print(f'{col}:\n')
    print(f'{data[col].value_counts()}')
    print(f'\n')

action:

edit             167736
hit                5825
block              3270
create             2915
patrol             1086
delete             1063
reblock            1062
thank              1053
move                810
reviewed            746
upload              114
revision             93
delete_redir         90
move_redir           90
approve              81
tag                  79
overwrite            71
protect              53
byemail              42
renameuser           37
insert               36
create2              18
autopromote          11
restore              11
unprotect             6
modify                6
rights                5
config                4
unreviewed            3
unapprove             3
delete_redir2         2
unblock               2
revert                2
move_prot             2
reset                 1
event                 1
move_stable           1
Name: action, dtype: int64


change_size:

 0.0        11610
 14.0        6484
-18.0        4468
 1.0  

{'city': None, 'country_name': 'United States', 'latitude': 37.751, 'longitude': -97.822, 'region_name': None}                     1357
{'city': None, 'country_name': 'United Kingdom', 'latitude': 54.0, 'longitude': -2.0, 'region_name': None}                          389
{'city': None, 'country_name': 'India', 'latitude': 20.0, 'longitude': 77.0, 'region_name': None}                                   312
{'city': None, 'country_name': 'United Kingdom', 'latitude': 51.4964, 'longitude': -0.1224, 'region_name': None}                    272
{'city': 'Athens', 'country_name': 'Greece', 'latitude': 37.9842, 'longitude': 23.7353, 'region_name': 'Attica'}                    156
                                                                                                                                   ... 
{'city': 'Kimbolton', 'country_name': 'United Kingdom', 'latitude': 52.3325, 'longitude': -0.1844, 'region_name': 'England'}          1
{'city': 'Houston', 'country_name': 'United Stat

## Save data

In [10]:
def _to_utf8_text(value):
    """Return `value` as UTF-8-safe text, keeping missing values missing.

    Missing scalars (None, NaN, pd.NA) are returned as NaN. The check is done
    per type instead of with `value == np.nan`, because NaN never compares
    equal to anything (itself included), so that comparison is always False
    and would turn every missing value into the string 'nan'. `pd.isna` is
    not called directly because cells may hold lists or dicts, for which it
    returns an array rather than a single bool.

    Everything else is converted with `str()` and round-tripped through UTF-8
    with errors='replace', which substitutes '?' for characters that cannot be
    encoded (e.g. lone surrogates).
    """
    if value is None or value is pd.NA:
        return np.nan
    if isinstance(value, float) and np.isnan(value):
        return np.nan
    return str(value).encode('utf-8', 'replace').decode('utf-8')


for col in categorical_cols:
    data[col] = data[col].apply(_to_utf8_text)

In [11]:
# Create the output directory if it is missing; to_csv does not create parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(PROCESSED_DIR, exist_ok=True)
data.to_csv(PROCESSED_PATH, index=False, encoding='utf-8')