### Creating corpus - combine revision text data with blocks data

In [1]:
# import necessary packages
import os
import pandas as pd
import numpy as np
import re

# set options
# Never truncate long text cells when frames are displayed. `None` is the
# documented "no limit" value for this option; the legacy `-1` sentinel was
# deprecated in pandas 1.0. The previous `max_colwidth = 50` assignment was
# removed because it was overridden on the very next line.
pd.set_option('display.max_colwidth', None)
# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
#2219

#### Process Blocks data 

In [2]:
# Load the ipblocks dump and discard its first column
# (presumably an index column written by an earlier export — TODO confirm).
ipblocks_path = "/home/ec2-user/SageMaker/s3fs-fuse/bucket/wiki_trust/ipblocks_fulldump_new.csv"
df_ipblocks = pd.read_csv(ipblocks_path)
first_column = df_ipblocks.columns[0]
df_ipblocks = df_ipblocks.drop(first_column, axis=1)
df_ipblocks.shape

(1130485, 7)

In [3]:
# Normalise column types. `ipb_expiry` is left untouched here, so it is not
# usable as a timestamp afterwards.
df_ipblocks['ipb_address'] = df_ipblocks['ipb_address'].astype(str)  # ipaddress -> string
df_ipblocks['date'] = pd.to_datetime(df_ipblocks['date'], format="%Y%m%d")

# Missing block reasons become empty strings so the .str accessors used later
# never see NaN.
df_ipblocks['ipb_reason'] = df_ipblocks['ipb_reason'].replace(to_replace=np.nan, value='', regex=True)

#### Regex extraction of block keywords

In [4]:
def _extract_keywords(reasons, pattern):
    """Return the comma-joined captures of ``pattern`` for every matching reason.

    Parameters
    ----------
    reasons : pandas.Series
        Block-reason strings.
    pattern : str
        Regular expression containing exactly one capture group.

    Returns
    -------
    pandas.Series
        Indexed by the labels of the rows that matched at least once; each
        value is that row's captures joined with ','. Rows without any match
        are absent (they become NaN once aligned with the full frame).
    """
    captures = reasons.str.extractall(pattern)[0]
    # Level 0 of the extractall index is the original row label; grouping on it
    # avoids building the wide unstacked frame and also works when nothing matched.
    return captures.groupby(level=0).agg(','.join)


# Policy shortcuts written as [[WP:...]].
key1 = _extract_keywords(df_ipblocks['ipb_reason'], r'\[\[WP:(.*?)\]\]')
# Templates written as {{...}}. The capture is non-greedy (like the WP pattern)
# so that a reason holding several templates yields one keyword per template
# instead of a single capture running from the first "{{" to the last "}}".
key2 = _extract_keywords(df_ipblocks['ipb_reason'], r'\{\{(.*?)\}\}')

# The two keyword columns are labelled 0 and 1, which is what concatenating the
# unnamed Series produced before; the later positional rename depends only on
# their position.
df_ipblocks = pd.concat([df_ipblocks, key1.rename(0), key2.rename(1)], axis=1)

In [5]:
# Reasons that link to a user page ([[User:...]]) are labelled "Sock Puppetry".
# na=False treats a missing reason as "no link" instead of propagating NaN into the mask.
is_user_link = df_ipblocks['ipb_reason'].str.contains(r"\[\[User:.*\]\]", regex=True, na=False)

# Object dtype so that the string label can be stored next to the NaN placeholder.
df_ipblocks["key3"] = pd.Series(np.nan, index=df_ipblocks.index, dtype="object")
# Single .loc assignment instead of chained indexing (df[col][mask] = ...), which
# may write to a temporary copy and leave the frame unchanged.
df_ipblocks.loc[is_user_link, "key3"] = "Sock Puppetry"

In [6]:
# to get word count list
# NOTE(review): this whole cell is disabled — every line is commented out, so
# it does nothing when the notebook runs. The draft below uses `stoplist` and
# `csv`, neither of which is defined or imported in the cells visible here;
# both would have to be provided before re-enabling it.
#blocks = df_ipblocks[df_ipblocks['ipb_reason'].notnull()]

#import re
#def pre_process(text):
#
#    # lowercase
#    text=text.lower()
#    # remove special characters and digits
#    text=re.sub("(\\d|\\W)+"," ",text)
#
#    return text

#blocks['ipb_reason'] = blocks['ipb_reason'].apply(lambda x:pre_process(x))


#textlist = blocks['ipb_reason']
#wordlist = [word for line in textlist for word in line.split() if word not in stoplist]
#from collections import Counter
#counts = Counter(wordlist)
#print(counts)

#with open('/home/ec2-user/SageMaker/s3fs-fuse/bucket/wiki_trust/blockswordcount.csv', 'w',newline='') as csv_file:
#    writer = csv.writer(csv_file)
#    for key, value in counts.items():
#        writer.writerow([key, value])

#### Extract keywords from ipb_reason based on list of known words

In [7]:
# Preview the first rows, including the extracted keyword columns (0, 1, key3).
df_ipblocks.head()

Unnamed: 0,ipb_id,ipb_user,ipb_address,ipb_create_account,ipb_expiry,date,ipb_reason,0,1,key3
0,1623,9418,Wanli2,1,inf,2004-02-08,"Obvious reincarnation of [[User:Wanli]], banned months ago for using Wikipedia as a personal file-storage area",,,Sock Puppetry
1,1624,17254,Xuepiao,1,inf,2004-02-08,"Reincarnation of [[User:Wanli]], banned months ago for using Wikipedia as a personal file-storage area",,,Sock Puppetry
2,1625,6959,GrandVoivodOfErdely,1,inf,2004-02-08,"Probable reincarnation of banned [[User:Wanli]], and no edits except for using Wikipedia as a personal file-storage area in any case.",,,Sock Puppetry
3,1706,3107,#1703,1,inf,2004-02-13,testing,,,
4,2616,43305,Wanli3,1,inf,2004-03-14,"Reincarnation of [[User:Wanli]], banned months ago for using Wikipedia as a personal file-storage area",,,Sock Puppetry


In [8]:
# clean df: drop the block-metadata columns that are not used afterwards
unused_cols = ['ipb_id', 'ipb_create_account', 'ipb_expiry']
df_ipblocks = df_ipblocks.drop(labels=unused_cols, axis=1)
# New names are assigned by position, so this list must follow the frame's
# current column order.
df_ipblocks.columns = ['userid', 'username', 'blockdate', 'reason', 'k1', 'k2', 'k3']

In [9]:
# Combine the three keyword columns into one comma-separated `keyreason`.
cols = ['k1', 'k2', 'k3']
# Missing keywords become empty strings so they can be concatenated.
keys = df_ipblocks[cols].fillna('').astype(str)
joined = keys['k1'].str.cat([keys['k2'], keys['k3']], sep=',')
# Empty slots leave runs of commas and leading/trailing commas behind
# (e.g. "a,," or ",b,"); collapse the runs and strip both ends. The previous
# version only removed a leading comma, so values such as "a," were kept.
df_ipblocks['keyreason'] = (
    joined
    .str.replace(r',{2,}', ',', regex=True)
    .str.strip(',')
)
df_ipblocks = df_ipblocks.drop(columns=cols)

In [10]:
# Preview the cleaned blocks frame with the combined `keyreason` column.
df_ipblocks.head()

Unnamed: 0,userid,username,blockdate,reason,keyreason
0,9418,Wanli2,2004-02-08,"Obvious reincarnation of [[User:Wanli]], banned months ago for using Wikipedia as a personal file-storage area",Sock Puppetry
1,17254,Xuepiao,2004-02-08,"Reincarnation of [[User:Wanli]], banned months ago for using Wikipedia as a personal file-storage area",Sock Puppetry
2,6959,GrandVoivodOfErdely,2004-02-08,"Probable reincarnation of banned [[User:Wanli]], and no edits except for using Wikipedia as a personal file-storage area in any case.",Sock Puppetry
3,3107,#1703,2004-02-13,testing,
4,43305,Wanli3,2004-03-14,"Reincarnation of [[User:Wanli]], banned months ago for using Wikipedia as a personal file-storage area",Sock Puppetry


In [11]:
#df_ipblocks[df_ipblocks['reason'].str.contains("\[\[User:.*\]\]",regex=True)]

#### Process XML data

Split Contributor into UserID and Username

In [12]:
# Load the tab-separated revision text extracted from the XML dump.
revtxt_path = '/home/ec2-user/SageMaker/s3fs-fuse/bucket/wiki_trust/xml_dump_processed/revision_text_data_final3.txt'
df_revtxt = pd.read_csv(revtxt_path, sep='\t')
df_revtxt.shape

(892010, 8)

In [13]:
#df_revtxt.head(20)
#df_revtxt.NAMESPACE.value_counts()

# split CONTRIBUTOR ("Contributor(id=<id>, user_text='<name>')") into userid and username
# Split on the first comma only, so commas inside the user name are preserved.
namesplit = df_revtxt["CONTRIBUTOR"].str.split(",", n=1, expand=True)

# USERID: remove the literal "Contributor(id=" prefix. regex=False makes this a
# plain-text replacement; the previous pattern "Contributor\(id=" only worked
# while str.replace treated patterns as regular expressions by default.
df_revtxt["USERID"] = (
    namesplit[0]
    .str.replace("Contributor(id=", "", regex=False)
    .str.strip()
)

# username: remove the "user_text=" prefix, then the opening quote and the
# closing quote + ")".
# NOTE(review): presumably the name is quoted with either ' or " (names that
# contain an apostrophe) — slicing handles both; confirm against the raw data.
raw_name = namesplit[1].str.strip().str.replace("user_text=", "", regex=False)
df_revtxt["username"] = raw_name.str[1:-2].str.strip()

df_revtxt.head()

Unnamed: 0,NAMESPACE,CONTRIBUTOR,TITLE,PAGE_ID,REVISION_ID,TIMESTAMP,TEXT,DIFF_TEXT,USERID,username
0,1,"Contributor(id=1662958, user_text='Kleomarlo')",Talk:Philippine habeas corpus cases,19436942,240406734,20080923101045,{{WPPhilippines|class=Start|importance=High}},,1662958,Kleomarlo
1,1,"Contributor(id=1662958, user_text='Kleomarlo')",Talk:Philippine Heart Center,19436944,240406770,20080923101100,{{WPPhilippines|class=Stub|importance=Mid}},,1662958,Kleomarlo
2,1,"Contributor(id=9699525, user_text='Hariboneagle927')",Talk:Philippine Heart Center,19436944,577410606,20131016103933,{{WikiProject Hospitals |class =stub |importance = |attention = |needs-infobox =yes |needs-photo = |needs-coord =yes }}\n{{WPPhilippines|class=Stub|importance=Mid}},+ {{WikiProject Hospitals |class =stub |importance = |attention = |needs-infobox =yes |needs-photo = |needs-coord =yes }},9699525,Hariboneagle927
3,1,"Contributor(id=1662958, user_text='Kleomarlo')",Talk:Philippine hawk-cuckoo,19436945,240406781,20080923101105,{{WPPhilippines|class=Stub|importance=Mid}},,1662958,Kleomarlo
4,1,"Contributor(id=5798, user_text='Shyamal')",Talk:Philippine hawk-cuckoo,19436945,407446390,20110112091919,{{WPPhilippines|class=Stub|importance=Mid}},,5798,Shyamal


In [14]:
# Keep only revisions whose author appears in the blocks table (inner join on
# username); a revision is repeated once per matching block row.
df_data = df_revtxt.merge(df_ipblocks, how='inner', on='username')

In [15]:
# (rows, columns) of the merged revisions-and-blocks frame.
df_data.shape

(53627, 14)

In [17]:
# Reduce TIMESTAMP to a calendar date: cast to text, keep the first eight
# characters (YYYYMMDD) and parse them.
revision_day = df_data['TIMESTAMP'].astype('str').str[0:8]  # timestamp -> string
df_data['TIMESTAMP'] = pd.to_datetime(revision_day, format="%Y%m%d")

In [18]:
# Drop the columns that are not part of the corpus. The lowercase `userid`
# came from the blocks table; the revision-side `USERID` is kept.
redundant_cols = ['CONTRIBUTOR', 'PAGE_ID', 'REVISION_ID', 'TEXT', 'userid']
df_data = df_data.drop(labels=redundant_cols, axis=1)
# TODO: check if userid doesn't match (presumably blocks `userid` vs revision `USERID`)

In [19]:
# Preview the merged frame after the unused columns were dropped.
df_data.head()

Unnamed: 0,NAMESPACE,TITLE,TIMESTAMP,DIFF_TEXT,USERID,username,blockdate,reason,keyreason
0,1,Talk:Boston College School of Theology and Ministry,2012-02-01,+ {{WikiProject Catholicism}} + {{WikiProject Boston}} + {{WikiProject Massachusetts|class=Start|importance=|auto=inherit}} + {{WikiProject Schools|class=start|importance=low|needs-infobox=yes}},15929118,Kumi-Taskbot,2012-04-10,Bot operator has abandoned his account and is now editing as an IP,
1,1,Talk:Boston College School of Theology and Ministry,2012-02-02,+ {{WikiProject United States|class=Start|importance=|MA=yes|MA-importance=|auto=inherit}},15929118,Kumi-Taskbot,2012-04-10,Bot operator has abandoned his account and is now editing as an IP,
2,1,Talk:Boston College School of Theology and Ministry,2012-02-09,+ {{WikiProject United States|class=|importance=|Boston=yes|Boston-importance=|MA=yes|MA-importance=|auto=inherit}},15929118,Kumi-Taskbot,2012-04-10,Bot operator has abandoned his account and is now editing as an IP,
3,1,Talk:Bob Hite (announcer),2011-12-26,+ {{WikiProjectBannerShell|1= + {{WikiProject United States|class=Start|importance=Low|IN=yes|IN-importance=Low||needs-infobox=yes}} + {{WikiProject Biography + }},15929118,Kumi-Taskbot,2012-04-10,Bot operator has abandoned his account and is now editing as an IP,
4,1,Talk:Chihuahuan Desert Research Institute,2012-02-14,+ {{WikiProject United States|class=Start|importance=Mid|TX=yes|TX-importance=Mid| }},15929118,Kumi-Taskbot,2012-04-10,Bot operator has abandoned his account and is now editing as an IP,


In [20]:
# Switch to the lowercase column names used in the corpus files.
corpus_names = {
    'NAMESPACE': 'namespace',
    'TIMESTAMP': 'revision_date',
    'DIFF_TEXT': 'text',
    'USERID': 'userid',
    'TITLE': 'title',
}
df_data = df_data.rename(columns=corpus_names)

In [21]:
# Put the columns in the order used by the corpus files, then preview.
column_order = [
    'userid', 'username', 'revision_date', 'namespace', 'title',
    'text', 'blockdate', 'keyreason', 'reason',
]
df_data = df_data.loc[:, column_order]
df_data.head()

Unnamed: 0,userid,username,revision_date,namespace,title,text,blockdate,keyreason,reason
0,15929118,Kumi-Taskbot,2012-02-01,1,Talk:Boston College School of Theology and Ministry,+ {{WikiProject Catholicism}} + {{WikiProject Boston}} + {{WikiProject Massachusetts|class=Start|importance=|auto=inherit}} + {{WikiProject Schools|class=start|importance=low|needs-infobox=yes}},2012-04-10,,Bot operator has abandoned his account and is now editing as an IP
1,15929118,Kumi-Taskbot,2012-02-02,1,Talk:Boston College School of Theology and Ministry,+ {{WikiProject United States|class=Start|importance=|MA=yes|MA-importance=|auto=inherit}},2012-04-10,,Bot operator has abandoned his account and is now editing as an IP
2,15929118,Kumi-Taskbot,2012-02-09,1,Talk:Boston College School of Theology and Ministry,+ {{WikiProject United States|class=|importance=|Boston=yes|Boston-importance=|MA=yes|MA-importance=|auto=inherit}},2012-04-10,,Bot operator has abandoned his account and is now editing as an IP
3,15929118,Kumi-Taskbot,2011-12-26,1,Talk:Bob Hite (announcer),+ {{WikiProjectBannerShell|1= + {{WikiProject United States|class=Start|importance=Low|IN=yes|IN-importance=Low||needs-infobox=yes}} + {{WikiProject Biography + }},2012-04-10,,Bot operator has abandoned his account and is now editing as an IP
4,15929118,Kumi-Taskbot,2012-02-14,1,Talk:Chihuahuan Desert Research Institute,+ {{WikiProject United States|class=Start|importance=Mid|TX=yes|TX-importance=Mid| }},2012-04-10,,Bot operator has abandoned his account and is now editing as an IP


In [22]:
# dump final dataframe to a tab-separated text file
# `header` lists the corpus columns in file order. It is now passed as
# `columns=` so the written schema is exactly this list (it was previously
# declared but never used).
header = ['userid','username','revision_date','namespace','title','text','blockdate','keyreason','reason']
df_data.to_csv(
    '/home/ec2-user/SageMaker/s3fs-fuse/bucket/wiki_trust/xml_dump_processed/blockcorpus3.txt',
    sep='\t',
    encoding='utf-8',
    columns=header,
    header=True,
    index=False,
)

#### Combine files into 1 block corpus

In [24]:
corpus_dir = "/home/ec2-user/SageMaker/s3fs-fuse/bucket/wiki_trust/xml_dump_processed/"

# Pick up only the numbered part files (blockcorpus1.txt, blockcorpus2.txt, ...).
# A plain startswith("blockcorpus") test also matches the combined output
# blockcorpus.txt written at the end of this cell, so re-running the cell would
# fold the previous result back in and duplicate every row. Sorting makes the
# read order deterministic (os.listdir returns names in arbitrary order).
file_list = sorted(
    x for x in os.listdir(corpus_dir)
    if re.fullmatch(r"blockcorpus\d+\.txt", x)
)
if not file_list:
    raise FileNotFoundError("no blockcorpus<N>.txt part files found in " + corpus_dir)

df_list = []
for file in file_list:
    print(file)
    df_list.append(pd.read_csv(corpus_dir + file, sep = '\t'))

# Fresh 0..n-1 index instead of the repeated per-file row numbers.
big_df = pd.concat(df_list, ignore_index=True)

# save the combined corpus as tab-separated text, with the columns in corpus order
header = ['userid','username','revision_date','namespace','title','text','blockdate','keyreason','reason']
big_df.to_csv(
    corpus_dir + 'blockcorpus.txt',
    sep='\t',
    encoding='utf-8',
    columns=header,
    header=True,
    index=False,
)

blockcorpus1.txt
blockcorpus2.txt
blockcorpus3.txt


#### Read Corpus

In [25]:
# Load the combined, tab-separated block corpus.
blockcorpus_path = '/home/ec2-user/SageMaker/s3fs-fuse/bucket/wiki_trust/xml_dump_processed/blockcorpus.txt'
df_blockcorpus = pd.read_csv(blockcorpus_path, sep='\t')
df_blockcorpus.shape

(286918, 9)

In [26]:
# Preview the first rows of the combined corpus.
df_blockcorpus.head()

Unnamed: 0,userid,username,revision_date,namespace,title,text,blockdate,keyreason,reason
0,16675924,Kumioko,2012-06-05,1,"Talk:Stockdale High School (Stockdale, Texas)",+ {{WikiProject United States|class=|importance=Low|TX=yes|TX-importance=Low}},2014-06-01,,User has been banned by the community - see [[User talk:KumiokoCleanStart]] for more information.<!-- Soft blocking this account so it's listed and recorded as being blocked -->
1,16675924,Kumioko,2012-06-05,1,"Talk:Stonebridge, Texas",+ {{WikiProject United States|class=|importance=Low|TX=yes|TX-importance=Low}},2014-06-01,,User has been banned by the community - see [[User talk:KumiokoCleanStart]] for more information.<!-- Soft blocking this account so it's listed and recorded as being blocked -->
2,16675924,Kumioko,2012-06-05,1,"Talk:Stratford High School (Stratford, Texas)",+ {{WikiProject United States|class=|importance=Low|TX=yes|TX-importance=Low}},2014-06-01,,User has been banned by the community - see [[User talk:KumiokoCleanStart]] for more information.<!-- Soft blocking this account so it's listed and recorded as being blocked -->
3,16675924,Kumioko,2012-06-05,1,Talk:Sulphur Springs High School,+ {{WikiProject United States|class=|importance=Low|TX=yes|TX-importance=Low}},2014-06-01,,User has been banned by the community - see [[User talk:KumiokoCleanStart]] for more information.<!-- Soft blocking this account so it's listed and recorded as being blocked -->
4,16675924,Kumioko,2012-10-12,1,Talk:Sulphur Springs High School,+ {{WikiProject United States|class=Stub|importance=Low|TX=yes|TX-importance=Low}},2014-06-01,,User has been banned by the community - see [[User talk:KumiokoCleanStart]] for more information.<!-- Soft blocking this account so it's listed and recorded as being blocked -->


In [27]:
# Number of corpus rows per namespace value.
df_blockcorpus.namespace.value_counts()

1    143583
3    143335
Name: namespace, dtype: int64

In [29]:
# Number of distinct usernames in the corpus.
df_blockcorpus.username.nunique()

18330

In [32]:
# list of blocked users: one row per distinct username in the corpus
df_name = df_blockcorpus.username.unique()
df_name = pd.DataFrame(df_name)
# save file as tab-separated text
# The frame's only column is labelled 0, so `header` supplies the label written
# to the file; previously header=True was passed and the file started with "0"
# while this list went unused.
header = ['username']
df_name.to_csv(
    '/home/ec2-user/SageMaker/s3fs-fuse/bucket/wiki_trust/xml_dump_processed/blockuserlist.txt',
    sep='\t',
    encoding='utf-8',
    header=header,
    index=False,
)