### Creating nonblock corpus

In [1]:
# import necessary packages
import os
import pandas as pd
import numpy as np
import re

# set options
# Show full cell contents (no truncation) when rendering frames. None is the
# supported "unlimited" value; the former sentinel -1 is deprecated since
# pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
# Silence SettingWithCopyWarning (pandas default is 'warn').
pd.options.mode.chained_assignment = None
#2219

#### Process Blocks data 

In [4]:
# Load the ipblocks dump, discard its first column, and report the
# resulting dimensions.
# NOTE(review): the first column is presumably a saved row index - confirm.
df_ipblocks = pd.read_csv("/home/ec2-user/SageMaker/bucket/wiki_trust/ipblocks_fulldump_new.csv")
first_col = df_ipblocks.columns[0]
df_ipblocks = df_ipblocks.drop(columns=[first_col])
df_ipblocks.shape

(1130485, 7)

In [5]:
# Convert object dtypes; keep in mind expiry with timestamp will be unusable.
df_ipblocks['ipb_address'] = df_ipblocks['ipb_address'].astype('str')  # ipaddress -> string
# 'date' is parsed with an explicit YYYYMMDD format.
df_ipblocks['date'] = pd.to_datetime(df_ipblocks['date'], format="%Y%m%d")

# Replace missing block reasons with empty strings. fillna is the direct tool
# for this; the previous replace(np.nan, '', regex=True) routed a non-string
# value through the regex replacement path.
df_ipblocks['ipb_reason'] = df_ipblocks['ipb_reason'].fillna('')

In [7]:
# Keep only the two identifying columns of the block list and give them
# lower-case names. The rename is positional, so it relies on exactly two
# columns remaining after the drop.
df_ipblocks = df_ipblocks.drop(
    ['ipb_id', 'ipb_create_account', 'ipb_expiry', 'date', 'ipb_reason'],
    axis=1,
)
df_ipblocks.columns = ['userid', 'username']

In [8]:
# Preview the first rows of the block list.
df_ipblocks.head()

Unnamed: 0,userid,username
0,9418,Wanli2
1,17254,Xuepiao
2,6959,GrandVoivodOfErdely
3,3107,#1703
4,43305,Wanli3


#### Process XML data

Split Contributor into UserID and Username

In [10]:
# Load the tab-separated revision text extracted from the XML dump and
# report its dimensions.
df_revtxt = pd.read_csv(
    '/home/ec2-user/SageMaker/bucket/wiki_trust/xml_dump_processed/revision_text_data_final1.txt',
    sep='\t',
)
df_revtxt.shape

(1683335, 8)

In [11]:
# Split CONTRIBUTOR, e.g. "Contributor(id=205121, user_text='Koavf')", at the
# first comma into an id part and a user-name part.
namesplit = df_revtxt["CONTRIBUTOR"].str.split(",", n=1, expand=True)
df_revtxt["USERID"] = namesplit[0]
df_revtxt["username"] = namesplit[1]

# Strip the literal wrappers around each part. The patterns are plain text,
# so regex=False is passed explicitly: the result then does not depend on the
# pandas version's default for `regex`, and no escaping of "(" is needed.
df_revtxt["USERID"] = (
    df_revtxt["USERID"]
    .str.replace("Contributor(id=", "", regex=False)
    .str.strip()
)

# Remove the "user_text='" prefix, then the trailing two characters ("')").
# NOTE(review): only the single-quoted form is stripped; a user_text written
# with double quotes would keep its prefix - verify against the data.
df_revtxt["username"] = (
    df_revtxt["username"]
    .str.replace("user_text='", "", regex=False)
    .str[:-2]
    .str.strip()
)

df_revtxt.head()

Unnamed: 0,NAMESPACE,CONTRIBUTOR,TITLE,PAGE_ID,REVISION_ID,TIMESTAMP,TEXT,DIFF_TEXT,USERID,username
0,1,"Contributor(id=205121, user_text='Koavf')","Talk:Stockdale High School (Stockdale, Texas)",35168384,483106491,20120321115934,{{WikiProject Texas}},,205121,Koavf
1,1,"Contributor(id=16675924, user_text='Kumioko')","Talk:Stockdale High School (Stockdale, Texas)",35168384,496151496,20120605183056,{{WikiProject United States|class=|importance=Low|TX=yes|TX-importance=Low}},+ {{WikiProject United States|class=|importance=Low|TX=yes|TX-importance=Low}},16675924,Kumioko
2,1,"Contributor(id=205121, user_text='Koavf')","Talk:Stonebridge, Texas",35168385,483106509,20120321115937,{{WikiProject Texas}},,205121,Koavf
3,1,"Contributor(id=16675924, user_text='Kumioko')","Talk:Stonebridge, Texas",35168385,496151525,20120605183113,{{WikiProject United States|class=|importance=Low|TX=yes|TX-importance=Low}},+ {{WikiProject United States|class=|importance=Low|TX=yes|TX-importance=Low}},16675924,Kumioko
4,1,"Contributor(id=205121, user_text='Koavf')","Talk:Stratford High School (Stratford, Texas)",35168388,483106582,20120321115952,{{WikiProject Texas}},,205121,Koavf


In [23]:
# Anti-join: keep only revisions whose username does not occur in the block
# list. The key is named explicitly; without `on`, merge joins on whatever
# column names the two frames happen to share.
df_merge = df_ipblocks.merge(df_revtxt, on='username', how='outer', indicator=True)

# 'right_only' marks rows that come from df_revtxt with no match in
# df_ipblocks. Copy the selection so that later column assignments operate on
# an independent frame rather than on a slice of df_merge.
df_data = df_merge[df_merge['_merge'] == 'right_only'].copy()
df_data.shape

(1549082, 12)

In [24]:
# Preview the revisions that had no matching entry in the block list.
df_data.head()

Unnamed: 0,userid,username,NAMESPACE,CONTRIBUTOR,TITLE,PAGE_ID,REVISION_ID,TIMESTAMP,TEXT,DIFF_TEXT,USERID,_merge
1255521,,Koavf,1.0,"Contributor(id=205121, user_text='Koavf')","Talk:Stockdale High School (Stockdale, Texas)",35168384.0,483106491.0,20120320000000.0,{{WikiProject Texas}},,205121,right_only
1255522,,Koavf,1.0,"Contributor(id=205121, user_text='Koavf')","Talk:Stonebridge, Texas",35168385.0,483106509.0,20120320000000.0,{{WikiProject Texas}},,205121,right_only
1255523,,Koavf,1.0,"Contributor(id=205121, user_text='Koavf')","Talk:Stratford High School (Stratford, Texas)",35168388.0,483106582.0,20120320000000.0,{{WikiProject Texas}},,205121,right_only
1255524,,Koavf,1.0,"Contributor(id=205121, user_text='Koavf')",Talk:Sulphur Springs High School,35168391.0,483106652.0,20120320000000.0,{{WikiProject Texas}},,205121,right_only
1255525,,Koavf,1.0,"Contributor(id=205121, user_text='Koavf')",Talk:Sunray High School (Texas),35168394.0,483106726.0,20120320000000.0,{{WikiProject Texas}},,205121,right_only


In [21]:
#df_ipblocks[df_ipblocks['username']=="Dostoyevski1881"]

Unnamed: 0,userid,username


In [25]:
# Reduce TIMESTAMP to a calendar date: render the value as text, keep the
# leading eight characters (YYYYMMDD) and parse them.
timestamp_text = df_data['TIMESTAMP'].astype('str')
df_data['TIMESTAMP'] = pd.to_datetime(timestamp_text.str[0:8], format="%Y%m%d")

In [26]:
# Remove the merge bookkeeping and the columns not needed in the corpus.
df_data = df_data.drop(
    ['userid', '_merge', 'CONTRIBUTOR', 'PAGE_ID', 'REVISION_ID', 'TEXT'],
    axis=1,
)
# TODO: check if userid doesn't match

In [30]:
# Inspect the last rows of the frame.
# NOTE(review): this cell carries execution count 30 but sits above the
# rename cell (29); its saved output already shows the renamed columns, so a
# top-to-bottom run will display the original upper-case names here.
df_data.tail()

Unnamed: 0,username,namespace,title,revision_date,text,userid
2804598,Dostoyevski1881,1.0,Talk:Human rights of Kurdish people in Turkey,2010-03-21,+ I think the main purpose of the user that added this article to wikipedia is not conribution.,11396525.0
2804599,Dostoyevski1881,1.0,Talk:Human rights of Kurdish people in Turkey,2010-03-21,+ I think the main purpose of the user that added this article to wikipedia is not contribution.,11396525.0
2804600,Dostoyevski1881,1.0,Talk:Human rights of Kurdish people in Turkey,2010-03-21,"+ I hope article will be rewritten by users that have true and completely objective information about Turkey. + --) 20:02, 21 March 2010 (UTC)",11396525.0
2804601,Dostoyevski1881,1.0,Talk:Human rights of Kurdish people in Turkey,2011-03-03,,11396525.0
2804602,95.10.67.174,1.0,Talk:Human rights of Kurdish people in Turkey,2015-04-24,"+ == language of instruction == + Kurdish is a language of instruction in Turkey after AK Party's reforms. + --[[Special:Contributions/95. + 10. + 67. + 174|95. + 10. + 67. + 174]] ([[User talk:95. + 10. + 67. + 174|talk]]) 01:00, 24 April 2015 (UTC)",


In [29]:
# Switch to lower-case column names; TIMESTAMP becomes revision_date and
# DIFF_TEXT becomes text.
df_data = df_data.rename(
    columns={
        'NAMESPACE': 'namespace',
        'TIMESTAMP': 'revision_date',
        'DIFF_TEXT': 'text',
        'USERID': 'userid',
        'TITLE': 'title',
    }
)

In [31]:
# Put the columns into their final order, then preview the result.
df_data = df_data.loc[
    :, ['userid', 'username', 'revision_date', 'namespace', 'title', 'text']
]
df_data.head()

Unnamed: 0,userid,username,revision_date,namespace,title,text
1255521,205121,Koavf,2012-03-21,1.0,"Talk:Stockdale High School (Stockdale, Texas)",
1255522,205121,Koavf,2012-03-21,1.0,"Talk:Stonebridge, Texas",
1255523,205121,Koavf,2012-03-21,1.0,"Talk:Stratford High School (Stratford, Texas)",
1255524,205121,Koavf,2012-03-21,1.0,Talk:Sulphur Springs High School,
1255525,205121,Koavf,2012-03-21,1.0,Talk:Sunray High School (Texas),


In [32]:
# Order revisions by user, and by date within each user (ascending).
sort_keys = ['username', 'revision_date']
df_data = df_data.sort_values(sort_keys)
df_data.head(20)

Unnamed: 0,userid,username,revision_date,namespace,title,text
2753557,126234,!!,2007-10-11,1.0,Talk:2007 ICC World Twenty20,"+ :See . + -- 11:56, 11 October 2007 (UTC)"
2753558,126234,!!,2007-10-21,3.0,User talk:Chaser,"+ :Thanks for implementing the update, which is the most important task. + There are various administrative tasks (adding talk page notification templates, archiving, updating the time template) but someone usually manages to clean up if they are left out. + -- 21:30, 21 October 2007 (UTC)"
2088812,3744755,!dea4u,2009-08-22,3.0,User talk:Chintu rohit,"+ <div style=""border-style:solid; border-color:blue; background-color:AliceBlue; border-width:1px; text-align:left; padding:8px;"" class=""plainlinks""> + '''Chintu rohit''', and hopefully this one has made your day better. + Spread the by smiling at someone else, whether it be someone you have had disagreements with in the past or a good friend. + Go on, smile! Cheers, and happy editing! <br /> <small>''Smile at others by adding {{tls|Smile}} to their talk page with a friendly message. + ''</small> + </div><!-- Template:smile -->"
2088811,3744755,!dea4u,2009-09-25,3.0,User talk:Shshshsh,"+ == Good Work - Keep it up == + <div style=""border-style:solid; border-color:blue; background-color:AliceBlue; border-width:1px; text-align:left; padding:8px;"" class=""plainlinks""> + '''Hello Shshshsh''', <small> and hopefully this one has made your day better. + Spread the by smiling at someone else, whether it be someone you have had disagreements with in the past or a good friend. + Go on, smile! Cheers, and happy editing! <br /> <small>''Smile at others by adding {{tls|Smile}} to their talk page with a friendly message. + ''</small> + </div><!-- Template:smile --> + Hi bro. + I found out your great creations & work, & highly appreciate all of them , Good Work. + ( <small></small> 16:22, 25 September 2009 (UTC))."
2088810,3744755,!dea4u,2012-05-17,3.0,User talk:Boseritwik,"+ ==== + A tag has been placed on requesting that it be speedily deleted from Wikipedia. + This has been done under infringement. + For legal reasons, we cannot accept copyrighted images or text borrowed from other web sites or printed material, and as a consequence, your addition will most likely be deleted. + Wikipedia takes copyright violations very seriously and persistent violators '''will be '''. + If you think that the page was nominated in error, contest the nomination by clicking on the button labelled ""Click here to contest this speedy deletion"" in the speedy deletion tag. + Doing so will take you to the talk page where you can explain why you believe the page should not be deleted. + You can also visit '''''' to give your reasons, but be aware that once a page is tagged for speedy deletion, it may be removed without delay. + Please do not remove the speedy deletion tag yourself, but do not hesitate to add information that is consistent with . + <!-- Template:Db-dbimgcopyvio-notice --> <!-- Template:Db-csd-notice-custom --> <small></small> 19:38, 17 May 2012 (UTC)"
2088807,3744755,!dea4u,2012-10-03,3.0,User talk:Theobeseplatoon,"+ == October 2012 == + . + I wanted to let you know that I undid one or more of because it didn't appear constructive. + If you think I made a mistake, or if you have any questions, you can leave me a message on . + <!-- Template:uw-vandalism1 --><!-- Template:uw-cluebotwarning1 --> <small></small> 19:11, 3 October 2012 (UTC)"
2088809,3744755,!dea4u,2014-11-07,3.0,User talk:195.77.232.186,"+ ==Recent edit to == + Thank you for your contribution to Wikipedia. + I noticed that you have posted content to the article in a language other than English. + When on the English-language Wikipedia, please always use English. + Thank you! <small></small> 10:13, 7 November 2014 (UTC)"
2088808,3744755,!dea4u,2014-11-13,3.0,User talk:193.235.35.30,"+ == November 2014 == + . + I wanted to let you know that I undid one or more of [[Special:Contributions/193. + 235. + 35. + 30|your recent contributions]]&nbsp;to because it did not appear constructive. + If you would like to experiment, please use the . + If you think I made a mistake, or if you have any questions, you can leave me a message on . + <!-- Template:uw-vandalism1 --> <small></small> 10:32, 13 November 2014 (UTC) + :''If this is a for yourself so you can avoid further irrelevant notices. + ''<!-- Template:Shared IP advice -->"
2319041,16826496,!nteresting,2012-05-15,3.0,User talk:!nteresting,
2724670,1848830,!paradigm!,2007-08-29,3.0,User talk:66.99.52.230,"+ <div style=""padding:5px; border:1px solid #c0c090; background-color:#FEC;"" class=""user-block""> }}. + Once the block has expired, you are welcome to make constructive contributions. + If you believe this block is unjustified, you may by adding the text <!-- Copy the text as it appears on your page, not as it appears in this edit area. + Do not include the ""nowiki"" tags. + --><nowiki>{{</nowiki>unblock|''your reason here''<nowiki>}}</nowiki><!-- Do not include the ""nowiki"" tags. + --> below. + {{{{{subst|}}}#if:{{{sig|}}}| 18:57, 29 August 2007 (UTC) <br clear=""both"">"


In [65]:
# One row per distinct username, in order of first appearance in df_data.
df_user = pd.DataFrame({'username': df_data['username'].unique()})
df_user.head()

Unnamed: 0,username
0,!!
1,!dea4u
2,!nteresting
3,!paradigm!
4,$trassenjunge


In [66]:
# Draw a reproducible random sample of 30000 usernames (fixed seed).
sampled_users = df_user['username'].sample(n=30000, random_state=1)
userlist = np.array(sampled_users)

In [67]:
# Display the sampled usernames.
userlist

array(['GlobalTruthiness', '122.106.204.247', '69.205.53.187', ...,
       '75.179.153.110', '24.97.81.253', 'Dragonfiend'], dtype=object)

In [68]:
# Keep only the revisions written by one of the sampled users.
in_sample = df_data['username'].isin(userlist)
df_data2 = df_data[in_sample]
df_data2.shape

(411596, 6)

In [69]:
# Count the distinct usernames left after filtering.
df_data2['username'].nunique()

30000

In [70]:
# Preview the first ten rows of the sampled corpus.
df_data2.head(10)

Unnamed: 0,userid,username,revision_date,namespace,title,text
2088812,3744755,!dea4u,2009-08-22,3.0,User talk:Chintu rohit,"+ <div style=""border-style:solid; border-color:blue; background-color:AliceBlue; border-width:1px; text-align:left; padding:8px;"" class=""plainlinks""> + '''Chintu rohit''', and hopefully this one has made your day better. + Spread the by smiling at someone else, whether it be someone you have had disagreements with in the past or a good friend. + Go on, smile! Cheers, and happy editing! <br /> <small>''Smile at others by adding {{tls|Smile}} to their talk page with a friendly message. + ''</small> + </div><!-- Template:smile -->"
2088811,3744755,!dea4u,2009-09-25,3.0,User talk:Shshshsh,"+ == Good Work - Keep it up == + <div style=""border-style:solid; border-color:blue; background-color:AliceBlue; border-width:1px; text-align:left; padding:8px;"" class=""plainlinks""> + '''Hello Shshshsh''', <small> and hopefully this one has made your day better. + Spread the by smiling at someone else, whether it be someone you have had disagreements with in the past or a good friend. + Go on, smile! Cheers, and happy editing! <br /> <small>''Smile at others by adding {{tls|Smile}} to their talk page with a friendly message. + ''</small> + </div><!-- Template:smile --> + Hi bro. + I found out your great creations & work, & highly appreciate all of them , Good Work. + ( <small></small> 16:22, 25 September 2009 (UTC))."
2088810,3744755,!dea4u,2012-05-17,3.0,User talk:Boseritwik,"+ ==== + A tag has been placed on requesting that it be speedily deleted from Wikipedia. + This has been done under infringement. + For legal reasons, we cannot accept copyrighted images or text borrowed from other web sites or printed material, and as a consequence, your addition will most likely be deleted. + Wikipedia takes copyright violations very seriously and persistent violators '''will be '''. + If you think that the page was nominated in error, contest the nomination by clicking on the button labelled ""Click here to contest this speedy deletion"" in the speedy deletion tag. + Doing so will take you to the talk page where you can explain why you believe the page should not be deleted. + You can also visit '''''' to give your reasons, but be aware that once a page is tagged for speedy deletion, it may be removed without delay. + Please do not remove the speedy deletion tag yourself, but do not hesitate to add information that is consistent with . + <!-- Template:Db-dbimgcopyvio-notice --> <!-- Template:Db-csd-notice-custom --> <small></small> 19:38, 17 May 2012 (UTC)"
2088807,3744755,!dea4u,2012-10-03,3.0,User talk:Theobeseplatoon,"+ == October 2012 == + . + I wanted to let you know that I undid one or more of because it didn't appear constructive. + If you think I made a mistake, or if you have any questions, you can leave me a message on . + <!-- Template:uw-vandalism1 --><!-- Template:uw-cluebotwarning1 --> <small></small> 19:11, 3 October 2012 (UTC)"
2088809,3744755,!dea4u,2014-11-07,3.0,User talk:195.77.232.186,"+ ==Recent edit to == + Thank you for your contribution to Wikipedia. + I noticed that you have posted content to the article in a language other than English. + When on the English-language Wikipedia, please always use English. + Thank you! <small></small> 10:13, 7 November 2014 (UTC)"
2088808,3744755,!dea4u,2014-11-13,3.0,User talk:193.235.35.30,"+ == November 2014 == + . + I wanted to let you know that I undid one or more of [[Special:Contributions/193. + 235. + 35. + 30|your recent contributions]]&nbsp;to because it did not appear constructive. + If you would like to experiment, please use the . + If you think I made a mistake, or if you have any questions, you can leave me a message on . + <!-- Template:uw-vandalism1 --> <small></small> 10:32, 13 November 2014 (UTC) + :''If this is a for yourself so you can avoid further irrelevant notices. + ''<!-- Template:Shared IP advice -->"
2666931,2836063,$trassenjunge,2006-11-23,3.0,User talk:$trassenjunge,
2641224,27644245,(1)AnotherNewAccount,2016-04-11,1.0,Talk:Ostend–Bruges International Airport,+ {{WPAVIATION|Airports-project=yes|class=C + | b1 <!--Referencing & citations--> = no + | b2 <!--Coverage & accuracy --> = no + | b3 <!--Structure --> = yes + | b4 <!--Grammar & style --> = yes + | b5 <!--Supporting materials --> = yes + }} + {{WPBelgium|class=Start|importance=Low}}
2641225,27644245,(1)AnotherNewAccount,2016-04-11,1.0,Talk:Ostend–Bruges International Airport,+ {{sourcecheck|checked=true}}
2298120,15033274,(CA)Giacobbe,2012-05-11,1.0,Talk:Wide Awake (song),


In [71]:
# Dump the final dataframe to a tab-separated, UTF-8 encoded text file.
# `header` lists the columns to write, in order; passing it as `columns`
# makes the list govern the output (it was previously defined but unused) and
# raises if an expected column is missing. The row index is not written.
header = ['userid', 'username', 'revision_date', 'namespace', 'title', 'text']
df_data2.to_csv(
    '/home/ec2-user/SageMaker/bucket/wiki_trust/xml_dump_processed/nonblockcorpus1.txt',
    sep='\t',
    encoding='utf-8',
    columns=header,
    header=True,
    index=False,
)