### Create train corpus for prediction  (block + nonblocked users)

* Separately for blocked and non-blocked users 
* Combine abuse score  + ORES score data
* Aggregate daily activity data for blocked users 
* Aggregate daily activity data for nonblocked users
* Combine activity data with abuse score and ORES data

In [1]:
# import necessary packages
import os
import pandas as pd
import numpy as np
import re

# set options
# Show full cell contents when frames are printed. `None` means "no truncation";
# the legacy `-1` sentinel was deprecated in pandas 1.0 and is no longer accepted
# by newer releases. (The earlier assignment of 50 was overridden on the very next
# line, so it has been dropped.)
pd.set_option('display.max_colwidth', None)
# Silence the chained-assignment warning for the in-place edits made on filtered frames below.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# load abuse score file and number each user's revisions (sequence 0 = newest rev_id); used below to find each user's oldest rev_id
df_abuse = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/modeling/detection_user_level_pred_03_02.txt',sep = '\t')
df_abuse.drop(columns = ['Unnamed: 0', 'index', 'char_changes', 'revision_date',
       'text', 'bl', 'occurance', 'bl_date', 'doi', 'valid_dt',
       'clen', 'numb', 'caps', 'caps_ncaps', 'wordlen', 'schar',
       'unique_wlen_percent', 'clen_wlen', 'neg', 'neu', 'compound'],inplace = True)
df_abuse = df_abuse.sort_values(by=['username','rev_id'])
df_abuse['sequence'] = df_abuse.groupby('username').cumcount(ascending=False)

In [3]:
df_abuse.head(10)

Unnamed: 0,username,rev_id,abuse_score,sequence
6,! Bikkit !,677901471,0.110988,6
5,! Bikkit !,678070719,0.299359,5
4,! Bikkit !,680954373,0.129014,4
3,! Bikkit !,681751098,0.086982,3
2,! Bikkit !,695177648,0.150583,2
1,! Bikkit !,695241872,0.40134,1
0,! Bikkit !,695331950,0.18658,0
19,!MNc99,239593352,0.536049,12
18,!MNc99,248068700,0.341826,11
17,!MNc99,257252965,0.373314,10


In [4]:
# `sequence` is a per-user countdown over rev_id-sorted rows (0 = largest rev_id),
# so idxmax() picks, for every user, the row with the smallest rev_id.
df_minrevid = df_abuse.loc[df_abuse.groupby(["username"])["sequence"].idxmax()]  
# keep only the identifying columns; the score and the sequence are not needed here
df_minrevid.drop(columns = ['abuse_score','sequence'],inplace = True)
df_minrevid.head()

# save file as .csv
# (written tab-separated, with a header row and without the index)
df_minrevid.to_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/userminrev.txt', sep = '\t',encoding='utf-8',header = True,index=False)

In [5]:
df_minrevid.shape
# 21k users approx in both train and test sets

(21418, 2)

#### ORES + Abuse

In [6]:
# Combine all ores files
ores_dir = "/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/csvs_stored/Ores/Data/"
# os.listdir() returns entries in arbitrary order; sorting makes the load order
# (and therefore the row order of df_ores) reproducible between runs.
file_list = sorted(x for x in os.listdir(ores_dir) if x.endswith(".csv"))

df_list = []
for file in file_list:
    print(file)
    df_list.append(pd.read_csv(os.path.join(ores_dir, file)))

# stack the per-file frames and discard the index column that was saved into each csv
df_ores = pd.concat(df_list)
df_ores.drop(columns = ['Unnamed: 0'],inplace = True)
df_ores.shape

ores_rev_1.csv
ores_rev_2.csv
ores_rev_3.csv
ores_rev_4.csv
ores_rev_5.csv
ores_rev_6.csv
ores_rev_7.csv


(319512, 4)

In [7]:
df_abuse.head()

Unnamed: 0,username,rev_id,abuse_score,sequence
6,! Bikkit !,677901471,0.110988,6
5,! Bikkit !,678070719,0.299359,5
4,! Bikkit !,680954373,0.129014,4
3,! Bikkit !,681751098,0.086982,3
2,! Bikkit !,695177648,0.150583,2


In [8]:
df_ores.head()

Unnamed: 0,username,rev_id,damaging,goodFaith
0,$targlass,821350201,0.030022,0.983378
1,'lllllllllllllllllllllllllllllll',831073982,0.025048,0.981712
2,'lllllllllllllllllllllllllllllll',831059743,0.075616,0.971056
3,'lllllllllllllllllllllllllllllll',831059095,0.09447,0.956036
4,(127.0.0.1),842665329,0.024231,0.983864


In [9]:
# merge ores scores with abuse data
# this data is unique at revision id level
df_abuse_ores = pd.merge(df_abuse,df_ores,how="left",on=["rev_id"])
df_abuse_ores.drop(columns = ['username_y'],inplace = True)
df_abuse_ores.shape

(88988, 6)

In [10]:
df_abuse_ores.columns = ['username','rev_id','abuse_score','sequence','damage_score','goodfaith_score']
df_abuse_ores.head(10)

Unnamed: 0,username,rev_id,abuse_score,sequence,damage_score,goodfaith_score
0,! Bikkit !,677901471,0.110988,6,0.01155,0.986303
1,! Bikkit !,678070719,0.299359,5,0.010483,0.989152
2,! Bikkit !,680954373,0.129014,4,0.009104,0.991462
3,! Bikkit !,681751098,0.086982,3,0.012416,0.991418
4,! Bikkit !,695177648,0.150583,2,0.009583,0.992495
5,! Bikkit !,695241872,0.40134,1,0.007731,0.994983
6,! Bikkit !,695331950,0.18658,0,0.00772,0.995743
7,!MNc99,239593352,0.536049,12,0.005802,0.996102
8,!MNc99,248068700,0.341826,11,0.038819,0.985934
9,!MNc99,257252965,0.373314,10,0.016718,0.993159


In [11]:
# removing each user's oldest scored revision (min rev_id, i.e. the highest sequence value)
minrevlist = df_minrevid['rev_id']
df_abuse_ores_excl = df_abuse_ores[~df_abuse_ores['rev_id'].isin(minrevlist)]
df_abuse_ores_excl.head(10)

Unnamed: 0,username,rev_id,abuse_score,sequence,damage_score,goodfaith_score
1,! Bikkit !,678070719,0.299359,5,0.010483,0.989152
2,! Bikkit !,680954373,0.129014,4,0.009104,0.991462
3,! Bikkit !,681751098,0.086982,3,0.012416,0.991418
4,! Bikkit !,695177648,0.150583,2,0.009583,0.992495
5,! Bikkit !,695241872,0.40134,1,0.007731,0.994983
6,! Bikkit !,695331950,0.18658,0,0.00772,0.995743
8,!MNc99,248068700,0.341826,11,0.038819,0.985934
9,!MNc99,257252965,0.373314,10,0.016718,0.993159
10,!MNc99,324457364,0.085382,9,0.009339,0.993489
11,!MNc99,399974597,0.341117,8,0.00755,0.994893


In [12]:
df_abuse_ores = df_abuse_ores_excl.drop(columns = ['rev_id']) # changed
df_abuse_ores.shape

(67570, 5)

In [13]:
# remove users who are not there in test # to remove one time users
df_test = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/test_block.txt',sep = '\t')
usertest = df_test['username']
#usertest.shape

#df_abuse_ores = df_abuse_ores[df_abuse_ores['username'].isin(usertest)]
#df_abuse_ores.shape


In [14]:
usertest.shape

(4814,)

In [15]:
# data unique at user level
df_abuse_ores = df_abuse_ores.pivot(index='username', columns='sequence').swaplevel(0,1,axis=1)
df_abuse_ores.reset_index(inplace = True)
df_abuse_ores.columns = [f'{j}_{i}' for i, j in df_abuse_ores.columns]
df_abuse_ores.rename(columns={'_username': 'username'}, inplace=True)
df_abuse_ores.head()

Unnamed: 0,username,abuse_score_0,abuse_score_1,abuse_score_2,abuse_score_3,abuse_score_4,abuse_score_5,abuse_score_6,abuse_score_7,abuse_score_8,...,goodfaith_score_4,goodfaith_score_5,goodfaith_score_6,goodfaith_score_7,goodfaith_score_8,goodfaith_score_9,goodfaith_score_10,goodfaith_score_11,goodfaith_score_12,goodfaith_score_13
0,! Bikkit !,0.18658,0.40134,0.150583,0.086982,0.129014,0.299359,,,,...,0.991462,0.989152,,,,,,,,
1,!MNc99,0.082794,0.646661,0.403291,0.582592,0.060325,0.109119,0.063335,0.113142,0.341117,...,0.996017,0.978151,0.996319,0.994754,0.994893,0.993489,0.993159,0.985934,,
2,!dea4u,0.333146,0.346425,0.235225,0.1265,0.416171,,,,,...,0.997723,,,,,,,,,
3,!rehtom,0.286731,,,,,,,,,...,,,,,,,,,,
4,$200inaire,0.237923,0.272547,,,,,,,,...,,,,,,,,,,


In [16]:
df_abuse_ores.shape # matches number of users in abuse df

(11223, 43)

In [17]:
df_abuse_ores.columns

Index(['username', 'abuse_score_0', 'abuse_score_1', 'abuse_score_2',
       'abuse_score_3', 'abuse_score_4', 'abuse_score_5', 'abuse_score_6',
       'abuse_score_7', 'abuse_score_8', 'abuse_score_9', 'abuse_score_10',
       'abuse_score_11', 'abuse_score_12', 'abuse_score_13', 'damage_score_0',
       'damage_score_1', 'damage_score_2', 'damage_score_3', 'damage_score_4',
       'damage_score_5', 'damage_score_6', 'damage_score_7', 'damage_score_8',
       'damage_score_9', 'damage_score_10', 'damage_score_11',
       'damage_score_12', 'damage_score_13', 'goodfaith_score_0',
       'goodfaith_score_1', 'goodfaith_score_2', 'goodfaith_score_3',
       'goodfaith_score_4', 'goodfaith_score_5', 'goodfaith_score_6',
       'goodfaith_score_7', 'goodfaith_score_8', 'goodfaith_score_9',
       'goodfaith_score_10', 'goodfaith_score_11', 'goodfaith_score_12',
       'goodfaith_score_13'],
      dtype='object')

In [18]:
# Shift every score column from a 0-based to a 1-based suffix
# (abuse_score_0 -> abuse_score_1, ..., goodfaith_score_13 -> goodfaith_score_14).
# The new names are derived from the existing ones instead of a hand-written list,
# so the cell keeps working if the number of revisions per user changes.
def _one_based(col):
    """Return `col` with its trailing integer suffix incremented by one."""
    stem, _, idx = col.rpartition('_')
    return f'{stem}_{int(idx) + 1}'

# Guard on the 0-suffixed column so that re-running this cell does not shift the names twice.
if 'abuse_score_0' in df_abuse_ores.columns:
    df_abuse_ores.columns = [c if c == 'username' else _one_based(c)
                             for c in df_abuse_ores.columns]

In [19]:
df_abuse_ores.head()

Unnamed: 0,username,abuse_score_1,abuse_score_2,abuse_score_3,abuse_score_4,abuse_score_5,abuse_score_6,abuse_score_7,abuse_score_8,abuse_score_9,...,goodfaith_score_5,goodfaith_score_6,goodfaith_score_7,goodfaith_score_8,goodfaith_score_9,goodfaith_score_10,goodfaith_score_11,goodfaith_score_12,goodfaith_score_13,goodfaith_score_14
0,! Bikkit !,0.18658,0.40134,0.150583,0.086982,0.129014,0.299359,,,,...,0.991462,0.989152,,,,,,,,
1,!MNc99,0.082794,0.646661,0.403291,0.582592,0.060325,0.109119,0.063335,0.113142,0.341117,...,0.996017,0.978151,0.996319,0.994754,0.994893,0.993489,0.993159,0.985934,,
2,!dea4u,0.333146,0.346425,0.235225,0.1265,0.416171,,,,,...,0.997723,,,,,,,,,
3,!rehtom,0.286731,,,,,,,,,...,,,,,,,,,,
4,$200inaire,0.237923,0.272547,,,,,,,,...,,,,,,,,,,


In [20]:
#df_abuse[df_abuse['username']=='!dea4u']
# successfully excluded the oldest revid

#### Aggregate activity data for blocked users

In [21]:
# read in blocked userlist
ipblocks = pd.read_csv("/home/ec2-user/SageMaker/bucket/wiki_trust/ipblocks_fulldump_20190223.txt", sep = "\t")
ipblocks.dropna(subset=['ipb_address'],inplace=True)

# limiting to users blocked on or after 2017-01-15 (no upper date bound is applied)
ipblocks_df = ipblocks[(ipblocks['date'] >= 20170115)]


In [22]:
df_user_maxrev_bl = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/usermaxrev.txt',sep = '\t')

bllist = ipblocks_df['ipb_address']
# Flag users that appear in the block list (1 = blocked, 0 = not blocked).
# Done as one vectorised assignment rather than `df['bl'][mask] = 1`: that chained
# form writes through an intermediate object and is not guaranteed to update the
# frame (it does nothing under pandas copy-on-write).
df_user_maxrev_bl['bl'] = df_user_maxrev_bl['username'].isin(bllist).astype('int64')
df_user_maxrev_bl.head()

Unnamed: 0,username,rev_id,bl
0,! Bikkit !,695331950,0
1,!MNc99,819463253,0
2,!dea4u,877990295,0
3,!kjjk1992jk!,805028818,0
4,!rehtom,805941877,0


In [23]:
df_user_maxrev_bl.bl.value_counts()

0    14780
1    6638 
Name: bl, dtype: int64

In [24]:
# list of block and nb users
userlist_blk = df_user_maxrev_bl['username'][df_user_maxrev_bl['bl']==1]
userlist_nonblk = df_user_maxrev_bl['username'][df_user_maxrev_bl['bl']==0]

In [25]:
# corpus for revision activity
big_df = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/nontext_block.txt', sep = '\t')
big_df.shape

(3019710, 11)

In [26]:
# only keep those blocked users that are in the webscraped list
big_df = big_df[big_df['rev_user_text'].isin(userlist_blk)]

In [27]:
# get list of registered users
reguserlist = big_df['rev_user_text'][big_df['rev_user']!=0.0]

In [29]:
len(big_df.rev_user_text.unique()) # 6445

6445

In [30]:
# some formatting
def col_format(dataframe):
    """Reduce a raw revision frame to seven renamed columns, editing it in place.

    Adds a ``revision_date`` column derived from ``rev_timestamp``, drops the
    columns that are not used downstream, and renames the remaining columns
    positionally. The same (mutated) frame object is returned.
    """
    # Dividing by 1e6 and truncating to int64 removes the last six digits of the
    # timestamp (presumably the hhmmss part of a YYYYMMDDhhmmss value - confirm).
    scaled = dataframe['rev_timestamp'] / 1000000
    dataframe['revision_date'] = scaled.astype(np.int64)

    unused = ['rev_page', 'rev_comment_id', 'rev_parent_id', 'rev_timestamp', 'blocked']
    dataframe.drop(columns=unused, inplace=True)

    # Positional rename: relies on the surviving columns already being in this order.
    new_names = ['rev_id', 'userid', 'username', 'rev_minoredit',
                 'rev_deleted', 'rev_len', 'rev_date']
    dataframe.columns = new_names

    return dataframe
    
df_block1 = col_format(big_df)
df_block1.head()

Unnamed: 0,rev_id,userid,username,rev_minoredit,rev_deleted,rev_len,rev_date
4,818010323,32748957.0,CookieMonster756,0,0,3829.0,20180101
20,818011163,18251141.0,Aethelwolf Emsworth,0,0,186631.0,20180101
38,818011634,18251141.0,Aethelwolf Emsworth,0,0,186539.0,20180101
46,818011765,18251141.0,Aethelwolf Emsworth,1,0,186539.0,20180101
47,818011766,0.0,67.188.179.66,0,0,15412.0,20180101


In [31]:
# adding block date to dataframe and formatting date
def addblockdate(dataframe, blocks=None):
    """Attach each user's block date and keep only revisions made on or before it.

    Parameters
    ----------
    dataframe : DataFrame
        Revision rows with at least ``username`` and ``rev_date`` (YYYYMMDD).
    blocks : DataFrame, optional
        Block table with ``ipb_address`` and ``date`` (YYYYMMDD) columns.
        Defaults to the notebook-level ``ipblocks`` frame, which is what this
        function always used before the parameter existed.
        NOTE(review): that is the unfiltered table, not the date-limited
        ``ipblocks_df`` - confirm this is intended.

    Returns
    -------
    DataFrame
        The merged rows with ``date`` and ``rev_date`` converted to datetimes
        and a ``diff_days`` column (block date minus revision date, in days),
        restricted to ``diff_days >= 0``. Rows without a matching block have
        no ``diff_days`` value and are therefore dropped by the same filter.
    """
    if blocks is None:
        blocks = ipblocks

    merged = pd.merge(dataframe, blocks, how="left",
                      left_on=["username"], right_on=["ipb_address"])

    # difference between revision and block date (in days)
    merged['date'] = pd.to_datetime(merged['date'], format="%Y%m%d")
    merged['rev_date'] = pd.to_datetime(merged['rev_date'], format="%Y%m%d")
    merged['diff_days'] = (merged['date'] - merged['rev_date']).dt.days

    # only data before they were blocked
    return merged[merged['diff_days'] >= 0]

df_block2 = addblockdate(df_block1)
df_block2.shape

(743187, 10)

In [32]:
# subset to revisions made 0-14 days before the block date
df_block2 = df_block2.loc[(df_block2['diff_days']>=0) & (df_block2['diff_days']<=14)
                          ,['rev_id','userid','username','rev_minoredit','rev_deleted',
                                                     'rev_len','rev_date','diff_days']]

# delete column userid
df_block2.drop(columns = 'userid',inplace=True)
df_block2.shape

(177212, 7)

In [33]:
len(df_block2.username.unique())

6218

In [34]:
df_block2.head()

Unnamed: 0,rev_id,username,rev_minoredit,rev_deleted,rev_len,rev_date,diff_days
5,818011806,Damienthorne524,0,0,10499.0,2018-01-01,14
25,818014864,Freedom Fighter Jason Lin,0,0,19520.0,2018-01-01,0
26,818014937,Freedom Fighter Jason Lin,0,0,4269.0,2018-01-01,0
33,818015567,Freedom Fighter Jason Lin,0,0,4243.0,2018-01-01,0
34,818015583,Freedom Fighter Jason Lin,0,0,19520.0,2018-01-01,0


In [35]:
len(df_block2.username.unique()) # there are users with maxrevid beyond 2 week window....
# so in our analysis we are only considering users who have made a rev text within 2 weeks of getting blocked effectively.

6218

In [36]:
# exclude revisions made after each user's max revid within that 2 week period

df_block2_1 = pd.merge(df_block2,df_user_maxrev_bl,how = 'left',on = 'username')
df_block2_1['revcount'] = df_block2_1['rev_id_y'] - df_block2_1['rev_id_x']
df_block2_1.head()

Unnamed: 0,rev_id_x,username,rev_minoredit,rev_deleted,rev_len,rev_date,diff_days,rev_id_y,bl,revcount
0,818011806,Damienthorne524,0,0,10499.0,2018-01-01,14,820500818,1,2489012
1,818014864,Freedom Fighter Jason Lin,0,0,19520.0,2018-01-01,0,818015583,1,719
2,818014937,Freedom Fighter Jason Lin,0,0,4269.0,2018-01-01,0,818015583,1,646
3,818015567,Freedom Fighter Jason Lin,0,0,4243.0,2018-01-01,0,818015583,1,16
4,818015583,Freedom Fighter Jason Lin,0,0,19520.0,2018-01-01,0,818015583,1,0


In [37]:
df_block2_2 = df_block2_1[df_block2_1['revcount'] >= 0]
df_block2_2.head(20)
#df_block2_2[df_block2_2['username']=='Freedom Fighter Jason Lin']
# filter rows < 0 (as we only want activity up to the last edit made)
# for every user, remove rev ids after maxrevid

Unnamed: 0,rev_id_x,username,rev_minoredit,rev_deleted,rev_len,rev_date,diff_days,rev_id_y,bl,revcount
0,818011806,Damienthorne524,0,0,10499.0,2018-01-01,14,820500818,1,2489012
1,818014864,Freedom Fighter Jason Lin,0,0,19520.0,2018-01-01,0,818015583,1,719
2,818014937,Freedom Fighter Jason Lin,0,0,4269.0,2018-01-01,0,818015583,1,646
3,818015567,Freedom Fighter Jason Lin,0,0,4243.0,2018-01-01,0,818015583,1,16
4,818015583,Freedom Fighter Jason Lin,0,0,19520.0,2018-01-01,0,818015583,1,0
6,818017951,Amy wamey,0,0,26108.0,2018-01-01,5,818507994,1,490043
7,818018485,I Hunger For The Jew,0,0,60299.0,2018-01-01,1,818311376,1,292891
8,818020201,HarrisonSteam,0,0,43258.0,2018-01-01,8,818021117,1,916
9,818020307,HarrisonSteam,0,0,26016.0,2018-01-01,8,818021117,1,810
10,818020789,HarrisonSteam,0,0,26016.0,2018-01-01,8,818021117,1,328


In [38]:
df_block2_2.drop(columns = ['rev_id_y','revcount','bl'],inplace = True)
df_block2_2.columns = ['rev_id','username','rev_minoredit','rev_deleted','rev_len','rev_date','diff_days']

In [39]:
len(df_block2_2['username'].unique())

5941

In [40]:
df_block2_3 = df_block2_2[df_block2_2['username'].isin(usertest)]
len(df_block2_3['username'].unique())

4814

In [41]:
#### Calculating active days for users over 2 week period
def activedays(dataframe):
    """Count, per user, the number of distinct dates on which they made a revision.

    Returns a frame with one row per ``username`` and an ``active_days`` column.
    """
    # one row per (user, date) pair ...
    per_user_day = dataframe.groupby(['username', 'rev_date'], as_index=False).agg({'rev_id': "count"})
    # ... then count those rows per user
    per_user = per_user_day.groupby(['username'], as_index=False).agg({'rev_date': "count"})
    return per_user.rename(columns={"rev_date": "active_days"})

days_active = activedays(df_block2_3)
days_active.head()

Unnamed: 0,username,active_days
0,$targlass,4
1,'lllllllllllllllllllllllllllllll',1
2,(127.0.0.1),3
3,(35)moo,2
4,(TIB1017DTIB1018B),1


In [42]:
#### Group by user and day offset - calculate stats over the 2 week (0-14 day) window

def df_weekly(dataframe, blocked=1, active_days=None):
    """Aggregate revisions per user and day offset.

    Despite the name, the grouping key is ``diff_days`` (a day offset), not a week.

    Parameters
    ----------
    dataframe : DataFrame
        Revision rows with ``username``, ``diff_days``, ``rev_id``,
        ``rev_minoredit``, ``rev_deleted`` and ``rev_len``.
    blocked : int, optional
        Constant written to the ``blocked`` column. Defaults to 1, the value
        this function previously hard-coded.
    active_days : DataFrame, optional
        Frame with ``username`` and ``active_days`` columns. Defaults to the
        notebook-level ``days_active`` frame, as before.

    Returns
    -------
    DataFrame
        Columns ``username, days, rev_count, rev_minorcount, rev_dltcount,
        rev_avglen, 2wkactivedays, blocked``; one row per (user, day offset).
    """
    if active_days is None:
        active_days = days_active

    # String aggregator names instead of the builtin `sum`: pandas warns that
    # builtin callables passed to agg() will stop mapping to the groupby methods.
    per_day = dataframe.groupby(['username', 'diff_days'], as_index=False).agg(
        {'rev_id': "count", 'rev_minoredit': "sum", 'rev_deleted': "sum", 'rev_len': "mean"})

    # adding active days
    per_day = pd.merge(per_day, active_days, how="left", on="username")
    per_day['rev_len'] = per_day['rev_len'].round()

    # rename by name rather than by position, so a change in column order cannot mislabel data
    per_day = per_day.rename(columns={
        'diff_days': 'days',
        'rev_id': 'rev_count',
        'rev_minoredit': 'rev_minorcount',
        'rev_deleted': 'rev_dltcount',
        'rev_len': 'rev_avglen',
        'active_days': '2wkactivedays',
    })
    per_day['blocked'] = blocked

    return per_day
    
df_block3 = df_weekly(df_block2_3)
df_block3.shape

(12130, 8)

In [43]:
df_block3.head()

Unnamed: 0,username,days,rev_count,rev_minorcount,rev_dltcount,rev_avglen,2wkactivedays,blocked
0,$targlass,0,7,0,0,16375.0,4,1
1,$targlass,1,2,0,0,89502.0,4,1
2,$targlass,4,5,0,0,12351.0,4,1
3,$targlass,5,1,0,0,2424.0,4,1
4,'lllllllllllllllllllllllllllllll',0,12,10,3,2013.0,1,1


In [44]:
#big_df[big_df['username']=='!rehtom']

In [45]:
# extract the columns that we don't need in the grouping
df_blockcols = df_block3.loc[:,['username','2wkactivedays','blocked']]
df_blockcols.drop_duplicates(inplace = True)
df_blockcols.head()

Unnamed: 0,username,2wkactivedays,blocked
0,$targlass,4,1
4,'lllllllllllllllllllllllllllllll',1,1
5,(127.0.0.1),3,1
8,(35)moo,2,1
10,(TIB1017DTIB1018B),1,1


In [46]:
#### Pivoting the data

def df_pivot(dataframe, user_cols=None):
    """Reshape the per-(user, day) frame into one wide row per user.

    Parameters
    ----------
    dataframe : DataFrame
        Long frame with ``username``, ``days``, the per-day metric columns,
        and the per-user columns ``2wkactivedays`` and ``blocked``.
    user_cols : DataFrame, optional
        One row per user holding ``username`` plus the per-user columns to
        re-attach after pivoting. Defaults to the notebook-level
        ``df_blockcols`` frame, as before.

    Returns
    -------
    DataFrame
        ``username``, then one ``<metric>_<day>`` column per metric and day
        present in the input, then the columns of ``user_cols``. Missing
        (user, day) combinations are filled with 0.
    """
    if user_cols is None:
        user_cols = df_blockcols

    # the per-user constants would otherwise be pivoted as if they were daily metrics
    per_day = dataframe.drop(columns=['2wkactivedays', 'blocked'])

    wide = per_day.pivot(index='username', columns='days')
    # flatten the (metric, day) column pairs into names such as 'rev_count_0'
    wide.columns = [f'{metric}_{day}' for metric, day in wide.columns]
    wide = wide.reset_index()

    # adding active days,blocked
    wide = pd.merge(wide, user_cols, how="left", on="username")

    return wide.fillna(0)

df_data = df_pivot(df_block3)
df_data.shape

(4814, 63)

In [47]:
#### Normalizing rev counts,minor and deleted

df_data.columns[31:46]

Index(['rev_dltcount_0', 'rev_dltcount_1', 'rev_dltcount_2', 'rev_dltcount_3',
       'rev_dltcount_4', 'rev_dltcount_5', 'rev_dltcount_6', 'rev_dltcount_7',
       'rev_dltcount_8', 'rev_dltcount_9', 'rev_dltcount_10',
       'rev_dltcount_11', 'rev_dltcount_12', 'rev_dltcount_13',
       'rev_dltcount_14'],
      dtype='object')

In [48]:
def varnorm(dataframe, registered_users=None):
    """Normalise the per-day revision counts and collapse minor/deleted counts.

    The frame is modified in place and also returned.

    * every ``rev_count_<day>`` column is divided by the user's total revision count;
    * ``minor_count_norm`` / ``dlt_count_norm`` are the user's total minor /
      deleted revisions divided by that same total, rounded to 4 decimals;
    * the per-day ``rev_minorcount_*`` and ``rev_dltcount_*`` columns are dropped;
    * ``registered`` is 1 when the username is in ``registered_users``, else 0.

    Column groups are located by name prefix rather than by fixed positions,
    so the result stays correct when some day offsets are absent from the data.

    Parameters
    ----------
    dataframe : DataFrame
        Wide per-user frame with ``username`` and the per-day column groups.
    registered_users : list-like, optional
        Usernames considered registered. Defaults to the notebook-level
        ``reguserlist``, as before.
    """
    if registered_users is None:
        registered_users = reguserlist

    # Resolve the column groups up front, before any derived column is added.
    count_cols = [c for c in dataframe.columns if c.startswith('rev_count_')]
    minor_cols = [c for c in dataframe.columns if c.startswith('rev_minorcount_')]
    dlt_cols = [c for c in dataframe.columns if c.startswith('rev_dltcount_')]

    # total revision count
    rev_total = dataframe[count_cols].sum(axis=1)
    for col in count_cols:  # normalize each revcount
        dataframe[col] = dataframe[col] / rev_total

    # minor edit count
    dataframe['minor_count_norm'] = (dataframe[minor_cols].sum(axis=1) / rev_total).round(4)

    # delete edit count
    dataframe['dlt_count_norm'] = (dataframe[dlt_cols].sum(axis=1) / rev_total).round(4)

    # drop the per-day columns that have been folded into the two ratios above
    dataframe.drop(columns=minor_cols + dlt_cols, inplace=True)

    # add reguser column
    dataframe['registered'] = np.where(dataframe['username'].isin(registered_users), 1, 0)

    return dataframe

df_data = varnorm(df_data)
df_data.shape


(4814, 36)

In [49]:
df_data.head()

Unnamed: 0,username,rev_count_0,rev_count_1,rev_count_2,rev_count_3,rev_count_4,rev_count_5,rev_count_6,rev_count_7,rev_count_8,...,rev_avglen_10,rev_avglen_11,rev_avglen_12,rev_avglen_13,rev_avglen_14,2wkactivedays,blocked,minor_count_norm,dlt_count_norm,registered
0,$targlass,0.466667,0.133333,0.0,0.0,0.333333,0.066667,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4,1,0.0,0.0,1
1,'lllllllllllllllllllllllllllllll',1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,1,0.8333,0.25,1
2,(127.0.0.1),0.2,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16241.0,0.0,0.0,0.0,0.0,3,1,0.0667,0.0,1
3,(35)moo,0.428571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429,...,0.0,0.0,0.0,0.0,0.0,2,1,0.0,0.0,1
4,(TIB1017DTIB1018B),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,1,0.0,0.0,1


In [50]:
df_data.iloc[0,:]

username            $targlass
rev_count_0         0.466667 
rev_count_1         0.133333 
rev_count_2         0        
rev_count_3         0        
rev_count_4         0.333333 
rev_count_5         0.0666667
rev_count_6         0        
rev_count_7         0        
rev_count_8         0        
rev_count_9         0        
rev_count_10        0        
rev_count_11        0        
rev_count_12        0        
rev_count_13        0        
rev_count_14        0        
rev_avglen_0        16375    
rev_avglen_1        89502    
rev_avglen_2        0        
rev_avglen_3        0        
rev_avglen_4        12351    
rev_avglen_5        2424     
rev_avglen_6        0        
rev_avglen_7        0        
rev_avglen_8        0        
rev_avglen_9        0        
rev_avglen_10       0        
rev_avglen_11       0        
rev_avglen_12       0        
rev_avglen_13       0        
rev_avglen_14       0        
2wkactivedays       4        
blocked             1        
minor_coun

In [51]:
df_data.columns

Index(['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered'],
      dtype='object')

In [52]:
# save file as .csv
header = ['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered']
df_data.to_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/train_block.txt', sep = '\t',encoding='utf-8',header = True,index=False)

#### Aggregate activity data for nonbl users

In [53]:
big_df = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/nontext_nonblock.txt', sep = '\t')
big_df.shape

(92741984, 11)

In [54]:
# only keep those non-blocked users that are in the webscraped list
big_df = big_df[big_df['rev_user_text'].isin(userlist_nonblk)]
len(big_df['rev_user_text'].unique())

14776

In [55]:
# get list of registered users
reguserlist = big_df['rev_user_text'][big_df['rev_user']!=0.0]

In [56]:
# some formatting
df_block1 = col_format(big_df)
df_block1.head()

Unnamed: 0,rev_id,userid,username,rev_minoredit,rev_deleted,rev_len,rev_date
0,818010128,196446.0,BD2412,1,0,6.0,20180101
2,818010130,738200.0,James Allison,0,0,1727.0,20180101
3,818010131,29088686.0,DatBot,0,0,525.0,20180101
5,818010133,196446.0,BD2412,1,0,26634.0,20180101
13,818010141,196446.0,BD2412,0,0,71.0,20180101


In [57]:
df_block1.shape

(16227313, 7)

In [58]:
# Extracting max date for each user
# ("max" is passed by name: pandas warns that builtin callables such as `max`
# given to agg() will stop being mapped to the groupby method)
no_ipb = df_block1.groupby(['username'],as_index=False).agg({'rev_date':"max"})
# renamed so the merge key does not collide with `username` in the next cell
no_ipb.columns = ['username2','maxdate']

In [59]:
# adding max rev date to dataframe and formatting date
df_block2 = pd.merge(df_block1,no_ipb,how="left",left_on=["username"],right_on=["username2"])
# difference between revision and max rev date (in days/weeks)
df_block2['rev_date'] = pd.to_datetime(df_block2['rev_date'], format = "%Y%m%d")
df_block2['maxdate'] = pd.to_datetime(df_block2['maxdate'], format = "%Y%m%d")
df_block2['diff_days'] = (df_block2['maxdate']-df_block2['rev_date']).dt.days
df_block2.dropna(subset=['username'],inplace=True)
df_block2.shape

(16227313, 10)

In [60]:
# subset to revisions made 0-14 days before each user's latest revision date
df_block2 = df_block2.loc[(df_block2['diff_days']>=0) & (df_block2['diff_days']<=14)
                          ,['rev_id','userid','username','rev_minoredit','rev_deleted',
                                                     'rev_len','rev_date','diff_days']]

# delete column userid
df_block2.drop(columns = 'userid',inplace=True)
df_block2.head()

Unnamed: 0,rev_id,username,rev_minoredit,rev_deleted,rev_len,rev_date,diff_days
1769,818016134,108.52.206.215,0,0,8939.0,2018-01-01,0
2013,818017008,Abce2,0,0,48548.0,2018-01-01,0
2208,818017536,128.73.237.140,0,0,4370.0,2018-01-01,10
2241,818017650,128.73.237.140,0,0,4473.0,2018-01-01,10
5536,818027098,107.77.228.106,0,0,45792.0,2018-01-01,0


In [61]:
# exclude data max revid onwards within that 2 week period
# only keep those users that are there in list
#len(df_block2[df_block2.username.isin(userlist)].username.unique())

df_block2_1 = pd.merge(df_block2,df_user_maxrev_bl,how = 'left',on = 'username')
df_block2_1['revcount'] = df_block2_1['rev_id_y'] - df_block2_1['rev_id_x']

#df_block2_1 = df_block2_1[df_block2_1['revcount']==0]
df_block2_1.head(20)

Unnamed: 0,rev_id_x,username,rev_minoredit,rev_deleted,rev_len,rev_date,diff_days,rev_id_y,bl,revcount
0,818016134,108.52.206.215,0,0,8939.0,2018-01-01,0,797671214,0,-20344920
1,818017008,Abce2,0,0,48548.0,2018-01-01,0,818017008,0,0
2,818017536,128.73.237.140,0,0,4370.0,2018-01-01,10,819331356,0,1313820
3,818017650,128.73.237.140,0,0,4473.0,2018-01-01,10,819331356,0,1313706
4,818027098,107.77.228.106,0,0,45792.0,2018-01-01,0,736419456,0,-81607642
5,818029442,106.213.180.194,0,0,247683.0,2018-01-01,0,818029442,0,0
6,818038341,128.73.237.140,0,0,4636.0,2018-01-01,10,819331356,0,1293015
7,818038541,128.73.237.140,0,0,1652.0,2018-01-01,10,819331356,0,1292815
8,818038560,128.73.237.140,0,0,1652.0,2018-01-01,10,819331356,0,1292796
9,818038609,128.73.237.140,0,0,5191.0,2018-01-01,10,819331356,0,1292747


In [62]:
df_block2_2 = df_block2_1[df_block2_1['revcount'] >= 0] # >= keeps the max revid row itself (revcount == 0) and every revision before it
df_block2_2.head(20)
# drops rows with revcount < 0, i.e. revisions whose rev id is greater than the user's max revid
# for every user, keep only rev ids less than or equal to maxrevid

Unnamed: 0,rev_id_x,username,rev_minoredit,rev_deleted,rev_len,rev_date,diff_days,rev_id_y,bl,revcount
1,818017008,Abce2,0,0,48548.0,2018-01-01,0,818017008,0,0
2,818017536,128.73.237.140,0,0,4370.0,2018-01-01,10,819331356,0,1313820
3,818017650,128.73.237.140,0,0,4473.0,2018-01-01,10,819331356,0,1313706
5,818029442,106.213.180.194,0,0,247683.0,2018-01-01,0,818029442,0,0
6,818038341,128.73.237.140,0,0,4636.0,2018-01-01,10,819331356,0,1293015
7,818038541,128.73.237.140,0,0,1652.0,2018-01-01,10,819331356,0,1292815
8,818038560,128.73.237.140,0,0,1652.0,2018-01-01,10,819331356,0,1292796
9,818038609,128.73.237.140,0,0,5191.0,2018-01-01,10,819331356,0,1292747
10,818038701,128.73.237.140,0,0,1709.0,2018-01-01,10,819331356,0,1292655
11,818038765,128.73.237.140,0,0,1719.0,2018-01-01,10,819331356,0,1292591


In [63]:
df_block2_2.drop(columns = ['rev_id_y','revcount','bl'],inplace = True)
df_block2_2.columns = ['rev_id','username','rev_minoredit','rev_deleted','rev_len','rev_date','diff_days']

In [64]:
len(df_block2_2['username'].unique()) # number of unique users left after the max revid filter

9658

In [65]:
df_test = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/test_nonblock.txt',sep = '\t')
usertest = df_test['username']
usertest.shape

(6466,)

In [66]:
df_block2_3 = df_block2_2[df_block2_2['username'].isin(usertest)]
len(df_block2_3['username'].unique())

6466

In [67]:
df_block2_3.head(10)

Unnamed: 0,rev_id,username,rev_minoredit,rev_deleted,rev_len,rev_date,diff_days
1,818017008,Abce2,0,0,48548.0,2018-01-01,0
2,818017536,128.73.237.140,0,0,4370.0,2018-01-01,10
3,818017650,128.73.237.140,0,0,4473.0,2018-01-01,10
6,818038341,128.73.237.140,0,0,4636.0,2018-01-01,10
7,818038541,128.73.237.140,0,0,1652.0,2018-01-01,10
8,818038560,128.73.237.140,0,0,1652.0,2018-01-01,10
9,818038609,128.73.237.140,0,0,5191.0,2018-01-01,10
10,818038701,128.73.237.140,0,0,1709.0,2018-01-01,10
11,818038765,128.73.237.140,0,0,1719.0,2018-01-01,10
12,818038834,128.73.237.140,0,0,1727.0,2018-01-01,10


In [68]:
# get active days for users
days_active = activedays(df_block2_3)
days_active.head()

Unnamed: 0,username,active_days
0,!dea4u,1
1,!kjjk1992jk!,1
2,!rehtom,6
3,$imbeio$i$,6
4,'DesoHaa,4


In [69]:
df_block3 = df_weekly(df_block2_3)
df_block3.shape

(25380, 8)

In [70]:
# extract the columns that we don't need in the grouping
df_block3['blocked'] = 0
df_blockcols = df_block3.loc[:,['username','2wkactivedays','blocked']]
df_blockcols.drop_duplicates(inplace = True)
df_blockcols.head()

Unnamed: 0,username,2wkactivedays,blocked
0,!dea4u,1,0
1,!kjjk1992jk!,1,0
2,!rehtom,6,0
8,$imbeio$i$,6,0
14,'DesoHaa,4,0


In [71]:
df_data = df_pivot(df_block3)
df_data.shape

(6466, 63)

In [72]:
len(df_data['username'].unique())

6466

In [73]:
df_data.sample(5)

Unnamed: 0,username,rev_count_0,rev_count_1,rev_count_2,rev_count_3,rev_count_4,rev_count_5,rev_count_6,rev_count_7,rev_count_8,...,rev_avglen_7,rev_avglen_8,rev_avglen_9,rev_avglen_10,rev_avglen_11,rev_avglen_12,rev_avglen_13,rev_avglen_14,2wkactivedays,blocked
5475,EPBeatles,21.0,0.0,1.0,0.0,1.0,8.0,7.0,0.0,9.0,...,0.0,10293.0,15319.0,10232.0,9410.0,5661.0,12638.0,4410.0,12,0
3290,113.203.198.24,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
5392,Donald Albury,7.0,21.0,19.0,14.0,10.0,7.0,9.0,3.0,10.0,...,33764.0,44348.0,23148.0,25232.0,16334.0,71485.0,52583.0,42628.0,15,0
2774,110.149.134.29,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
2080,109.125.19.212,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0


In [74]:
df_data.columns

Index(['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_minorcount_0',
       'rev_minorcount_1', 'rev_minorcount_2', 'rev_minorcount_3',
       'rev_minorcount_4', 'rev_minorcount_5', 'rev_minorcount_6',
       'rev_minorcount_7', 'rev_minorcount_8', 'rev_minorcount_9',
       'rev_minorcount_10', 'rev_minorcount_11', 'rev_minorcount_12',
       'rev_minorcount_13', 'rev_minorcount_14', 'rev_dltcount_0',
       'rev_dltcount_1', 'rev_dltcount_2', 'rev_dltcount_3', 'rev_dltcount_4',
       'rev_dltcount_5', 'rev_dltcount_6', 'rev_dltcount_7', 'rev_dltcount_8',
       'rev_dltcount_9', 'rev_dltcount_10', 'rev_dltcount_11',
       'rev_dltcount_12', 'rev_dltcount_13', 'rev_dltcount_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',


In [75]:
df_data = varnorm(df_data)
df_data.shape

(6466, 36)

In [76]:
df_data.columns

Index(['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered'],
      dtype='object')

In [77]:
# save file as .csv
header = ['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered']
df_data.to_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/train_nonblock.txt', sep = '\t',encoding='utf-8',header = True,index=False)

#### Combine bl + nonbl activity corpus with abuse + ores data

In [78]:
# Read the blocked-users activity table from its tab-separated file.
df_block = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/train_block.txt', sep = '\t')
# Bare expression: the notebook displays (rows, columns).
df_block.shape

(4814, 36)

In [79]:
# Read back the non-blocked-users activity table from the tab-separated
# file written in the save cell above (train_nonblock.txt).
df_nonblock = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/train_nonblock.txt', sep = '\t')
# Bare expression: the notebook displays (rows, columns).
df_nonblock.shape

(6466, 36)

In [80]:
# Stack the blocked rows on top of the non-blocked rows into one activity frame.
# NOTE(review): no ignore_index is passed, so each input keeps its own row
# labels and labels repeat across the two halves (the tail preview below
# shows 6461-6465 on an 11280-row frame). Avoid label-based .loc lookups on
# df_act, or reset the index first.
df_act = pd.concat([df_block,df_nonblock])
# Bare expression: the notebook displays (rows, columns).
df_act.shape

(11280, 36)

In [81]:
# Preview the last five rows of the combined activity frame.
df_act.tail()

Unnamed: 0,username,rev_count_0,rev_count_1,rev_count_2,rev_count_3,rev_count_4,rev_count_5,rev_count_6,rev_count_7,rev_count_8,...,rev_avglen_10,rev_avglen_11,rev_avglen_12,rev_avglen_13,rev_avglen_14,2wkactivedays,blocked,minor_count_norm,dlt_count_norm,registered
6461,Jonie148,0.166667,0.0,0.166667,0.166667,0.0,0.0,0.166667,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,6,0,0.1667,0.0,1
6462,Jonny2x4,0.555556,0.0,0.111111,0.0,0.0,0.0,0.222222,0.0,0.0,...,0.0,0.0,0.0,0.0,29990.0,4,0,0.4444,0.0,1
6463,Jonpatterns,0.044944,0.235955,0.044944,0.011236,0.0,0.011236,0.0,0.123596,0.078652,...,63859.0,50217.0,28320.0,0.0,195242.0,12,0,0.0,0.0,1
6464,Jontel,0.041237,0.072165,0.082474,0.072165,0.113402,0.041237,0.103093,0.051546,0.195876,...,58250.0,40141.0,34135.0,9726.0,47908.0,15,0,0.1959,0.0,1
6465,Jonytano20,0.05,0.05,0.0,0.05,0.25,0.05,0.1,0.25,0.1,...,0.0,20805.0,0.0,0.0,20731.0,10,0,0.0,0.0,1


In [82]:
# Count how many rows carry each value of the `blocked` column.
df_act['blocked'].value_counts()

0    6466
1    4814
Name: blocked, dtype: int64

In [83]:
# Preview the first five rows of df_abuse_ores (built outside this section).
df_abuse_ores.head()

Unnamed: 0,username,abuse_score_1,abuse_score_2,abuse_score_3,abuse_score_4,abuse_score_5,abuse_score_6,abuse_score_7,abuse_score_8,abuse_score_9,...,goodfaith_score_5,goodfaith_score_6,goodfaith_score_7,goodfaith_score_8,goodfaith_score_9,goodfaith_score_10,goodfaith_score_11,goodfaith_score_12,goodfaith_score_13,goodfaith_score_14
0,! Bikkit !,0.18658,0.40134,0.150583,0.086982,0.129014,0.299359,,,,...,0.991462,0.989152,,,,,,,,
1,!MNc99,0.082794,0.646661,0.403291,0.582592,0.060325,0.109119,0.063335,0.113142,0.341117,...,0.996017,0.978151,0.996319,0.994754,0.994893,0.993489,0.993159,0.985934,,
2,!dea4u,0.333146,0.346425,0.235225,0.1265,0.416171,,,,,...,0.997723,,,,,,,,,
3,!rehtom,0.286731,,,,,,,,,...,,,,,,,,,,
4,$200inaire,0.237923,0.272547,,,,,,,,...,,,,,,,,,,


In [84]:
# Display the column names of df_abuse_ores ahead of the merge on username.
df_abuse_ores.columns

Index(['username', 'abuse_score_1', 'abuse_score_2', 'abuse_score_3',
       'abuse_score_4', 'abuse_score_5', 'abuse_score_6', 'abuse_score_7',
       'abuse_score_8', 'abuse_score_9', 'abuse_score_10', 'abuse_score_11',
       'abuse_score_12', 'abuse_score_13', 'abuse_score_14', 'damage_score_1',
       'damage_score_2', 'damage_score_3', 'damage_score_4', 'damage_score_5',
       'damage_score_6', 'damage_score_7', 'damage_score_8', 'damage_score_9',
       'damage_score_10', 'damage_score_11', 'damage_score_12',
       'damage_score_13', 'damage_score_14', 'goodfaith_score_1',
       'goodfaith_score_2', 'goodfaith_score_3', 'goodfaith_score_4',
       'goodfaith_score_5', 'goodfaith_score_6', 'goodfaith_score_7',
       'goodfaith_score_8', 'goodfaith_score_9', 'goodfaith_score_10',
       'goodfaith_score_11', 'goodfaith_score_12', 'goodfaith_score_13',
       'goodfaith_score_14'],
      dtype='object')

In [85]:
# Inner-join the activity rows with the abuse/ORES score rows on `username`;
# only usernames present in both frames are kept.
df_abuse_ores_act = df_act.merge(df_abuse_ores, on=["username"], how="inner")
# Bare expression: the notebook displays (rows, columns) of the joined frame.
df_abuse_ores_act.shape

(7985, 78)

In [86]:
# Preview the first five rows of the merged activity + score frame.
df_abuse_ores_act.head()

Unnamed: 0,username,rev_count_0,rev_count_1,rev_count_2,rev_count_3,rev_count_4,rev_count_5,rev_count_6,rev_count_7,rev_count_8,...,goodfaith_score_5,goodfaith_score_6,goodfaith_score_7,goodfaith_score_8,goodfaith_score_9,goodfaith_score_10,goodfaith_score_11,goodfaith_score_12,goodfaith_score_13,goodfaith_score_14
0,(127.0.0.1),0.2,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.996875,0.996484,0.996302,,,,,,,
1,(35)moo,0.428571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429,...,,,,,,,,,,
2,(TIB1017DTIB1018B),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,(TSB-98989898959359343848438468678378728872402042042844838438438),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,...(his name isn't Charles),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [87]:
# Show every column of the first merged row (position 0) as a Series.
df_abuse_ores_act.iloc[0,:]

username              (127.0.0.1)
rev_count_0           0.2        
rev_count_1           0.666667   
rev_count_2           0          
rev_count_3           0          
rev_count_4           0          
rev_count_5           0          
rev_count_6           0          
rev_count_7           0          
rev_count_8           0          
rev_count_9           0          
rev_count_10          0.133333   
rev_count_11          0          
rev_count_12          0          
rev_count_13          0          
rev_count_14          0          
rev_avglen_0          9720       
rev_avglen_1          10261      
rev_avglen_2          0          
rev_avglen_3          0          
rev_avglen_4          0          
rev_avglen_5          0          
rev_avglen_6          0          
rev_avglen_7          0          
rev_avglen_8          0          
rev_avglen_9          0          
rev_avglen_10         16241      
rev_avglen_11         0          
rev_avglen_12         0          
rev_avglen_13 

In [88]:
# Display the column names of the merged frame.
df_abuse_ores_act.columns

Index(['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered', 'abuse_score_1',
       'abuse_score_2', 'abuse_score_3', 'abuse_score_4', 'abuse_score_5',
       'abuse_score_6', 'abuse_score_7', 'abuse_score_8', 'abuse_score_9',
       'abuse_score_10', 'abuse_score_11', 'abuse_score_12', 'abuse_score_13',
       'abuse_score_14', 'damage_score_1', 'damage_score_2', 'damage_score_3',
       'damage_score_4', 'damage_score_5', 'd

In [89]:
# Save the merged activity + abuse/ORES frame as a tab-separated text file
# (train_abuse_ores_act.txt).
#
# `header` lists the expected output columns in order: username, the
# rev_count_* / rev_avglen_* columns (suffixes 0-14), the five summary
# columns, then the abuse / damage / goodfaith score columns (suffixes 1-14).
# It is generated rather than hand-copied so the suffix ranges cannot drift,
# and it is passed as `columns=` so the written schema is stated here instead
# of depending on whatever the merge happened to produce.
header = (
    ['username']
    + ['rev_count_{}'.format(idx) for idx in range(15)]
    + ['rev_avglen_{}'.format(idx) for idx in range(15)]
    + ['2wkactivedays', 'blocked', 'minor_count_norm', 'dlt_count_norm',
       'registered']
    + ['abuse_score_{}'.format(idx) for idx in range(1, 15)]
    + ['damage_score_{}'.format(idx) for idx in range(1, 15)]
    + ['goodfaith_score_{}'.format(idx) for idx in range(1, 15)]
)
df_abuse_ores_act.to_csv(
    '/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/train_abuse_ores_act.txt',
    sep='\t',
    encoding='utf-8',
    columns=header,  # which columns to write, and in what order
    header=True,     # emit the column names as the first row
    index=False,     # do not write the row index
)