In [1]:
% matplotlib inline
import pandas as pd
from dateutil.relativedelta import relativedelta
import statsmodels.formula.api as sm
import requests

# Find harassed editors

In [2]:
# Load the scored user-talk comments for every year in `years` (2001-2015),
# drop admin and bot messages, and collect the talk pages (`page_title`) that
# received at least one comment scoring above `threshold` on any classifier.
usecols = [3,5,7,8,9,10,11,12,13]
years = range(2001,2016)
threshold = 0.425

dfs = []

for year in years:

    df = pd.read_csv("../../data/figshare/scored/comments_user_%d.tsv.gz" % year,
                     sep = "\t",
                     compression = "gzip",
                     usecols = usecols)
    df = df.query("bot == 0 and admin == 0")

    # `@threshold` hands the float to the query engine unchanged; formatting it
    # with "%f" would round it to six decimal places first.
    # Comments that users leave on their own talk page are not counted.
    df = (df.query("pred_attack_score > @threshold "
                   "or pred_aggression_score > @threshold "
                   "or pred_toxicity_score > @threshold")
            .query("user_text != page_title")[['page_title']])

    dfs.append(df)
    print(df.shape)

# One row per talk page that received at least one above-threshold comment.
df_attacked_users = pd.concat(dfs).drop_duplicates()
print(df_attacked_users.shape[0])

(1, 1)
(64, 1)
(463, 1)
(1947, 1)
(7997, 1)
(24398, 1)
(27906, 1)
(24882, 1)
(19225, 1)
(14744, 1)
(10818, 1)
(9922, 1)
(9041, 1)
(8817, 1)
(8588, 1)
52780


# Create Newcomer Sample

We will select all newcomers who received some form of harassment, as determined by one of our comment-level harassment classifiers, and a sample of 100000 randomly selected newcomers.

In [4]:
# Build the newcomer sample: `n_random` randomly drawn newcomers plus every
# newcomer whose talk page appears in `df_attacked_users`.
n_random = 100000
# `sep` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df_newcomers = pd.read_csv("../../data/retention/user_start.tsv", sep = "\t")[['user_text']]
# Fixed random_state keeps the random draw reproducible across runs.
df_random_newcomers = df_newcomers.sample(n_random, random_state = 12)
# A talk page title equal to a newcomer's user name marks that newcomer as attacked.
df_attacked_newcomers = df_attacked_users.merge(df_newcomers, right_on = 'user_text', left_on = 'page_title')[['user_text']]
# The two groups can overlap, so de-duplicate after concatenating.
df_newcomer_sample = pd.concat([df_random_newcomers, df_attacked_newcomers]).drop_duplicates()
df_newcomer_sample.to_csv("../../data/retention/newcomer_sample.csv")
print("Num atttacked newcomers: ", df_attacked_newcomers.shape[0])
print("Sample Size: ", df_newcomer_sample.shape[0])

Num atttacked newcomers:  20320
Sample Size:  120036


# Load Data for newcomer sample


The data used in this analysis includes:
1. all user and article talk page comments, labeled by harassment classifiers, except those generated by bots or templates
2. all newly registered users, who made at least one edit
3. edits per day per namespace for all newcomers
4. user warnings received by 2015 newcomers
5. genders of all editors if available

In [6]:
# Load the scored user-talk and article-talk comments for every year in
# `years` (2001-2015), drop admin and bot messages, and keep the comments
# whose author (`user_text`) is in the newcomer sample.
# NOTE(review): the inner merge on `user_text` keeps only comments *written by*
# sampled users. Comments posted on a sampled user's talk page by someone
# outside the sample are dropped here, which limits what later ends up in
# `df_comments_received` -- confirm this is intended.
usecols = [3,5,7,8,9,10,11,12,13]
years = range(2001,2016)
nss = ['user', 'article']

dfs = []

for year in years:
    for ns in nss:
        df = (
            pd.read_csv("../../data/figshare/scored/comments_%s_%d.tsv.gz" % (ns, year),
                        sep = "\t",
                        compression = "gzip",
                        usecols = usecols)
            .assign(ns = ns)  # remember which talk namespace the comment came from
            .query("bot == 0 and admin == 0")
            .merge(df_newcomer_sample, how = 'inner', on = 'user_text')
        )
        dfs.append(df)
        print(df.shape)

df_annotated = pd.concat(dfs)
df_annotated['timestamp'] = pd.to_datetime(df_annotated['timestamp'])
print(df_annotated.shape[0])

(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(2, 10)
(1, 10)
(9, 10)
(2, 10)
(182785, 10)
(133734, 10)
(620560, 10)
(404339, 10)
(697303, 10)
(455657, 10)
(613904, 10)
(447507, 10)
(552919, 10)
(400483, 10)
(475518, 10)
(376810, 10)
(442438, 10)
(332057, 10)
(380078, 10)
(316875, 10)
(322693, 10)
(305246, 10)
(324747, 10)
(288185, 10)
8073852


In [11]:
# registration times of all editors in sample
# `sep` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df_user_start = pd.read_csv("../../data/retention/user_start.tsv", sep = "\t")
# keep a single row per user name before restricting to the sample
df_user_start = df_user_start.drop_duplicates('user_text')
df_user_start = df_user_start.merge(df_newcomer_sample, how = 'inner', on = 'user_text')
# both day columns are parsed as YYYYMMDD
df_user_start['registration_day'] = pd.to_datetime(df_user_start['registration_day'], format = '%Y%m%d')
df_user_start['first_edit_day'] = pd.to_datetime(df_user_start['first_edit_day'], format = '%Y%m%d')
print(df_user_start.shape[0])

120036


In [12]:
# load edits per day for editors in sample
# `sep` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df_edits = pd.read_csv("../../data/retention/daily_revision_counts.tsv", sep = "\t")
print(df_edits.shape[0])
# at most one row per (user, day, namespace)
df_edits = df_edits.drop_duplicates(['user_text', 'day', 'ns'])
print(df_edits.shape[0])
df_edits = df_edits.merge(df_newcomer_sample, how = 'inner', on = 'user_text')
# `day` is converted to text first so to_datetime parses it as a date string
df_edits['timestamp'] = pd.to_datetime(df_edits['day'].astype(str))
print(df_edits.shape[0])

91894576
91893632
13135121


In [13]:
# load user warnings for editors in sample
# `sep` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df_uw = pd.read_csv("../../data/retention/user_warnings.tsv", sep = "\t")
df_uw = df_uw.merge(df_newcomer_sample, how = 'inner', on = 'user_text')
# common `timestamp` column so warnings can be windowed like the other frames
df_uw['timestamp'] = pd.to_datetime(df_uw['rev_timestamp'])

In [14]:
# genders for all editors, restricted to the newcomer sample
# `sep` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df_gender = pd.read_csv("../../data/misc/genders.tsv", sep = "\t")
df_gender = df_gender.merge(df_newcomer_sample, how = 'inner', on = 'user_text')

In [15]:
# create df of consolidated user level features
# A single left merge attaches `gender` to every row of df_user_start; users
# without a gender record get 'unknown'. df_user_start has one row per
# user_text, so merging the result with df_user_start a second time would
# only re-attach columns that are already present.
df_user = df_user_start.merge(df_gender[['user_text', 'gender']], on = 'user_text', how = "left")
df_user['gender'] = df_user['gender'].fillna('unknown')
df_user = df_user[['user_text', 'gender', 'registration_day', 'first_edit_day']]
# free the intermediate frames; everything needed now lives in df_user
del df_user_start
del df_gender

In [16]:
df_user.shape

(120036, 4)

# Create User Objects
To help with extracting user-level features, we group the data sources above by user and store the results in a dedicated `User` object.

In [17]:
# map data frames into dictionaries keyed by user
def gb_to_dict(gb):
    """Turn an iterable of (key, group) pairs, such as a pandas groupby, into a dict.

    Returns a dict mapping each key to its group. If a key occurs more than
    once, the last group wins.
    """
    groups = {}
    for key, group in gb:
        groups[key] = group
    return groups

# Frames grouped by the `user_text` column.
df_annotated_user_text_groups, df_edits_groups, df_user_groups = [
    gb_to_dict(frame.groupby("user_text"))
    for frame in (df_annotated, df_edits, df_user)
]
# Frames grouped by the talk page (`page_title`) a message was posted on.
df_annotated_page_title_groups = gb_to_dict(
    df_annotated.query("ns == 'user'").groupby("page_title")
)
df_uw_groups = gb_to_dict(
    df_uw.groupby("page_title")  # page title is the recipient of the uw
)

In [18]:
# container for everything known about one newcomer
class User():
    """Bundle of the per-user slices of each data source.

    Attributes set by the constructor:
      user_text            -- the user's name
      df_activity          -- rows of the edits frame for this user, or None
      df_comments_made     -- annotated comments grouped under this user_text, or None
      df_comments_received -- user-namespace comments on the page titled with this
                              user's name where user_text differs from page_title, or None
      df_uw                -- user warnings grouped under this user's page title, or None
      gender, registration_day, first_edit_day -- taken from the first row of
                              the user's entry in df_user_groups
    """

    def __init__(self, user_text, df_annotated_user_text_groups, df_annotated_page_title_groups, df_edits_groups, df_user_groups, df_uw_groups):
        self.user_text = user_text

        # Optional sources: a user without any rows simply gets None.
        self.df_activity = df_edits_groups.get(user_text)
        self.df_comments_made = df_annotated_user_text_groups.get(user_text)
        self.df_uw = df_uw_groups.get(user_text)

        received = df_annotated_page_title_groups.get(user_text)
        if received is not None:
            # keep only user-talk comments where user_text differs from page_title
            received = received.query("ns == 'user' and user_text != page_title")
        self.df_comments_received = received

        # Mandatory source: raises KeyError if the user has no profile row.
        profile = df_user_groups[user_text]
        self.gender = profile['gender'].iloc[0]
        self.registration_day = profile['registration_day'].iloc[0]
        self.first_edit_day = profile['first_edit_day'].iloc[0]

In [24]:
# Drop rows with a missing value. The sample frame only holds `user_text`,
# so this removes entries without a user name before User objects are built
# from that column.
# NOTE(review): this runs after the sample was already written to CSV and
# merged into the other frames, so those still contain any such row.
df_newcomer_sample = df_newcomer_sample.dropna()

In [26]:
# One User per name in the newcomer sample, in sample order; each gets its
# slice of every grouped data source.
user_objects = [
    User(
        sampled_name,
        df_annotated_user_text_groups,
        df_annotated_page_title_groups,
        df_edits_groups,
        df_user_groups,
        df_uw_groups,
    )
    for sampled_name in df_newcomer_sample['user_text']
]

In [27]:
import pickle
# Persist the User objects. The context manager closes the file handle even
# if pickling fails, so the data is flushed to disk and the handle is not leaked.
with open("../../data/retention/newcomer_sample_pickle.pkl", "wb") as pickle_file:
    pickle.dump(user_objects, pickle_file)

### Feature extraction

Our measures of user activity over a time span include:
1. number of edits in all namespaces
2. number of days active (a user is active on a day if they make at least one edit in any namespace)
3. number of edit sessions (an edit session is a sequence of edits without a gap of 60 minutes or more)
4. indicator of whether the user made at least one edit in any namespace


Our measures of harassment received/made over a time span are:
1. number of comments received/made that classifier `clf` scored above `threshold`
2. number of comments received/made that scored above `threshold` for any of our 3 harassment classifiers
3. indicator of whether the user received/made at least one comment that scored above `threshold` for any of our 3 harassment classifiers


We also gather:
1. each user's gender
2. and the number of user warnings the editor received

As mentioned above, we gather activity and harassment features for newcomers in timespan t1 and see how they correlate with activity features in timespan t2.

In the following analysis, the two time spans we are interested in are the first and second month after user registration.

In [28]:
def select_month_since_registration(user,  activity, t):
    """Return the rows of `activity` that fall in the t-th month after registration.

    The window is half-open: it starts `t - 1` months after
    `user.registration_day` (inclusive) and ends `t` months after it
    (exclusive). Rows are selected on the `timestamp` column of `activity`.
    """
    window_start = user.registration_day + relativedelta(months=(t-1))
    window_stop = user.registration_day + relativedelta(months= t)
    stamps = activity['timestamp']
    in_window = (stamps < window_stop) & (stamps >= window_start)
    return activity[in_window]

def count_edits(user, t):
    """Sum `n_revisions` over the user's activity rows in month `t` after registration.

    Returns 0 when the user has no activity frame.
    """
    if user.df_activity is None:
        return 0
    in_month = select_month_since_registration(user, user.df_activity, t)
    return in_month['n_revisions'].sum()

def count_ns0_revisions(user, t):
    """Sum `n_revisions` over rows with ns == '0' in month `t` after registration.

    Returns 0 when the user has no activity frame.
    """
    if user.df_activity is None:
        return 0
    in_month = select_month_since_registration(user, user.df_activity, t)
    ns0_rows = in_month.query("ns=='0'")
    return ns0_rows['n_revisions'].sum()


def count_days_active(user, t):
    """Count the distinct `timestamp` values in the user's activity for month `t`.

    Returns 0 when the user has no activity frame.
    """
    if user.df_activity is None:
        return 0
    in_month = select_month_since_registration(user, user.df_activity, t)
    distinct_stamps = in_month['timestamp'].unique()
    return len(distinct_stamps)

def count_score_received_above_threshold(user, score, threshold, t):
    """Count comments received in month `t` whose `score` column exceeds `threshold`.

    `score` names a column of `user.df_comments_received`. Returns 0 when the
    user has no received-comments frame.
    """
    received = user.df_comments_received
    if received is None:
        return 0

    in_month = select_month_since_registration(user, received, t)
    return in_month[score].gt(threshold).sum()

def count_score_made_above_threshold(user, score, threshold, t):
    """Count comments made in month `t` whose `score` column exceeds `threshold`.

    `score` names a column of `user.df_comments_made`. Returns 0 when the
    user has no made-comments frame.
    """
    made = user.df_comments_made
    if made is None:
        return 0

    in_month = select_month_since_registration(user, made, t)
    return in_month[score].gt(threshold).sum()

def is_female(u):
    """Return 1 if `u.gender` equals 'female', otherwise 0."""
    return 1 if u.gender == 'female' else 0

def is_male(u):
    """Return 1 if `u.gender` equals 'male', otherwise 0."""
    return 1 if u.gender == 'male' else 0

def count_warnings_received(user, t):
    """Count the user-warning rows for `user` in month `t` after registration.

    Returns 0 when the user has no warnings frame.
    """
    if user.df_uw is None:
        return 0
    in_month = select_month_since_registration(user, user.df_uw, t)
    return len(in_month)

def count_fraction_of_ns0_revisions_x(user, x, t):
    """Return sum(column `x`) / sum(`n_revisions`) over ns == '0' rows in month `t`.

    `x` names a column of `user.df_activity`. Returns 0 when the user has no
    activity frame or when the selected rows sum to fewer than one revision.
    """
    if user.df_activity is None:
        return 0

    ns0_rows = user.df_activity.query("ns=='0'")
    in_month = select_month_since_registration(user, ns0_rows, t)

    total_revisions = in_month['n_revisions'].sum()
    if total_revisions < 1:
        # nothing to divide by
        return 0

    return float(in_month[x].sum()) / total_revisions
    


In [29]:
def _per_user(extract, *args):
    """Evaluate `extract(user, *args)` for every entry of `user_objects`, in order."""
    return [extract(u, *args) for u in user_objects]

# One row per User object. The misspelled keys 't1_num_aggresssion_made' and
# 't1_num_warnings_recieved' are kept verbatim: they become column headers of
# the CSV written at the end of this cell.
df_features = pd.DataFrame({
        'registration_day' : [u.registration_day for u in user_objects],
        't1_num_ns0_edits' : _per_user(count_ns0_revisions, 1),
        'user_text' : [u.user_text for u in user_objects],
        'is_female' : _per_user(is_female),
        'is_male' : _per_user(is_male),
        't1_num_edits' : _per_user(count_edits, 1),
        't2_num_edits' : _per_user(count_edits, 2),
        't1_num_days_active' : _per_user(count_days_active, 1),
        't2_num_days_active' : _per_user(count_days_active, 2),
        't1_num_attacks_received' : _per_user(count_score_received_above_threshold, 'pred_attack_score',  threshold, 1),
        't1_num_aggression_received' : _per_user(count_score_received_above_threshold, 'pred_aggression_score',  threshold, 1),
        't1_num_toxicity_received' : _per_user(count_score_received_above_threshold, 'pred_toxicity_score',  threshold, 1),
        't1_num_attacks_made' : _per_user(count_score_made_above_threshold, 'pred_attack_score',  threshold, 1),
        't1_num_aggresssion_made': _per_user(count_score_made_above_threshold, 'pred_aggression_score',  threshold, 1),
        't1_num_toxicity_made': _per_user(count_score_made_above_threshold, 'pred_toxicity_score',  threshold, 1),
        't1_num_warnings_recieved' : _per_user(count_warnings_received, 1),
        't1_fraction_ns0_deleted' : _per_user(count_fraction_of_ns0_revisions_x, 'n_deleted_revisions', 1),
        't1_fraction_ns0_reverted' : _per_user(count_fraction_of_ns0_revisions_x, 'n_identity_reverted_revisions', 1),
        't1_fraction_ns0_productive' : _per_user(count_fraction_of_ns0_revisions_x, 'n_productive_revisions', 1),
    })

# 0/1 indicators derived from the counts above.
made_edits_t1 = df_features['t1_num_edits'] > 0
made_edits_t2 = df_features['t2_num_edits'] > 0
df_features['t1_active'] = made_edits_t1.apply(int)
df_features['t2_active'] = made_edits_t2.apply(int)

# "harassment" = at least one above-threshold comment on any of the three classifiers
received_columns = ['t1_num_attacks_received', 't1_num_aggression_received', 't1_num_toxicity_received']
made_columns = ['t1_num_attacks_made', 't1_num_aggresssion_made', 't1_num_toxicity_made']
df_features['t1_harassment_received'] = df_features[received_columns].gt(0).any(axis = 1).apply(int)
df_features['t1_harassment_made'] = df_features[made_columns].gt(0).any(axis = 1).apply(int)
df_features['has_gender'] = df_features[['is_female', 'is_male']].eq(1).any(axis = 1).apply(int)

# Index the table by user name (the column is removed from the data columns).
df_features = df_features.set_index('user_text')
print(df_features.shape)

# Keep only users with at least one edit in their first month, then persist.
df_active = df_features.query('t1_active == 1')
print(df_active.shape[0])
df_active.to_csv("../../data/retention/newcomer_sample_features.csv")

(120035, 23)
111290


In [34]:
df_active.head()

Unnamed: 0_level_0,is_female,is_male,registration_day,t1_fraction_ns0_deleted,t1_fraction_ns0_productive,t1_fraction_ns0_reverted,t1_num_aggression_received,t1_num_aggresssion_made,t1_num_attacks_made,t1_num_attacks_received,...,t1_num_toxicity_made,t1_num_toxicity_received,t1_num_warnings_recieved,t2_num_days_active,t2_num_edits,t1_active,t2_active,t1_harassment_received,t1_harassment_made,has_gender
user_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Chrisclements521,0,0,2010-10-01,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Callyloo,0,0,2007-11-21,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Eagleswar262,0,0,2009-08-04,0.0,1.0,0.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
ILY=)L0Li,0,0,2010-04-21,0.0,0.0,1.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Dirtyharry847,0,0,2006-05-02,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
