In [1]:
# Import necessary libraries

import pandas as pd

import numpy as np

import time

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score

from sklearn.model_selection import KFold, train_test_split

from sklearn.utils import shuffle

In [2]:
# Read in data

t0 = time.time()
df = pd.read_csv('aggregate-20160501.csv')
t1 = time.time()
print(str((t1-t0)/60) + " minutes")

6.669565852483114 minutes


In [3]:
#df.head()
#df.shape
#df.columns
#set(df.num_edits)

In [4]:
# Calculate percentage of unedited articles in original imbalanced data

len(df["num_edits"][df["num_edits"] == 0])/len(df["num_edits"])*100

99.91720181519122

In [5]:
# Check missingness in original imbalanced data

print(df.isnull().sum())

article_name                   1
num_edits                      0
views_30d                      0
views_7d                       0
views_3d                       0
views_1d                       0
edits_30d                      0
edits_7d                       0
edits_3d                       0
edits_1d                       0
minor_edits_30d                0
minor_edits_7d                 0
minor_edits_3d                 0
minor_edits_1d                 0
avg_size_30d             6428583
avg_size_7d             45268487
avg_size_3d             49800824
avg_size_1d             53028939
avg_size                  209968
latest_size               209968
talk_views_30d                 0
talk_views_7d                  0
talk_views_3d                  0
talk_views_1d                  0
talk_edits_30d                 0
talk_edits_7d                  0
talk_edits_3d                  0
talk_edits_1d                  0
talk_minor_edits_30d           0
talk_minor_edits_7d            0
talk_minor

In [6]:
# Generate binary edited vs. unedited variable

df['num_edits_binary'] = df['num_edits'].apply(lambda x: int(x > 0))
#set(df['num_edits_binary'])

In [7]:
# Store relevant variable names in a list

feature_names = [x for x in df.columns if 'num_edits' not in x and 'article_name' not in x]

label_name = "num_edits_binary"

feature_names

['views_30d',
 'views_7d',
 'views_3d',
 'views_1d',
 'edits_30d',
 'edits_7d',
 'edits_3d',
 'edits_1d',
 'minor_edits_30d',
 'minor_edits_7d',
 'minor_edits_3d',
 'minor_edits_1d',
 'avg_size_30d',
 'avg_size_7d',
 'avg_size_3d',
 'avg_size_1d',
 'avg_size',
 'latest_size',
 'talk_views_30d',
 'talk_views_7d',
 'talk_views_3d',
 'talk_views_1d',
 'talk_edits_30d',
 'talk_edits_7d',
 'talk_edits_3d',
 'talk_edits_1d',
 'talk_minor_edits_30d',
 'talk_minor_edits_7d',
 'talk_minor_edits_3d',
 'talk_minor_edits_1d',
 'talk_avg_size_30d',
 'talk_avg_size_7d',
 'talk_avg_size_3d',
 'talk_avg_size_1d',
 'talk_avg_size',
 'talk_latest_size']

In [8]:
## Generate balanced sample

t0 = time.time()

np.random.seed(seed=13579)

# to be set aside as imbalanced test data
set1_idx = np.random.choice(range(len(df)), int(len(df) * .5), replace=False)
set1_X = df.loc[set1_idx, feature_names]
set1_Y = df.loc[set1_idx, label_name]

# to be used to generate balanced data for main analysis
set2_idx = list(set(range(len(df))) - set(list(set1_idx)))
set2 = df.loc[set2_idx, list(df.columns.tolist())]

edited = set2[set2.num_edits > 0.0].copy(deep=True)
#edited.shape

not_edited = set2[set2.num_edits == 0.0].copy(deep=True)

not_edited_selected = not_edited[0:edited.shape[0]]

balanced_set = pd.concat([edited, not_edited_selected])

balanced_set = shuffle(balanced_set)
#balanced_set.shape

t1 = time.time()
print(str((t1-t0)/60) + " minutes")

5.176140817006429 minutes


In [9]:
print(edited.shape)
print(balanced_set.shape)

(25949, 39)
(51898, 39)


In [14]:
# Write balanced data to csv

balanced_set.to_csv('balanced_set_20160501.csv', sep=',')
#balanced_set.head()

In [2]:
# Read in balanced data

df_b = pd.read_csv('balanced_set_20160501.csv')
df_b = df_b.set_index('Unnamed: 0')
df_b.index.name = None
#df_b.head()

In [3]:
# Calculate percentage of unedited articles in balanced data

len(df_b["num_edits_binary"][df_b["num_edits_binary"] == 0])/len(df_b["num_edits_binary"])*100

50.0

In [5]:
# Check missingness in balanced data

print(df_b.isnull().sum())

article_name                0
num_edits                   0
views_30d                   0
views_7d                    0
views_3d                    0
views_1d                    0
edits_30d                   0
edits_7d                    0
edits_3d                    0
edits_1d                    0
minor_edits_30d             0
minor_edits_7d              0
minor_edits_3d              0
minor_edits_1d              0
avg_size_30d              331
avg_size_7d              4693
avg_size_3d              6052
avg_size_1d              6771
avg_size                    0
latest_size                 0
talk_views_30d              0
talk_views_7d               0
talk_views_3d               0
talk_views_1d               0
talk_edits_30d              0
talk_edits_7d               0
talk_edits_3d               0
talk_edits_1d               0
talk_minor_edits_30d        0
talk_minor_edits_7d         0
talk_minor_edits_3d         0
talk_minor_edits_1d         0
talk_avg_size_30d       11189
talk_avg_s

In [15]:
# Generate features

# Views

# Version 1

df_b['change_in_views_v1_7_30'] = df_b.views_7d/4 - df_b.views_30d/23

df_b['change_in_views_v1_3_7'] = df_b.views_3d/2 - df_b.views_7d/4 

df_b['change_in_views_v1_1_3'] = df_b.views_1d/1 - df_b.views_3d/2

# Version 2

df_b['change_in_views_v2_7_30'] = ((df_b.views_7d/4)+1)/((df_b.views_30d/23)+1)

df_b['change_in_views_v2_3_7'] = ((df_b.views_3d/2)+1)/((df_b.views_7d/4)+1)

df_b['change_in_views_v2_1_3'] = ((df_b.views_1d/1)+1)/((df_b.views_3d/2)+1)

In [16]:
df_b['change_in_views_v2_7_30'].describe()

count    51898.000000
mean         1.168456
std         12.958831
min          0.008131
25%          0.759434
50%          0.930747
75%          1.068584
max       2324.500000
Name: change_in_views_v2_7_30, dtype: float64

In [19]:
# Edits

# Version 1

df_b['change_in_edits_v1_7_30'] = df_b.edits_7d/4 - df_b.edits_30d/23

df_b['change_in_edits_v1_3_7'] = df_b.edits_3d/2 - df_b.edits_7d/4

df_b['change_in_edits_v1_1_3'] = df_b.edits_1d/1 - df_b.edits_3d/2

# Version 2

df_b['change_in_edits_v2_7_30'] = ((df_b.edits_7d/4)+1)/((df_b.edits_30d/23)+1)

df_b['change_in_edits_v2_3_7'] = ((df_b.edits_3d/2)+1)/((df_b.edits_7d/4)+1)

df_b['change_in_edits_v2_1_3'] = ((df_b.edits_1d/1)+1)/((df_b.edits_3d/2)+1)

In [20]:
df_b['change_in_edits_v2_7_30'].describe()

count    51898.000000
mean         1.033275
std          0.702587
min          0.045365
25%          1.000000
50%          1.000000
75%          1.000000
max        120.250000
Name: change_in_edits_v2_7_30, dtype: float64

In [23]:
# Minor edits

# Version 1

df_b['change_in_minor_edits_v1_7_30'] = df_b.minor_edits_7d/4 - df_b.minor_edits_30d/23

df_b['change_in_minor_edits_v1_3_7'] = df_b.minor_edits_3d/2 - df_b.minor_edits_7d/4

df_b['change_in_minor_edits_v1_1_3'] = df_b.minor_edits_1d/1 - df_b.minor_edits_3d/2

# Version 2

df_b['change_in_minor_edits_v2_7_30'] = ((df_b.minor_edits_7d/4)+1)/((df_b.minor_edits_30d/23)+1)

df_b['change_in_minor_edits_v2_3_7'] = ((df_b.minor_edits_3d/2)+1)/((df_b.minor_edits_7d/4)+1)

df_b['change_in_minor_edits_v2_1_3'] = ((df_b.minor_edits_1d/1)+1)/((df_b.minor_edits_3d/2)+1)

In [24]:
df_b['change_in_minor_edits_v2_7_30'].describe()

count    51898.000000
mean         1.005038
std          0.143027
min          0.171642
25%          1.000000
50%          1.000000
75%          1.000000
max         12.250000
Name: change_in_minor_edits_v2_7_30, dtype: float64

In [26]:
# Average size

# Version 1

df_b['change_in_avg_size_v1_7_30'] = df_b[['avg_size_7d', 'avg_size_30d', 'edits_7d', 'edits_30d']].apply(
    lambda x: 0
    if (x['edits_7d'] + x['edits_30d'] == 0)
    else (x['avg_size_7d']/4 - x['avg_size_30d']/23), 
    axis=1
)

df_b['change_in_avg_size_v1_3_7'] = df_b[['avg_size_3d', 'avg_size_7d', 'edits_3d', 'edits_7d']].apply(
    lambda x: 0
    if (x['edits_3d'] + x['edits_7d'] == 0)
    else (x['avg_size_3d']/2 - x['avg_size_7d']/4), 
    axis=1
)

df_b['change_in_avg_size_v1_1_3'] = df_b[['avg_size_1d', 'avg_size_3d', 'edits_1d', 'edits_3d']].apply(
    lambda x: 0
    if (x['edits_1d'] + x['edits_3d'] == 0)
    else (x['avg_size_1d']/1 - x['avg_size_3d']/2), 
    axis=1
)

# Version 2

df_b['change_in_avg_size_v2_7_30'] = df_b[['avg_size_7d', 'avg_size_30d', 'edits_7d', 'edits_30d']].apply(
    lambda x: 0
    if (x['edits_7d'] + x['edits_30d'] == 0)
    else (((x['avg_size_7d']/4)+1)/((x['avg_size_30d']/23)+1)), 
    axis=1
)

df_b['change_in_avg_size_v2_3_7'] = df_b[['avg_size_3d', 'avg_size_7d', 'edits_3d', 'edits_7d']].apply(
    lambda x: 0
    if (x['edits_3d'] + x['edits_7d'] == 0)
    else (((x['avg_size_3d']/2)+1)/((x['avg_size_7d']/4)+1)), 
    axis=1
)

df_b['change_in_avg_size_v2_1_3'] = df_b[['avg_size_1d', 'avg_size_3d', 'edits_1d', 'edits_3d']].apply(
    lambda x: 0
    if (x['edits_1d'] + x['edits_3d'] == 0)
    else (((x['avg_size_1d']/1)+1)/((x['avg_size_3d']/2)+1)), 
    axis=1
)

In [29]:
# Talk views

df_b['change_in_talk_views_v1_7_30'] = df_b.talk_views_7d/4 - df_b.talk_views_30d/23

df_b['change_in_talk_views_v1_3_7'] = df_b.talk_views_3d/2 - df_b.talk_views_7d/4 

df_b['change_in_talk_views_v1_1_3'] = df_b.talk_views_1d/1 - df_b.talk_views_3d/2

# Version 2

df_b['change_in_talk_views_v2_7_30'] = ((df_b.talk_views_7d/4)+1)/((df_b.talk_views_30d/23)+1)

df_b['change_in_talk_views_v2_3_7'] = ((df_b.talk_views_3d/2)+1)/((df_b.talk_views_7d/4)+1)

df_b['change_in_talk_views_v2_1_3'] = ((df_b.talk_views_1d/1)+1)/((df_b.talk_views_3d/2)+1)

In [30]:
df_b['change_in_talk_views_v2_7_30'].describe()

count    51898.000000
mean         0.993708
std          0.304782
min          0.036918
25%          0.851852
50%          1.000000
75%          1.064815
max         28.097938
Name: change_in_talk_views_v2_7_30, dtype: float64

In [32]:
# Talk edits

# Version 1

df_b['change_in_talk_edits_v1_7_30'] = df_b.talk_edits_7d/4 - df_b.talk_edits_30d/23

df_b['change_in_talk_edits_v1_3_7'] = df_b.talk_edits_3d/2 - df_b.talk_edits_7d/4

df_b['change_in_talk_edits_v1_1_3'] = df_b.talk_edits_1d/1 - df_b.talk_edits_3d/2

# Version 2

df_b['change_in_talk_edits_v2_7_30'] = ((df_b.talk_edits_7d/4)+1)/((df_b.talk_edits_30d/23)+1)

df_b['change_in_talk_edits_v2_3_7'] = ((df_b.talk_edits_3d/2)+1)/((df_b.talk_edits_7d/4)+1)

df_b['change_in_talk_edits_v2_1_3'] = ((df_b.talk_edits_1d/1)+1)/((df_b.talk_edits_3d/2)+1)

In [33]:
df_b['change_in_talk_edits_v2_7_30'].describe()

count    51898.000000
mean         1.001752
std          0.089904
min          0.056235
25%          1.000000
50%          1.000000
75%          1.000000
max          8.510000
Name: change_in_talk_edits_v2_7_30, dtype: float64

In [34]:
# Talk minor edits

# Version 1

df_b['change_in_talk_minor_edits_v1_7_30'] = df_b.talk_minor_edits_7d/4 - df_b.talk_minor_edits_30d/23

df_b['change_in_talk_minor_edits_v1_3_7'] = df_b.talk_minor_edits_3d/2 - df_b.talk_minor_edits_7d/4

df_b['change_in_talk_minor_edits_v1_1_3'] = df_b.talk_minor_edits_1d/1 - df_b.talk_minor_edits_3d/2

# Version 2

df_b['change_in_talk_minor_edits_v2_7_30'] = ((df_b.talk_minor_edits_7d/4)+1)/((df_b.talk_minor_edits_30d/23)+1)

df_b['change_in_talk_minor_edits_v2_3_7'] = ((df_b.talk_minor_edits_3d/2)+1)/((df_b.talk_minor_edits_7d/4)+1)

df_b['change_in_talk_minor_edits_v2_1_3'] = ((df_b.talk_minor_edits_1d/1)+1)/((df_b.talk_minor_edits_3d/2)+1)

In [35]:
df_b['change_in_talk_minor_edits_v2_7_30'].describe()

count    51898.000000
mean         0.999938
std          0.019777
min          0.547619
25%          1.000000
50%          1.000000
75%          1.000000
max          2.750000
Name: change_in_talk_minor_edits_v2_7_30, dtype: float64

In [36]:
# Talk average size

# Version 1

df_b['change_in_talk_avg_size_v1_7_30'] = df_b[['talk_avg_size_7d', 'talk_avg_size_30d', 
                                                'talk_edits_7d', 'talk_edits_30d']].apply(
    lambda x: 0
    if (x['talk_edits_7d'] + x['talk_edits_30d'] == 0)
    else (x['talk_avg_size_7d']/4 - x['talk_avg_size_30d']/23), 
    axis=1
)

df_b['change_in_talk_avg_size_v1_3_7'] = df_b[['talk_avg_size_3d', 'talk_avg_size_7d', 
                                               'talk_edits_3d', 'talk_edits_7d']].apply(
    lambda x: 0
    if (x['talk_edits_3d'] + x['talk_edits_7d'] == 0)
    else (x['talk_avg_size_3d']/2 - x['talk_avg_size_7d']/4), 
    axis=1
)

df_b['change_in_talk_avg_size_v1_1_3'] = df_b[['talk_avg_size_1d', 'talk_avg_size_3d', 
                                               'talk_edits_1d', 'talk_edits_3d']].apply(
    lambda x: 0
    if (x['talk_edits_1d'] + x['talk_edits_3d'] == 0)
    else (x['talk_avg_size_1d']/1 - x['talk_avg_size_3d']/2), 
    axis=1
)

# Version 2

df_b['change_in_talk_avg_size_v2_7_30'] = df_b[['talk_avg_size_7d', 'talk_avg_size_30d', 
                                                'talk_edits_7d', 'talk_edits_30d']].apply(
    lambda x: 0
    if (x['talk_edits_7d'] + x['talk_edits_30d'] == 0)
    else (((x['talk_avg_size_7d']/4)+1)/((x['talk_avg_size_30d']/23)+1)), 
    axis=1
)

df_b['change_in_talk_avg_size_v2_3_7'] = df_b[['talk_avg_size_3d', 'talk_avg_size_7d', 
                                               'talk_edits_3d', 'talk_edits_7d']].apply(
    lambda x: 0
    if (x['talk_edits_3d'] + x['talk_edits_7d'] == 0)
    else (((x['talk_avg_size_3d']/2)+1)/((x['talk_avg_size_7d']/4)+1)), 
    axis=1
)

df_b['change_in_talk_avg_size_v2_1_3'] = df_b[['talk_avg_size_1d', 'talk_avg_size_3d', 
                                               'talk_edits_1d', 'talk_edits_3d']].apply(
    lambda x: 0
    if (x['talk_edits_1d'] + x['talk_edits_3d'] == 0)
    else (((x['talk_avg_size_1d']/1)+1)/((x['talk_avg_size_3d']/2)+1)), 
    axis=1
)

In [37]:
df_b['change_in_talk_avg_size_v2_1_3'].describe()

count    51656.000000
mean         0.017919
std          0.289071
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         25.786123
Name: change_in_talk_avg_size_v2_1_3, dtype: float64

In [38]:
# Check missingness in balanced data with new features

print(df_b.isnull().sum())

article_name                             0
num_edits                                0
views_30d                                0
views_7d                                 0
views_3d                                 0
views_1d                                 0
edits_30d                                0
edits_7d                                 0
edits_3d                                 0
edits_1d                                 0
minor_edits_30d                          0
minor_edits_7d                           0
minor_edits_3d                           0
minor_edits_1d                           0
avg_size_30d                           331
avg_size_7d                           4693
avg_size_3d                           6052
avg_size_1d                           6771
avg_size                                 0
latest_size                              0
talk_views_30d                           0
talk_views_7d                            0
talk_views_3d                            0
talk_views_

In [39]:
df_b

Unnamed: 0,article_name,num_edits,views_30d,views_7d,views_3d,views_1d,edits_30d,edits_7d,edits_3d,edits_1d,...,change_in_talk_minor_edits_v1_1_3,change_in_talk_minor_edits_v2_7_30,change_in_talk_minor_edits_v2_3_7,change_in_talk_minor_edits_v2_1_3,change_in_talk_avg_size_v1_7_30,change_in_talk_avg_size_v1_3_7,change_in_talk_avg_size_v1_1_3,change_in_talk_avg_size_v2_7_30,change_in_talk_avg_size_v2_3_7,change_in_talk_avg_size_v2_1_3
24625461,Jano_Toussounian,3.0,125.0,11.0,10.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.000000,1.0,1.0,0.000000,0.0000,0.0,0.000000,0.000000,0.000000
48635,2014%E2%80%9315_Acad%C3%A9mico/83_season,0.0,67.0,13.0,6.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.000000,1.0,1.0,0.000000,0.0000,0.0,0.000000,0.000000,0.000000
19619262,Dickin_Medal,1.0,13456.0,855.0,496.0,182.0,9.0,0.0,0.0,0.0,...,0.0,1.250000,0.8,1.0,8768.416149,422.5000,0.0,5.867060,1.039972,0.000000
383982,Arsaber,1.0,106.0,13.0,8.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.000000,1.0,1.0,0.000000,0.0000,0.0,0.000000,0.000000,0.000000
48469130,Rajkumar_Singhajit_Singh,1.0,306.0,54.0,27.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.000000,1.0,1.0,0.000000,0.0000,0.0,0.000000,0.000000,0.000000
43047188,Nemophora_askoldela,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.000000,1.0,1.0,0.000000,0.0000,0.0,0.000000,0.000000,0.000000
24861229,Italy_national_racquetball_team,2.0,48.0,9.0,5.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.000000,1.0,1.0,0.000000,0.0000,0.0,0.000000,0.000000,0.000000
43215,2009_NSW_Premier_League_season,0.0,84.0,6.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.000000,1.0,1.0,0.000000,0.0000,0.0,0.000000,0.000000,0.000000
34226621,Macross_Delta,2.0,42218.0,8954.0,2448.0,1124.0,80.0,34.0,6.0,5.0,...,0.0,1.000000,1.0,1.0,0.000000,0.0000,0.0,0.000000,0.000000,0.000000
7204,%E5%8C%97%E4%BA%AC%E5%8F%98%E7%89%8C%E9%92%B1%...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.000000,1.0,1.0,0.000000,0.0000,0.0,0.000000,0.000000,0.000000


In [40]:
# Write balanced data (with new features) to csv

df_b.to_csv('balanced_set2_20160501.csv', sep=',')
#df_b.head()

In [41]:
# Read in balanced data (with new features)

df_b2 = pd.read_csv('balanced_set2_20160501.csv')
df_b2 = df_b2.set_index('Unnamed: 0')
df_b2.index.name = None
# df_b.head()

In [42]:
df_b2_ri = df_b2.reset_index()
df_b2_ri = df_b2_ri.rename(columns={'index': 'original_index'})
df_b2_ri.shape

(51898, 88)

In [43]:
# Store relevant variable names (including new features) in a list

new_feature_names = [x for x in df_b2_ri.columns 
                     if 'original_index' not in x
                     and 'article_name' not in x
                     and 'num_edits' not in x
                     and 'v2' not in x
                     and 'talk_avg_size' not in x
                     and 'talk_latest_size' not in x]

print(len(new_feature_names))

label_name = "num_edits_binary"

51


In [110]:
# Drop rows with NaN's 

df_b2_ri = df_b2_ri[new_feature_names + [label_name]]
df_b2_ri = df_b2_ri.dropna() 

print(df_b2_ri.shape)

df_b2_ri.isnull().sum()

(43607, 38)


views_30d                       0
views_7d                        0
views_3d                        0
views_1d                        0
edits_30d                       0
edits_7d                        0
edits_3d                        0
edits_1d                        0
avg_size_30d                    0
avg_size_7d                     0
avg_size_3d                     0
avg_size_1d                     0
avg_size                        0
latest_size                     0
talk_views_30d                  0
talk_views_7d                   0
talk_views_3d                   0
talk_views_1d                   0
talk_edits_30d                  0
talk_edits_7d                   0
talk_edits_3d                   0
talk_edits_1d                   0
change_in_views_v1_7_30         0
change_in_views_v1_3_7          0
change_in_views_v1_1_3          0
change_in_edits_v1_7_30         0
change_in_edits_v1_3_7          0
change_in_edits_v1_1_3          0
change_in_avg_size_v1_7_30      0
change_in_avg_

In [111]:
# Convert data frame to matrix

X = df_b2_ri[new_feature_names].as_matrix()

Y = df_b2_ri[label_name].as_matrix()

In [112]:
# Logistic regression

np.random.seed(seed=13579)

kf = KFold(n_splits=5, random_state=13579)

train_acc = []
test_acc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]

    model = LogisticRegression()
    model.fit(train_x, train_y)
    
    train_acc += [accuracy_score(train_y, model.predict(train_x))]
    test_acc += [accuracy_score(test_y, model.predict(test_x))]

print("Train accuracy is: " + str(np.mean(train_acc)))
print("Test accuracy is: " + str(np.mean(test_acc)))

0.13702163298924763 minutes
Train accuracy is: 0.680911407167
Test accuracy is: 0.68011978824


In [None]:
# TBD LATER IF HAVE EXTRA TIME

# - Explore class weights
# - Explore different algorithms