In [50]:
# Import necessary libraries

import pandas as pd

import numpy as np

import time

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

from sklearn.model_selection import KFold, train_test_split

from sklearn.utils import shuffle

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read in data

t0 = time.time()
df = pd.read_csv('aggregate-20160501.csv')
t1 = time.time()
print(str((t1-t0)/60) + " minutes")

6.257964468002319 minutes


In [3]:
#df.head()
#df.shape
#df.columns
#set(df.num_edits)

In [4]:
# Calculate percentage of unedited articles in original imbalanced data

len(df["num_edits"][df["num_edits"] == 0])/len(df["num_edits"])*100

99.91720181519122

In [5]:
# Check missingness in original imbalanced data

print(df.isnull().sum())

article_name                   1
num_edits                      0
views_30d                      0
views_7d                       0
views_3d                       0
views_1d                       0
edits_30d                      0
edits_7d                       0
edits_3d                       0
edits_1d                       0
minor_edits_30d                0
minor_edits_7d                 0
minor_edits_3d                 0
minor_edits_1d                 0
avg_size_30d             6428583
avg_size_7d             45268487
avg_size_3d             49800824
avg_size_1d             53028939
avg_size                  209968
latest_size               209968
talk_views_30d                 0
talk_views_7d                  0
talk_views_3d                  0
talk_views_1d                  0
talk_edits_30d                 0
talk_edits_7d                  0
talk_edits_3d                  0
talk_edits_1d                  0
talk_minor_edits_30d           0
talk_minor_edits_7d            0
talk_minor

In [6]:
# Generate binary edited vs. unedited variable

df['num_edits_binary'] = df['num_edits'].apply(lambda x: int(x > 0))
#set(df['num_edits_binary'])

In [7]:
# Store relevant variable names in a list

feature_names = [x for x in df.columns if 'num_edits' not in x and 'article_name' not in x]

label_name = "num_edits_binary"

feature_names

['views_30d',
 'views_7d',
 'views_3d',
 'views_1d',
 'edits_30d',
 'edits_7d',
 'edits_3d',
 'edits_1d',
 'minor_edits_30d',
 'minor_edits_7d',
 'minor_edits_3d',
 'minor_edits_1d',
 'avg_size_30d',
 'avg_size_7d',
 'avg_size_3d',
 'avg_size_1d',
 'avg_size',
 'latest_size',
 'talk_views_30d',
 'talk_views_7d',
 'talk_views_3d',
 'talk_views_1d',
 'talk_edits_30d',
 'talk_edits_7d',
 'talk_edits_3d',
 'talk_edits_1d',
 'talk_minor_edits_30d',
 'talk_minor_edits_7d',
 'talk_minor_edits_3d',
 'talk_minor_edits_1d',
 'talk_avg_size_30d',
 'talk_avg_size_7d',
 'talk_avg_size_3d',
 'talk_avg_size_1d',
 'talk_avg_size',
 'talk_latest_size']

In [8]:
## Generate balanced sample

t0 = time.time()

np.random.seed(seed=13579)

# to be set aside as imbalanced test data
set1_idx = np.random.choice(range(len(df)), int(len(df) * .2), replace=False)
set1_X = df.loc[set1_idx, feature_names]
set1_Y = df.loc[set1_idx, label_name]

# to be used to generate balanced data for main analysis
set2_idx = list(set(range(len(df))) - set(list(set1_idx)))
set2 = df.loc[set2_idx, list(df.columns.tolist())]

edited = set2[set2.num_edits > 0.0].copy(deep=True)
#edited.shape

not_edited = set2[set2.num_edits == 0.0].copy(deep=True)

not_edited_selected = not_edited[0:edited.shape[0]]

balanced_set = pd.concat([edited, not_edited_selected])

balanced_set = shuffle(balanced_set)
#balanced_set.shape

t1 = time.time()
print(str((t1-t0)/60) + " minutes")

4.955658749739329 minutes


In [9]:
print(edited.shape)
print(balanced_set.shape)

(41624, 39)
(83248, 39)


In [10]:
# Write balanced data to csv

balanced_set.to_csv('balanced_set_20160501.csv', sep=',')
#balanced_set.head()

In [11]:
# Read in balanced data

df_b = pd.read_csv('balanced_set_20160501.csv')
df_b = df_b.set_index('Unnamed: 0')
df_b.index.name = None
#df_b.head()

In [12]:
# Calculate percentage of unedited articles in balanced data

len(df_b["num_edits_binary"][df_b["num_edits_binary"] == 0])/len(df_b["num_edits_binary"])*100

50.0

In [13]:
# Check missingness in balanced data

print(df_b.isnull().sum())

article_name                0
num_edits                   0
views_30d                   0
views_7d                    0
views_3d                    0
views_1d                    0
edits_30d                   0
edits_7d                    0
edits_3d                    0
edits_1d                    0
minor_edits_30d             0
minor_edits_7d              0
minor_edits_3d              0
minor_edits_1d              0
avg_size_30d              496
avg_size_7d              7505
avg_size_3d              9725
avg_size_1d             10885
avg_size                    1
latest_size                 1
talk_views_30d              0
talk_views_7d               0
talk_views_3d               0
talk_views_1d               0
talk_edits_30d              0
talk_edits_7d               0
talk_edits_3d               0
talk_edits_1d               0
talk_minor_edits_30d        0
talk_minor_edits_7d         0
talk_minor_edits_3d         0
talk_minor_edits_1d         0
talk_avg_size_30d       17892
talk_avg_s

In [14]:
# Generate features

# Views

df_b['total_views'] = df_b.views_1d + df_b.views_3d + df_b.views_7d + df_b.views_30d

# Version 1

df_b['change_in_views_v1_7_30'] = df_b.views_7d/4 - df_b.views_30d/23

df_b['change_in_views_v1_3_7'] = df_b.views_3d/2 - df_b.views_7d/4 

df_b['change_in_views_v1_1_3'] = df_b.views_1d/1 - df_b.views_3d/2

# Version 2

df_b['change_in_views_v2_7_30'] = ((df_b.views_7d/4)+1)/((df_b.views_30d/23)+1)

df_b['change_in_views_v2_3_7'] = ((df_b.views_3d/2)+1)/((df_b.views_7d/4)+1)

df_b['change_in_views_v2_1_3'] = ((df_b.views_1d/1)+1)/((df_b.views_3d/2)+1)

#df_b['total_views'].describe()

In [16]:
# Edits

df_b['total_edits'] = df_b.edits_1d + df_b.edits_3d + df_b.edits_7d + df_b.edits_30d

# Version 1

df_b['change_in_edits_v1_7_30'] = df_b.edits_7d/4 - df_b.edits_30d/23

df_b['change_in_edits_v1_3_7'] = df_b.edits_3d/2 - df_b.edits_7d/4

df_b['change_in_edits_v1_1_3'] = df_b.edits_1d/1 - df_b.edits_3d/2

# Version 2

df_b['change_in_edits_v2_7_30'] = ((df_b.edits_7d/4)+1)/((df_b.edits_30d/23)+1)

df_b['change_in_edits_v2_3_7'] = ((df_b.edits_3d/2)+1)/((df_b.edits_7d/4)+1)

df_b['change_in_edits_v2_1_3'] = ((df_b.edits_1d/1)+1)/((df_b.edits_3d/2)+1)

#df_b['total_edits'].describe()

In [18]:
# Minor edits

df_b['total_minor_edits'] = df_b.minor_edits_1d + df_b.minor_edits_3d + df_b.minor_edits_7d + df_b.minor_edits_30d

# Version 1

df_b['change_in_minor_edits_v1_7_30'] = df_b.minor_edits_7d/4 - df_b.minor_edits_30d/23

df_b['change_in_minor_edits_v1_3_7'] = df_b.minor_edits_3d/2 - df_b.minor_edits_7d/4

df_b['change_in_minor_edits_v1_1_3'] = df_b.minor_edits_1d/1 - df_b.minor_edits_3d/2

# Version 2

df_b['change_in_minor_edits_v2_7_30'] = ((df_b.minor_edits_7d/4)+1)/((df_b.minor_edits_30d/23)+1)

df_b['change_in_minor_edits_v2_3_7'] = ((df_b.minor_edits_3d/2)+1)/((df_b.minor_edits_7d/4)+1)

df_b['change_in_minor_edits_v2_1_3'] = ((df_b.minor_edits_1d/1)+1)/((df_b.minor_edits_3d/2)+1)

#df_b['total_minor_edits'].describe()

In [19]:
# Average size

# Version 1

df_b['change_in_avg_size_v1_7_30'] = df_b[['avg_size_7d', 'avg_size_30d', 'edits_7d', 'edits_30d']].apply(
    lambda x: 0
    if (x['edits_7d'] + x['edits_30d'] == 0)
    else (x['avg_size_7d']/4 - x['avg_size_30d']/23), 
    axis=1
)

df_b['change_in_avg_size_v1_3_7'] = df_b[['avg_size_3d', 'avg_size_7d', 'edits_3d', 'edits_7d']].apply(
    lambda x: 0
    if (x['edits_3d'] + x['edits_7d'] == 0)
    else (x['avg_size_3d']/2 - x['avg_size_7d']/4), 
    axis=1
)

df_b['change_in_avg_size_v1_1_3'] = df_b[['avg_size_1d', 'avg_size_3d', 'edits_1d', 'edits_3d']].apply(
    lambda x: 0
    if (x['edits_1d'] + x['edits_3d'] == 0)
    else (x['avg_size_1d']/1 - x['avg_size_3d']/2), 
    axis=1
)

# Version 2

df_b['change_in_avg_size_v2_7_30'] = df_b[['avg_size_7d', 'avg_size_30d', 'edits_7d', 'edits_30d']].apply(
    lambda x: 0
    if (x['edits_7d'] + x['edits_30d'] == 0)
    else (((x['avg_size_7d']/4)+1)/((x['avg_size_30d']/23)+1)), 
    axis=1
)

df_b['change_in_avg_size_v2_3_7'] = df_b[['avg_size_3d', 'avg_size_7d', 'edits_3d', 'edits_7d']].apply(
    lambda x: 0
    if (x['edits_3d'] + x['edits_7d'] == 0)
    else (((x['avg_size_3d']/2)+1)/((x['avg_size_7d']/4)+1)), 
    axis=1
)

df_b['change_in_avg_size_v2_1_3'] = df_b[['avg_size_1d', 'avg_size_3d', 'edits_1d', 'edits_3d']].apply(
    lambda x: 0
    if (x['edits_1d'] + x['edits_3d'] == 0)
    else (((x['avg_size_1d']/1)+1)/((x['avg_size_3d']/2)+1)), 
    axis=1
)

#df_b['change_in_avg_size_v2_1_3'].describe()

In [20]:
# Talk views

df_b['total_talk_views'] = df_b.talk_views_1d + df_b.talk_views_3d + df_b.talk_views_7d + df_b.talk_views_30d

# Version 1

df_b['change_in_talk_views_v1_7_30'] = df_b.talk_views_7d/4 - df_b.talk_views_30d/23

df_b['change_in_talk_views_v1_3_7'] = df_b.talk_views_3d/2 - df_b.talk_views_7d/4 

df_b['change_in_talk_views_v1_1_3'] = df_b.talk_views_1d/1 - df_b.talk_views_3d/2

# Version 2

df_b['change_in_talk_views_v2_7_30'] = ((df_b.talk_views_7d/4)+1)/((df_b.talk_views_30d/23)+1)

df_b['change_in_talk_views_v2_3_7'] = ((df_b.talk_views_3d/2)+1)/((df_b.talk_views_7d/4)+1)

df_b['change_in_talk_views_v2_1_3'] = ((df_b.talk_views_1d/1)+1)/((df_b.talk_views_3d/2)+1)

#df_b['total_talk_views'].describe()

In [21]:
# Talk edits

df_b['total_talk_edits'] = df_b.talk_edits_1d + df_b.talk_edits_3d + df_b.talk_edits_7d + df_b.talk_edits_30d

# Version 1

df_b['change_in_talk_edits_v1_7_30'] = df_b.talk_edits_7d/4 - df_b.talk_edits_30d/23

df_b['change_in_talk_edits_v1_3_7'] = df_b.talk_edits_3d/2 - df_b.talk_edits_7d/4

df_b['change_in_talk_edits_v1_1_3'] = df_b.talk_edits_1d/1 - df_b.talk_edits_3d/2

# Version 2

df_b['change_in_talk_edits_v2_7_30'] = ((df_b.talk_edits_7d/4)+1)/((df_b.talk_edits_30d/23)+1)

df_b['change_in_talk_edits_v2_3_7'] = ((df_b.talk_edits_3d/2)+1)/((df_b.talk_edits_7d/4)+1)

df_b['change_in_talk_edits_v2_1_3'] = ((df_b.talk_edits_1d/1)+1)/((df_b.talk_edits_3d/2)+1)

#df_b['total_talk_edits'].describe()

In [22]:
# Talk minor edits

df_b['total_talk_minor_edits'] = (df_b.talk_minor_edits_1d + df_b.talk_minor_edits_3d + 
                                  df_b.talk_minor_edits_7d + df_b.talk_minor_edits_30d)


# Version 1

df_b['change_in_talk_minor_edits_v1_7_30'] = df_b.talk_minor_edits_7d/4 - df_b.talk_minor_edits_30d/23

df_b['change_in_talk_minor_edits_v1_3_7'] = df_b.talk_minor_edits_3d/2 - df_b.talk_minor_edits_7d/4

df_b['change_in_talk_minor_edits_v1_1_3'] = df_b.talk_minor_edits_1d/1 - df_b.talk_minor_edits_3d/2

# Version 2

df_b['change_in_talk_minor_edits_v2_7_30'] = ((df_b.talk_minor_edits_7d/4)+1)/((df_b.talk_minor_edits_30d/23)+1)

df_b['change_in_talk_minor_edits_v2_3_7'] = ((df_b.talk_minor_edits_3d/2)+1)/((df_b.talk_minor_edits_7d/4)+1)

df_b['change_in_talk_minor_edits_v2_1_3'] = ((df_b.talk_minor_edits_1d/1)+1)/((df_b.talk_minor_edits_3d/2)+1)

#df_b['total_talk_minor_edits'].describe()

In [23]:
# Talk average size

# Version 1

df_b['change_in_talk_avg_size_v1_7_30'] = df_b[['talk_avg_size_7d', 'talk_avg_size_30d', 
                                                'talk_edits_7d', 'talk_edits_30d']].apply(
    lambda x: 0
    if (x['talk_edits_7d'] + x['talk_edits_30d'] == 0)
    else (x['talk_avg_size_7d']/4 - x['talk_avg_size_30d']/23), 
    axis=1
)

df_b['change_in_talk_avg_size_v1_3_7'] = df_b[['talk_avg_size_3d', 'talk_avg_size_7d', 
                                               'talk_edits_3d', 'talk_edits_7d']].apply(
    lambda x: 0
    if (x['talk_edits_3d'] + x['talk_edits_7d'] == 0)
    else (x['talk_avg_size_3d']/2 - x['talk_avg_size_7d']/4), 
    axis=1
)

df_b['change_in_talk_avg_size_v1_1_3'] = df_b[['talk_avg_size_1d', 'talk_avg_size_3d', 
                                               'talk_edits_1d', 'talk_edits_3d']].apply(
    lambda x: 0
    if (x['talk_edits_1d'] + x['talk_edits_3d'] == 0)
    else (x['talk_avg_size_1d']/1 - x['talk_avg_size_3d']/2), 
    axis=1
)

# Version 2

df_b['change_in_talk_avg_size_v2_7_30'] = df_b[['talk_avg_size_7d', 'talk_avg_size_30d', 
                                                'talk_edits_7d', 'talk_edits_30d']].apply(
    lambda x: 0
    if (x['talk_edits_7d'] + x['talk_edits_30d'] == 0)
    else (((x['talk_avg_size_7d']/4)+1)/((x['talk_avg_size_30d']/23)+1)), 
    axis=1
)

df_b['change_in_talk_avg_size_v2_3_7'] = df_b[['talk_avg_size_3d', 'talk_avg_size_7d', 
                                               'talk_edits_3d', 'talk_edits_7d']].apply(
    lambda x: 0
    if (x['talk_edits_3d'] + x['talk_edits_7d'] == 0)
    else (((x['talk_avg_size_3d']/2)+1)/((x['talk_avg_size_7d']/4)+1)), 
    axis=1
)

df_b['change_in_talk_avg_size_v2_1_3'] = df_b[['talk_avg_size_1d', 'talk_avg_size_3d', 
                                               'talk_edits_1d', 'talk_edits_3d']].apply(
    lambda x: 0
    if (x['talk_edits_1d'] + x['talk_edits_3d'] == 0)
    else (((x['talk_avg_size_1d']/1)+1)/((x['talk_avg_size_3d']/2)+1)), 
    axis=1
)

#df_b['change_in_talk_avg_size_v2_1_3'].describe()

In [24]:
# Check missingness in balanced data with new features

print(df_b.isnull().sum())

article_name                              0
num_edits                                 0
views_30d                                 0
views_7d                                  0
views_3d                                  0
views_1d                                  0
edits_30d                                 0
edits_7d                                  0
edits_3d                                  0
edits_1d                                  0
minor_edits_30d                           0
minor_edits_7d                            0
minor_edits_3d                            0
minor_edits_1d                            0
avg_size_30d                            496
avg_size_7d                            7505
avg_size_3d                            9725
avg_size_1d                           10885
avg_size                                  1
latest_size                               1
talk_views_30d                            0
talk_views_7d                             0
talk_views_3d                   

In [25]:
df_b.head()

Unnamed: 0,article_name,num_edits,views_30d,views_7d,views_3d,views_1d,edits_30d,edits_7d,edits_3d,edits_1d,...,change_in_talk_minor_edits_v1_1_3,change_in_talk_minor_edits_v2_7_30,change_in_talk_minor_edits_v2_3_7,change_in_talk_minor_edits_v2_1_3,change_in_talk_avg_size_v1_7_30,change_in_talk_avg_size_v1_3_7,change_in_talk_avg_size_v1_1_3,change_in_talk_avg_size_v2_7_30,change_in_talk_avg_size_v2_3_7,change_in_talk_avg_size_v2_1_3
1485,%C3%84%C3%A4nekoski,0.0,738.0,81.0,35.0,27.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
34597,1997_Pulitzer_Prize,0.0,259.0,49.0,27.0,13.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
12739077,Bella_Hardy,1.0,334.0,44.0,15.0,17.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
47522926,Three_Hundred_Big_Boys,3.0,946.0,133.0,82.0,32.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
38096,2002_Tangerine_Bowl,0.0,113.0,20.0,6.0,10.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Write balanced data (with new features) to csv

df_b.to_csv('balanced_set2_20160501.csv', sep=',')
#df_b.head()

In [27]:
# Read in balanced data (with new features)

df_b2 = pd.read_csv('balanced_set2_20160501.csv')
df_b2 = df_b2.set_index('Unnamed: 0')
df_b2.index.name = None
# df_b.head()

In [28]:
df_b2_ri = df_b2.reset_index()
df_b2_ri = df_b2_ri.rename(columns={'index': 'original_index'})
df_b2_ri.shape

(83248, 94)

In [29]:
# Store relevant variable names (including new features) in a list

new_feature_names = [x for x in df_b2_ri.columns 
                     if 'original_index' not in x
                     and 'article_name' not in x
                     and 'num_edits' not in x
                     and 'views_30d' not in x
                     and 'views_7d' not in x
                     and 'views_3d' not in x
                     and 'views_1d' not in x
                     and 'edits_30d' not in x
                     and 'edits_7d' not in x
                     and 'edits_3d' not in x
                     and 'edits_1d' not in x
                     and 'minor_edits_30d' not in x
                     and 'minor_edits_7d' not in x
                     and 'minor_edits_3d' not in x
                     and 'minor_edits_1d' not in x
                     and 'avg_size_30d' not in x
                     and 'avg_size_7d' not in x
                     and 'avg_size_3d' not in x
                     and 'avg_size_1d' not in x
                     and 'talk_views_30d' not in x
                     and 'talk_views_7d' not in x
                     and 'talk_views_3d' not in x
                     and 'talk_views_1d' not in x
                     and 'talk_edits_30d' not in x
                     and 'talk_edits_7d' not in x
                     and 'talk_edits_3d' not in x
                     and 'talk_edits_1d' not in x
                     and 'talk_minor_edits_30d' not in x
                     and 'talk_minor_edits_7d' not in x
                     and 'talk_minor_edits_3d' not in x
                     and 'talk_minor_edits_1d' not in x
                     and 'talk_avg_size_30d' not in x
                     and 'talk_avg_size_7d' not in x
                     and 'talk_avg_size_3d' not in x
                     and 'talk_avg_size_1d' not in x
                     and 'talk_avg_size' not in x
                     and 'talk_latest_size' not in x]

print(len(new_feature_names))

label_name = "num_edits_binary"

50


In [30]:
# Drop rows with NaN's 

df_b2_ri = df_b2_ri[new_feature_names + [label_name]]
df_b2_ri = df_b2_ri.dropna() 

print(df_b2_ri.shape)

df_b2_ri.isnull().sum()

(82461, 51)


avg_size                              0
latest_size                           0
total_views                           0
change_in_views_v1_7_30               0
change_in_views_v1_3_7                0
change_in_views_v1_1_3                0
change_in_views_v2_7_30               0
change_in_views_v2_3_7                0
change_in_views_v2_1_3                0
total_edits                           0
change_in_edits_v1_7_30               0
change_in_edits_v1_3_7                0
change_in_edits_v1_1_3                0
change_in_edits_v2_7_30               0
change_in_edits_v2_3_7                0
change_in_edits_v2_1_3                0
total_minor_edits                     0
change_in_minor_edits_v1_7_30         0
change_in_minor_edits_v1_3_7          0
change_in_minor_edits_v1_1_3          0
change_in_minor_edits_v2_7_30         0
change_in_minor_edits_v2_3_7          0
change_in_minor_edits_v2_1_3          0
change_in_avg_size_v1_7_30            0
change_in_avg_size_v1_3_7             0


In [31]:
print(len(df_b2_ri[df_b2_ri.num_edits_binary == 0])) # number of unedited articles
print(len(df_b2_ri[df_b2_ri.num_edits_binary != 0])) # number of edited articles

41613
40848


In [32]:
# Convert data frame to matrix

X = df_b2_ri[new_feature_names].as_matrix()

Y = df_b2_ri[label_name].as_matrix()

In [33]:
# Correlation matrix

corr_table = df_b2_ri[[label_name] + new_feature_names].corr(method='pearson')
corr_table.style.background_gradient(cmap='RdYlGn', axis = 0)

Unnamed: 0,num_edits_binary,avg_size,latest_size,total_views,change_in_views_v1_7_30,change_in_views_v1_3_7,change_in_views_v1_1_3,change_in_views_v2_7_30,change_in_views_v2_3_7,change_in_views_v2_1_3,total_edits,change_in_edits_v1_7_30,change_in_edits_v1_3_7,change_in_edits_v1_1_3,change_in_edits_v2_7_30,change_in_edits_v2_3_7,change_in_edits_v2_1_3,total_minor_edits,change_in_minor_edits_v1_7_30,change_in_minor_edits_v1_3_7,change_in_minor_edits_v1_1_3,change_in_minor_edits_v2_7_30,change_in_minor_edits_v2_3_7,change_in_minor_edits_v2_1_3,change_in_avg_size_v1_7_30,change_in_avg_size_v1_3_7,change_in_avg_size_v1_1_3,change_in_avg_size_v2_7_30,change_in_avg_size_v2_3_7,change_in_avg_size_v2_1_3,total_talk_views,change_in_talk_views_v1_7_30,change_in_talk_views_v1_3_7,change_in_talk_views_v1_1_3,change_in_talk_views_v2_7_30,change_in_talk_views_v2_3_7,change_in_talk_views_v2_1_3,total_talk_edits,change_in_talk_edits_v1_7_30,change_in_talk_edits_v1_3_7,change_in_talk_edits_v1_1_3,change_in_talk_edits_v2_7_30,change_in_talk_edits_v2_3_7,change_in_talk_edits_v2_1_3,total_talk_minor_edits,change_in_talk_minor_edits_v1_7_30,change_in_talk_minor_edits_v1_3_7,change_in_talk_minor_edits_v1_1_3,change_in_talk_minor_edits_v2_7_30,change_in_talk_minor_edits_v2_3_7,change_in_talk_minor_edits_v2_1_3
num_edits_binary,1.0,0.0498531,0.0687159,0.13694,-0.00511299,-0.00091231,-0.0561462,0.0251104,0.0269783,0.0124445,0.140217,0.031488,0.018702,0.0293332,0.0565239,0.0625173,0.0915304,0.107453,0.0216708,0.094946,-0.0851202,0.0300587,0.159128,-0.0898985,0.0525817,0.0477144,0.055735,0.0181424,0.00515584,0.00797928,0.119438,0.00471641,-0.00477985,-0.0142876,0.0248212,-0.000205818,0.016241,0.0390362,-0.00222961,0.0145301,-0.000479904,0.00786015,0.0188221,0.0198982,0.0665643,-0.00531788,0.00609963,0.00100401,-0.00233887,0.0107541,0.010888
avg_size,0.0498531,1.0,0.767091,0.713269,-0.26693,-0.0370166,-0.320638,-0.00116404,-0.000211041,-0.00531172,0.138686,0.0146559,-0.0200003,0.0043052,0.0287583,0.0162545,0.0080156,0.152364,0.0163439,0.141268,-0.14115,0.0353214,0.145032,-0.079456,0.385616,0.589772,0.634014,0.00156094,0.00011649,5.38519e-05,0.489903,-0.0355878,0.0949435,-0.162864,0.0194963,0.000265931,-0.0115628,0.230812,0.0449215,0.0897628,-0.0044226,0.0670675,0.0551734,0.029598,0.314692,0.0192512,0.000484806,0.0245645,0.0271232,0.0332687,0.0420113
latest_size,0.0687159,0.767091,1.0,0.764012,0.0136327,-0.0670113,-0.434861,0.0099808,0.0246225,0.00370045,0.204274,0.0601617,-0.0107059,0.0322021,0.0671846,0.0384129,0.0329619,0.195103,0.0501317,0.172864,-0.170085,0.0708159,0.174723,-0.083104,0.787232,0.824766,0.745241,0.00542218,0.000445847,6.70518e-05,0.566027,0.00887418,0.112284,-0.243738,0.0630376,0.0100087,-0.0058187,0.291655,0.0447302,0.097997,-0.0753316,0.0722793,0.0473595,0.0232139,0.402397,0.0140175,-0.0149939,-0.00318905,0.0207766,0.0231614,0.0264514
total_views,0.13694,0.713269,0.764012,1.0,-0.189402,-0.152706,-0.443484,0.0182749,0.00337958,-0.0139822,0.431281,-0.0313781,-0.0630791,0.0284614,0.0689604,0.0373399,0.0330557,0.430171,-0.0357595,0.418448,-0.411901,0.0543435,0.395289,-0.171614,0.569583,0.572117,0.579081,0.00933277,0.000371726,-0.000593128,0.693022,-0.147286,0.022108,-0.168367,0.0684971,0.00165865,-0.010654,0.356031,-0.0540388,0.052755,-0.00769439,0.0594806,0.0473871,0.0368965,0.447254,-0.0130495,-0.0243829,0.0123469,0.00454636,0.0184542,0.045256
change_in_views_v1_7_30,-0.00511299,-0.26693,0.0136327,-0.189402,1.0,-0.577284,-0.190745,0.129781,-0.00198388,0.000108635,-0.194096,0.34362,0.00356661,-0.0618799,0.221598,-0.00732403,-0.00722339,-0.258141,0.305867,-0.319646,0.296271,0.226146,-0.202958,0.0608865,0.445445,-0.114703,-0.0753996,0.0231054,-1.5048e-05,-4.3102e-05,-0.236719,0.457801,-0.0757262,-0.0405512,0.240823,-0.00505766,0.000309746,-0.178416,0.173018,0.0174103,-0.0858994,0.107309,-0.00812734,-0.0322524,-0.141231,0.0812702,0.0818066,-0.0844538,0.0700878,0.0490183,-0.0572492
change_in_views_v1_3_7,-0.00091231,-0.0370166,-0.0670113,-0.152706,-0.577284,1.0,-0.152887,-0.0726629,0.144058,-0.000854332,-0.036517,-0.159235,0.212336,-0.0397934,-0.15096,0.181511,-0.00752669,-0.0369704,-0.145118,-0.00321546,0.0266204,-0.156622,-0.00411439,0.0141496,-0.378327,0.250795,-0.138811,-0.012586,0.00362261,8.99969e-07,-0.0267767,-0.164202,0.218979,-0.00774774,-0.167788,0.127738,-0.00208242,-0.0060116,-0.037158,0.000651843,0.0207452,-0.0643127,0.00855418,0.0101088,-0.0202556,-0.0301767,-0.0345655,0.0309015,-0.0300479,-0.0240565,0.0076949
change_in_views_v1_1_3,-0.0561462,-0.320638,-0.434861,-0.443484,-0.190745,-0.152887,1.0,-0.0931411,-0.112764,0.10248,-0.121775,-0.113947,-0.0875276,0.153922,-0.123289,-0.14773,0.090887,-0.118142,-0.0874293,-0.0922189,0.10049,-0.10293,-0.119923,0.0793287,-0.485047,-0.562181,-0.0285647,-0.0198117,-0.00317839,0.00272066,-0.344247,-0.0798193,-0.138726,0.235449,-0.120535,-0.0976884,0.0485646,-0.165164,-0.0409229,-0.0649141,0.0755765,-0.0774524,-0.0356813,0.0105257,-0.239594,-0.0306173,-0.0251568,0.0516708,-0.0393435,-0.0413711,0.0208262
change_in_views_v2_7_30,0.0251104,-0.00116404,0.0099808,0.0182749,0.129781,-0.0726629,-0.0931411,1.0,-0.00363427,-0.00153999,0.0363527,0.169351,-0.0316153,-0.0151659,0.243569,-0.00148842,0.00556336,0.0188003,0.158966,-0.0207717,0.0118252,0.181862,-0.0127996,0.0551495,0.0323151,0.00328501,0.00184536,0.116966,7.4807e-05,-0.000201234,0.00499422,0.0557501,-0.0148833,-0.00802203,0.161282,-0.0047008,-0.00296397,0.00495597,0.0299444,-0.00822556,-0.00715813,0.0535873,-0.0065044,-0.00415695,0.00837235,0.0257623,-0.000953214,-0.00916371,0.0281338,-0.000342168,-0.00549962
change_in_views_v2_3_7,0.0269783,-0.000211041,0.0246225,0.00337958,-0.00198388,0.144058,-0.112764,-0.00363427,1.0,-0.00511675,0.0224628,-0.00444057,0.187396,-0.0554036,-0.00357819,0.290157,-0.00432039,0.00362507,-0.0034119,0.000480829,0.0174946,-0.00327391,0.00196382,0.0871617,-0.00283356,0.0301369,0.0174161,-0.000475575,0.0151913,-0.000839547,0.00383513,-0.000500534,0.104008,-0.0292396,0.0021187,0.1609,-0.00164736,0.00430278,0.000355589,0.0201375,-0.0111149,0.00137397,0.0235122,-0.00224196,0.00355789,-0.000605626,0.00924436,-0.001421,-0.000485938,0.0104268,0.0013734
change_in_views_v2_1_3,0.0124445,-0.00531172,0.00370045,-0.0139822,0.000108635,-0.000854332,0.10248,-0.00153999,-0.00511675,1.0,0.00423676,-0.00324466,-0.00325073,0.134452,-0.0049807,-0.00538876,0.268828,-0.00272273,-2.47415e-05,-0.00724101,0.0289273,-0.000420431,-0.0114917,0.123947,-0.00567176,-0.0056088,0.0255623,-0.00129415,-0.000776367,0.0166904,-0.0146042,-0.000538902,-0.000639825,0.0463446,-0.00412001,-0.00199818,0.101419,-0.00110485,-0.000400494,-0.000582458,0.0453412,-0.00237781,-0.000900987,0.0850659,-0.00433709,0.000785754,-0.0017572,0.0158289,0.000554753,-0.00222036,0.0188519


In [34]:
# Logistic regression

np.random.seed(seed=13579)

kf = KFold(n_splits=10, random_state=13579)

train_acc = []
test_acc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]

    logit = LogisticRegression()
    logit.fit(train_x, train_y)
    
    train_acc += [accuracy_score(train_y, logit.predict(train_x))]
    test_acc += [accuracy_score(test_y, logit.predict(test_x))]

print("Train accuracy is: " + str(np.mean(train_acc)))
print("Test accuracy is: " + str(np.mean(test_acc)))

Train accuracy is: 0.690230668224
Test accuracy is: 0.690011087753


In [37]:
# Decision tree

np.random.seed(seed=13579)

kf = KFold(n_splits=10, random_state=13579)

train_acc = []
test_acc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]

    tree = DecisionTreeClassifier(random_state=13579)
    tree.fit(train_x, train_y)
    
    train_acc += [accuracy_score(train_y, tree.predict(train_x))]
    test_acc += [accuracy_score(test_y, tree.predict(test_x))]

print("Train accuracy is: " + str(np.mean(train_acc)))
print("Test accuracy is: " + str(np.mean(test_acc)))

Train accuracy is: 0.998661993774
Test accuracy is: 0.714495348135


In [49]:
# LASSO

np.random.seed(seed=13579)

kf = KFold(n_splits=10, random_state=13579)

train_acc = []
test_acc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]

    lasso = LogisticRegression(penalty = 'l1')
    lasso.fit(train_x, train_y)
    
    train_acc += [accuracy_score(train_y, lasso.predict(train_x))]
    test_acc += [accuracy_score(test_y, lasso.predict(test_x))]

print("Train accuracy is: " + str(np.mean(train_acc)))
print("Test accuracy is: " + str(np.mean(test_acc)))

Train accuracy is: 0.656198423295
Test accuracy is: 0.656395160974


In [40]:
# Random forest

np.random.seed(seed=13579)

kf = KFold(n_splits=10, random_state=13579)

train_acc = []
test_acc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]

    forest = RandomForestClassifier(random_state=13579)
    forest.fit(train_x, train_y)
    
    train_acc += [accuracy_score(train_y, forest.predict(train_x))]
    test_acc += [accuracy_score(test_y, forest.predict(test_x))]

print("Train accuracy is: " + str(np.mean(train_acc)))
print("Test accuracy is: " + str(np.mean(test_acc)))

Train accuracy is: 0.986381439865
Test accuracy is: 0.767817611067


In [41]:
# Gradient boosting

np.random.seed(seed=13579)

kf = KFold(n_splits=10, random_state=13579)

train_acc = []
test_acc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]

    boost = GradientBoostingClassifier(random_state=13579)
    boost.fit(train_x, train_y)
    
    train_acc += [accuracy_score(train_y, boost.predict(train_x))]
    test_acc += [accuracy_score(test_y, boost.predict(test_x))]

print("Train accuracy is: " + str(np.mean(train_acc)))
print("Test accuracy is: " + str(np.mean(test_acc)))

Train accuracy is: 0.786088777452
Test accuracy is: 0.782369886097


In [44]:
# Multi-layer perceptron

np.random.seed(seed=13579)

kf = KFold(n_splits=10, random_state=13579)

train_acc = []
test_acc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]

    tron = MLPClassifier(random_state=13579)
    tron.fit(train_x, train_y)
    
    train_acc += [accuracy_score(train_y, tron.predict(train_x))]
    test_acc += [accuracy_score(test_y, tron.predict(test_x))]

print("Train accuracy is: " + str(np.mean(train_acc)))
print("Test accuracy is: " + str(np.mean(test_acc)))

Train accuracy is: 0.613954941483
Test accuracy is: 0.614787117408


In [48]:
# Ensemble learner

np.random.seed(seed=13579)

kf = KFold(n_splits=10, random_state=13579)

logit = LogisticRegression()
tree = DecisionTreeClassifier(random_state=13579)
lasso = LogisticRegression(penalty = 'l1')
knn = KNeighborsClassifier()
forest = RandomForestClassifier(random_state=13579)
tron = MLPClassifier(random_state=13579)
boost = GradientBoostingClassifier(random_state=13579)

mv_model = VotingClassifier([('logit', logit), 
                             ('tree', tree), 
                             ('lasso', lasso),
                             ('knn', knn),
                             ('forest', forest),
                             ('tron', tron),
                             ('boost', boost)], voting='soft')

mv_model_train_acc = []
mv_model_test_acc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]
    
    mv_model.fit(train_x, train_y)
    
    mv_model_train_acc += [accuracy_score(train_y, mv_model.predict(train_x))]
    mv_model_test_acc += [accuracy_score(test_y, mv_model.predict(test_x))]
    
print(np.mean(mv_model_train_acc))
print(np.mean(mv_model_test_acc))

0.932332983086
0.759546987312


In [51]:
# Gradient boosting (using multiple metrics)

np.random.seed(seed=13579)

kf = KFold(n_splits=10, random_state=13579)

test_acc = []
test_precision = []
test_recall = []
test_roc_auc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]

    boost = GradientBoostingClassifier(random_state=13579)
    boost.fit(train_x, train_y)
    
    y_pred_test = boost.predict(test_x)
    
    test_acc += [accuracy_score(test_y, y_pred_test)]
    test_precision += [precision_score(test_y, y_pred_test)]
    test_recall += [recall_score(test_y, y_pred_test)]
    test_roc_auc += [roc_auc_score(test_y, y_pred_test)]

print("Test accuracy is: " + str(np.mean(test_acc)))
print("Test precision is: " + str(np.mean(test_precision)))
print("Test recall is: " + str(np.mean(test_recall)))
print("Test ROC AUC is: " + str(np.mean(test_roc_auc)))

Test accuracy is: 0.782369886097
Test precision is: 0.782130184309
Test recall is: 0.777197302401
Test ROC AUC is: 0.782309743378


In [None]:
# TBD LATER IF HAVE EXTRA TIME

# - Explore class weights
# - Explore different algorithms