In [1]:
# Import necessary libraries

import pandas as pd

import numpy as np

import time

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score

from sklearn.model_selection import KFold, train_test_split

from sklearn.utils import shuffle

In [2]:
# Read in data

t0 = time.time()
df = pd.read_csv('aggregate-20160501.csv')
t1 = time.time()
print(str((t1-t0)/60) + " minutes")

6.669565852483114 minutes


In [3]:
#df.head()
#df.shape
#df.columns
#set(df.num_edits)

In [4]:
# Calculate percentage of unedited articles in original imbalanced data

len(df["num_edits"][df["num_edits"] == 0])/len(df["num_edits"])*100

99.91720181519122

In [5]:
# Check missingness in original imbalanced data

print(df.isnull().sum())

article_name                   1
num_edits                      0
views_30d                      0
views_7d                       0
views_3d                       0
views_1d                       0
edits_30d                      0
edits_7d                       0
edits_3d                       0
edits_1d                       0
minor_edits_30d                0
minor_edits_7d                 0
minor_edits_3d                 0
minor_edits_1d                 0
avg_size_30d             6428583
avg_size_7d             45268487
avg_size_3d             49800824
avg_size_1d             53028939
avg_size                  209968
latest_size               209968
talk_views_30d                 0
talk_views_7d                  0
talk_views_3d                  0
talk_views_1d                  0
talk_edits_30d                 0
talk_edits_7d                  0
talk_edits_3d                  0
talk_edits_1d                  0
talk_minor_edits_30d           0
talk_minor_edits_7d            0
talk_minor

In [6]:
# Generate binary edited vs. unedited variable

df['num_edits_binary'] = df['num_edits'].apply(lambda x: int(x > 0))
#set(df['num_edits_binary'])

In [7]:
# Store relevant variable names in a list

feature_names = [x for x in df.columns if 'num_edits' not in x and 'article_name' not in x]

label_name = "num_edits_binary"

feature_names

['views_30d',
 'views_7d',
 'views_3d',
 'views_1d',
 'edits_30d',
 'edits_7d',
 'edits_3d',
 'edits_1d',
 'minor_edits_30d',
 'minor_edits_7d',
 'minor_edits_3d',
 'minor_edits_1d',
 'avg_size_30d',
 'avg_size_7d',
 'avg_size_3d',
 'avg_size_1d',
 'avg_size',
 'latest_size',
 'talk_views_30d',
 'talk_views_7d',
 'talk_views_3d',
 'talk_views_1d',
 'talk_edits_30d',
 'talk_edits_7d',
 'talk_edits_3d',
 'talk_edits_1d',
 'talk_minor_edits_30d',
 'talk_minor_edits_7d',
 'talk_minor_edits_3d',
 'talk_minor_edits_1d',
 'talk_avg_size_30d',
 'talk_avg_size_7d',
 'talk_avg_size_3d',
 'talk_avg_size_1d',
 'talk_avg_size',
 'talk_latest_size']

In [8]:
## Generate balanced sample

t0 = time.time()

np.random.seed(seed=13579)

# to be set aside as imbalanced test data
set1_idx = np.random.choice(range(len(df)), int(len(df) * .5), replace=False)
set1_X = df.loc[set1_idx, feature_names]
set1_Y = df.loc[set1_idx, label_name]

# to be used to generate balanced data for main analysis
set2_idx = list(set(range(len(df))) - set(list(set1_idx)))
set2 = df.loc[set2_idx, list(df.columns.tolist())]

edited = set2[set2.num_edits > 0.0].copy(deep=True)
#edited.shape

not_edited = set2[set2.num_edits == 0.0].copy(deep=True)

not_edited_selected = not_edited[0:edited.shape[0]]

balanced_set = pd.concat([edited, not_edited_selected])

balanced_set = shuffle(balanced_set)
#balanced_set.shape

t1 = time.time()
print(str((t1-t0)/60) + " minutes")

5.176140817006429 minutes


In [9]:
print(edited.shape)
print(balanced_set.shape)

(25949, 39)
(51898, 39)


In [14]:
# Write balanced data to csv

balanced_set.to_csv('balanced_set_20160501.csv', sep=',')
#balanced_set.head()

In [90]:
# Read in balanced data

df_b = pd.read_csv('balanced_set_20160501.csv')
df_b = df_b.set_index('Unnamed: 0')
df_b.index.name = None
#df_b.head()

In [91]:
# Calculate percentage of unedited articles in balanced data

len(df_b["num_edits_binary"][df_b["num_edits_binary"] == 0])/len(df_b["num_edits_binary"])*100

50.0

In [92]:
# Check missingness in balanced data

print(df_b.isnull().sum())

article_name                0
num_edits                   0
views_30d                   0
views_7d                    0
views_3d                    0
views_1d                    0
edits_30d                   0
edits_7d                    0
edits_3d                    0
edits_1d                    0
minor_edits_30d             0
minor_edits_7d              0
minor_edits_3d              0
minor_edits_1d              0
avg_size_30d              331
avg_size_7d              4693
avg_size_3d              6052
avg_size_1d              6771
avg_size                    0
latest_size                 0
talk_views_30d              0
talk_views_7d               0
talk_views_3d               0
talk_views_1d               0
talk_edits_30d              0
talk_edits_7d               0
talk_edits_3d               0
talk_edits_1d               0
talk_minor_edits_30d        0
talk_minor_edits_7d         0
talk_minor_edits_3d         0
talk_minor_edits_1d         0
talk_avg_size_30d       11189
talk_avg_s

In [93]:
# Generate features

# Views

df_b['total_views'] = df_b.views_1d + df_b.views_3d + df_b.views_7d + df_b.views_30d

# Version 1

df_b['change_in_views_v1_7_30'] = df_b.views_7d/4 - df_b.views_30d/23

df_b['change_in_views_v1_3_7'] = df_b.views_3d/2 - df_b.views_7d/4 

df_b['change_in_views_v1_1_3'] = df_b.views_1d/1 - df_b.views_3d/2

# Version 2

df_b['change_in_views_v2_7_30'] = ((df_b.views_7d/4)+1)/((df_b.views_30d/23)+1)

df_b['change_in_views_v2_3_7'] = ((df_b.views_3d/2)+1)/((df_b.views_7d/4)+1)

df_b['change_in_views_v2_1_3'] = ((df_b.views_1d/1)+1)/((df_b.views_3d/2)+1)

In [94]:
df_b['total_views'].describe()

count    5.189800e+04
mean     3.589764e+03
std      1.925360e+04
min      1.000000e+00
25%      8.300000e+01
50%      2.720000e+02
75%      1.185000e+03
max      1.405372e+06
Name: total_views, dtype: float64

In [95]:
# Edits

df_b['total_edits'] = df_b.edits_1d + df_b.edits_3d + df_b.edits_7d + df_b.edits_30d

# Version 1

df_b['change_in_edits_v1_7_30'] = df_b.edits_7d/4 - df_b.edits_30d/23

df_b['change_in_edits_v1_3_7'] = df_b.edits_3d/2 - df_b.edits_7d/4

df_b['change_in_edits_v1_1_3'] = df_b.edits_1d/1 - df_b.edits_3d/2

# Version 2

df_b['change_in_edits_v2_7_30'] = ((df_b.edits_7d/4)+1)/((df_b.edits_30d/23)+1)

df_b['change_in_edits_v2_3_7'] = ((df_b.edits_3d/2)+1)/((df_b.edits_7d/4)+1)

df_b['change_in_edits_v2_1_3'] = ((df_b.edits_1d/1)+1)/((df_b.edits_3d/2)+1)

In [96]:
df_b['total_edits'].describe()

count    51898.000000
mean         3.768411
std         23.809639
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max       3425.000000
Name: total_edits, dtype: float64

In [97]:
# Minor edits

df_b['total_minor_edits'] = df_b.minor_edits_1d + df_b.minor_edits_3d + df_b.minor_edits_7d + df_b.minor_edits_30d

# Version 1

df_b['change_in_minor_edits_v1_7_30'] = df_b.minor_edits_7d/4 - df_b.minor_edits_30d/23

df_b['change_in_minor_edits_v1_3_7'] = df_b.minor_edits_3d/2 - df_b.minor_edits_7d/4

df_b['change_in_minor_edits_v1_1_3'] = df_b.minor_edits_1d/1 - df_b.minor_edits_3d/2

# Version 2

df_b['change_in_minor_edits_v2_7_30'] = ((df_b.minor_edits_7d/4)+1)/((df_b.minor_edits_30d/23)+1)

df_b['change_in_minor_edits_v2_3_7'] = ((df_b.minor_edits_3d/2)+1)/((df_b.minor_edits_7d/4)+1)

df_b['change_in_minor_edits_v2_1_3'] = ((df_b.minor_edits_1d/1)+1)/((df_b.minor_edits_3d/2)+1)

In [98]:
df_b['total_minor_edits'].describe()

count    51898.000000
mean         0.965240
std          8.108387
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max       1422.000000
Name: total_minor_edits, dtype: float64

In [None]:
# Average size

# Version 1

df_b['change_in_avg_size_v1_7_30'] = df_b[['avg_size_7d', 'avg_size_30d', 'edits_7d', 'edits_30d']].apply(
    lambda x: 0
    if (x['edits_7d'] + x['edits_30d'] == 0)
    else (x['avg_size_7d']/4 - x['avg_size_30d']/23), 
    axis=1
)

df_b['change_in_avg_size_v1_3_7'] = df_b[['avg_size_3d', 'avg_size_7d', 'edits_3d', 'edits_7d']].apply(
    lambda x: 0
    if (x['edits_3d'] + x['edits_7d'] == 0)
    else (x['avg_size_3d']/2 - x['avg_size_7d']/4), 
    axis=1
)

df_b['change_in_avg_size_v1_1_3'] = df_b[['avg_size_1d', 'avg_size_3d', 'edits_1d', 'edits_3d']].apply(
    lambda x: 0
    if (x['edits_1d'] + x['edits_3d'] == 0)
    else (x['avg_size_1d']/1 - x['avg_size_3d']/2), 
    axis=1
)

# Version 2

df_b['change_in_avg_size_v2_7_30'] = df_b[['avg_size_7d', 'avg_size_30d', 'edits_7d', 'edits_30d']].apply(
    lambda x: 0
    if (x['edits_7d'] + x['edits_30d'] == 0)
    else (((x['avg_size_7d']/4)+1)/((x['avg_size_30d']/23)+1)), 
    axis=1
)

df_b['change_in_avg_size_v2_3_7'] = df_b[['avg_size_3d', 'avg_size_7d', 'edits_3d', 'edits_7d']].apply(
    lambda x: 0
    if (x['edits_3d'] + x['edits_7d'] == 0)
    else (((x['avg_size_3d']/2)+1)/((x['avg_size_7d']/4)+1)), 
    axis=1
)

df_b['change_in_avg_size_v2_1_3'] = df_b[['avg_size_1d', 'avg_size_3d', 'edits_1d', 'edits_3d']].apply(
    lambda x: 0
    if (x['edits_1d'] + x['edits_3d'] == 0)
    else (((x['avg_size_1d']/1)+1)/((x['avg_size_3d']/2)+1)), 
    axis=1
)

In [109]:
df_b['change_in_avg_size_v2_1_3'].describe()

count    51622.000000
mean         1.882334
std        217.397241
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      39456.000000
Name: change_in_avg_size_v2_1_3, dtype: float64

In [100]:
# Talk views

df_b['total_talk_views'] = df_b.talk_views_1d + df_b.talk_views_3d + df_b.talk_views_7d + df_b.talk_views_30d

# Version 1

df_b['change_in_talk_views_v1_7_30'] = df_b.talk_views_7d/4 - df_b.talk_views_30d/23

df_b['change_in_talk_views_v1_3_7'] = df_b.talk_views_3d/2 - df_b.talk_views_7d/4 

df_b['change_in_talk_views_v1_1_3'] = df_b.talk_views_1d/1 - df_b.talk_views_3d/2

# Version 2

df_b['change_in_talk_views_v2_7_30'] = ((df_b.talk_views_7d/4)+1)/((df_b.talk_views_30d/23)+1)

df_b['change_in_talk_views_v2_3_7'] = ((df_b.talk_views_3d/2)+1)/((df_b.talk_views_7d/4)+1)

df_b['change_in_talk_views_v2_1_3'] = ((df_b.talk_views_1d/1)+1)/((df_b.talk_views_3d/2)+1)

In [101]:
df_b['total_talk_views'].describe()

count    51898.000000
mean        13.475625
std         50.183817
min          0.000000
25%          3.000000
50%          7.000000
75%         13.000000
max       7599.000000
Name: total_talk_views, dtype: float64

In [102]:
# Talk edits

df_b['total_talk_edits'] = df_b.talk_edits_1d + df_b.talk_edits_3d + df_b.talk_edits_7d + df_b.talk_edits_30d

# Version 1

df_b['change_in_talk_edits_v1_7_30'] = df_b.talk_edits_7d/4 - df_b.talk_edits_30d/23

df_b['change_in_talk_edits_v1_3_7'] = df_b.talk_edits_3d/2 - df_b.talk_edits_7d/4

df_b['change_in_talk_edits_v1_1_3'] = df_b.talk_edits_1d/1 - df_b.talk_edits_3d/2

# Version 2

df_b['change_in_talk_edits_v2_7_30'] = ((df_b.talk_edits_7d/4)+1)/((df_b.talk_edits_30d/23)+1)

df_b['change_in_talk_edits_v2_3_7'] = ((df_b.talk_edits_3d/2)+1)/((df_b.talk_edits_7d/4)+1)

df_b['change_in_talk_edits_v2_1_3'] = ((df_b.talk_edits_1d/1)+1)/((df_b.talk_edits_3d/2)+1)

In [103]:
df_b['total_talk_edits'].describe()

count    51898.000000
mean         0.233593
std          5.281802
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max        760.000000
Name: total_talk_edits, dtype: float64

In [106]:
# Talk minor edits

df_b['total_talk_minor_edits'] = (df_b.talk_minor_edits_1d + df_b.talk_minor_edits_3d + 
                                  df_b.talk_minor_edits_7d + df_b.talk_minor_edits_30d)


# Version 1

df_b['change_in_talk_minor_edits_v1_7_30'] = df_b.talk_minor_edits_7d/4 - df_b.talk_minor_edits_30d/23

df_b['change_in_talk_minor_edits_v1_3_7'] = df_b.talk_minor_edits_3d/2 - df_b.talk_minor_edits_7d/4

df_b['change_in_talk_minor_edits_v1_1_3'] = df_b.talk_minor_edits_1d/1 - df_b.talk_minor_edits_3d/2

# Version 2

df_b['change_in_talk_minor_edits_v2_7_30'] = ((df_b.talk_minor_edits_7d/4)+1)/((df_b.talk_minor_edits_30d/23)+1)

df_b['change_in_talk_minor_edits_v2_3_7'] = ((df_b.talk_minor_edits_3d/2)+1)/((df_b.talk_minor_edits_7d/4)+1)

df_b['change_in_talk_minor_edits_v2_1_3'] = ((df_b.talk_minor_edits_1d/1)+1)/((df_b.talk_minor_edits_3d/2)+1)

In [107]:
df_b['total_talk_minor_edits'].describe()

count    51898.000000
mean         0.031446
std          0.426782
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         50.000000
Name: total_talk_minor_edits, dtype: float64

In [108]:
# Talk average size

# Version 1

df_b['change_in_talk_avg_size_v1_7_30'] = df_b[['talk_avg_size_7d', 'talk_avg_size_30d', 
                                                'talk_edits_7d', 'talk_edits_30d']].apply(
    lambda x: 0
    if (x['talk_edits_7d'] + x['talk_edits_30d'] == 0)
    else (x['talk_avg_size_7d']/4 - x['talk_avg_size_30d']/23), 
    axis=1
)

df_b['change_in_talk_avg_size_v1_3_7'] = df_b[['talk_avg_size_3d', 'talk_avg_size_7d', 
                                               'talk_edits_3d', 'talk_edits_7d']].apply(
    lambda x: 0
    if (x['talk_edits_3d'] + x['talk_edits_7d'] == 0)
    else (x['talk_avg_size_3d']/2 - x['talk_avg_size_7d']/4), 
    axis=1
)

df_b['change_in_talk_avg_size_v1_1_3'] = df_b[['talk_avg_size_1d', 'talk_avg_size_3d', 
                                               'talk_edits_1d', 'talk_edits_3d']].apply(
    lambda x: 0
    if (x['talk_edits_1d'] + x['talk_edits_3d'] == 0)
    else (x['talk_avg_size_1d']/1 - x['talk_avg_size_3d']/2), 
    axis=1
)

# Version 2

df_b['change_in_talk_avg_size_v2_7_30'] = df_b[['talk_avg_size_7d', 'talk_avg_size_30d', 
                                                'talk_edits_7d', 'talk_edits_30d']].apply(
    lambda x: 0
    if (x['talk_edits_7d'] + x['talk_edits_30d'] == 0)
    else (((x['talk_avg_size_7d']/4)+1)/((x['talk_avg_size_30d']/23)+1)), 
    axis=1
)

df_b['change_in_talk_avg_size_v2_3_7'] = df_b[['talk_avg_size_3d', 'talk_avg_size_7d', 
                                               'talk_edits_3d', 'talk_edits_7d']].apply(
    lambda x: 0
    if (x['talk_edits_3d'] + x['talk_edits_7d'] == 0)
    else (((x['talk_avg_size_3d']/2)+1)/((x['talk_avg_size_7d']/4)+1)), 
    axis=1
)

df_b['change_in_talk_avg_size_v2_1_3'] = df_b[['talk_avg_size_1d', 'talk_avg_size_3d', 
                                               'talk_edits_1d', 'talk_edits_3d']].apply(
    lambda x: 0
    if (x['talk_edits_1d'] + x['talk_edits_3d'] == 0)
    else (((x['talk_avg_size_1d']/1)+1)/((x['talk_avg_size_3d']/2)+1)), 
    axis=1
)

In [110]:
df_b['change_in_talk_avg_size_v2_1_3'].describe()

count    51656.000000
mean         0.017919
std          0.289071
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         25.786123
Name: change_in_talk_avg_size_v2_1_3, dtype: float64

In [111]:
# Check missingness in balanced data with new features

print(df_b.isnull().sum())

article_name                             0
num_edits                                0
views_30d                                0
views_7d                                 0
views_3d                                 0
views_1d                                 0
edits_30d                                0
edits_7d                                 0
edits_3d                                 0
edits_1d                                 0
minor_edits_30d                          0
minor_edits_7d                           0
minor_edits_3d                           0
minor_edits_1d                           0
avg_size_30d                           331
avg_size_7d                           4693
avg_size_3d                           6052
avg_size_1d                           6771
avg_size                                 0
latest_size                              0
talk_views_30d                           0
talk_views_7d                            0
talk_views_3d                            0
talk_views_

In [112]:
df_b.head()

Unnamed: 0,article_name,num_edits,views_30d,views_7d,views_3d,views_1d,edits_30d,edits_7d,edits_3d,edits_1d,...,change_in_talk_minor_edits_v1_1_3,change_in_talk_minor_edits_v2_7_30,change_in_talk_minor_edits_v2_3_7,change_in_talk_minor_edits_v2_1_3,change_in_talk_avg_size_v1_7_30,change_in_talk_avg_size_v1_3_7,change_in_talk_avg_size_v1_1_3,change_in_talk_avg_size_v2_7_30,change_in_talk_avg_size_v2_3_7,change_in_talk_avg_size_v2_1_3
24625461,Jano_Toussounian,3.0,125.0,11.0,10.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
48635,2014%E2%80%9315_Acad%C3%A9mico/83_season,0.0,67.0,13.0,6.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
19619262,Dickin_Medal,1.0,13456.0,855.0,496.0,182.0,9.0,0.0,0.0,0.0,...,0.0,1.25,0.8,1.0,8768.416149,422.5,0.0,5.86706,1.039972,0.0
383982,Arsaber,1.0,106.0,13.0,8.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
48469130,Rajkumar_Singhajit_Singh,1.0,306.0,54.0,27.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
# Write balanced data (with new features) to csv

df_b.to_csv('balanced_set2_20160501.csv', sep=',')
#df_b.head()

In [114]:
# Read in balanced data (with new features)

df_b2 = pd.read_csv('balanced_set2_20160501.csv')
df_b2 = df_b2.set_index('Unnamed: 0')
df_b2.index.name = None
# df_b.head()

In [115]:
df_b2_ri = df_b2.reset_index()
df_b2_ri = df_b2_ri.rename(columns={'index': 'original_index'})
df_b2_ri.shape

(51898, 94)

In [116]:
# Store relevant variable names (including new features) in a list

new_feature_names = [x for x in df_b2_ri.columns 
                     if 'original_index' not in x
                     and 'article_name' not in x
                     and 'num_edits' not in x
                     and 'views_30d' not in x
                     and 'views_7d' not in x
                     and 'views_3d' not in x
                     and 'views_1d' not in x
                     and 'edits_30d' not in x
                     and 'edits_7d' not in x
                     and 'edits_3d' not in x
                     and 'edits_1d' not in x
                     and 'minor_edits_30d' not in x
                     and 'minor_edits_7d' not in x
                     and 'minor_edits_3d' not in x
                     and 'minor_edits_1d' not in x
                     and 'avg_size_30d' not in x
                     and 'avg_size_7d' not in x
                     and 'avg_size_3d' not in x
                     and 'avg_size_1d' not in x
                     and 'talk_views_30d' not in x
                     and 'talk_views_7d' not in x
                     and 'talk_views_3d' not in x
                     and 'talk_views_1d' not in x
                     and 'talk_edits_30d' not in x
                     and 'talk_edits_7d' not in x
                     and 'talk_edits_3d' not in x
                     and 'talk_edits_1d' not in x
                     and 'talk_minor_edits_30d' not in x
                     and 'talk_minor_edits_7d' not in x
                     and 'talk_minor_edits_3d' not in x
                     and 'talk_minor_edits_1d' not in x
                     and 'talk_avg_size_30d' not in x
                     and 'talk_avg_size_7d' not in x
                     and 'talk_avg_size_3d' not in x
                     and 'talk_avg_size_1d' not in x
                     and 'talk_avg_size' not in x
                     and 'talk_latest_size' not in x]

print(len(new_feature_names))

label_name = "num_edits_binary"

50


In [117]:
# Drop rows with NaN's 

df_b2_ri = df_b2_ri[new_feature_names + [label_name]]
df_b2_ri = df_b2_ri.dropna() 

print(df_b2_ri.shape)

df_b2_ri.isnull().sum()

(51382, 51)


avg_size                              0
latest_size                           0
total_views                           0
change_in_views_v1_7_30               0
change_in_views_v1_3_7                0
change_in_views_v1_1_3                0
change_in_views_v2_7_30               0
change_in_views_v2_3_7                0
change_in_views_v2_1_3                0
total_edits                           0
change_in_edits_v1_7_30               0
change_in_edits_v1_3_7                0
change_in_edits_v1_1_3                0
change_in_edits_v2_7_30               0
change_in_edits_v2_3_7                0
change_in_edits_v2_1_3                0
total_minor_edits                     0
change_in_minor_edits_v1_7_30         0
change_in_minor_edits_v1_3_7          0
change_in_minor_edits_v1_1_3          0
change_in_minor_edits_v2_7_30         0
change_in_minor_edits_v2_3_7          0
change_in_minor_edits_v2_1_3          0
change_in_avg_size_v1_7_30            0
change_in_avg_size_v1_3_7             0


In [118]:
print(len(df_b2_ri[df_b2_ri.num_edits_binary == 0])) # number of unedited articles
print(len(df_b2_ri[df_b2_ri.num_edits_binary != 0])) # number of edited articles

25942
25440


In [119]:
# Convert data frame to matrix

X = df_b2_ri[new_feature_names].as_matrix()

Y = df_b2_ri[label_name].as_matrix()

In [121]:
# Logistic regression

np.random.seed(seed=13579)

kf = KFold(n_splits=10, random_state=13579)

train_acc = []
test_acc = []

for train_idx, test_idx in kf.split(X):
    train_x = X[train_idx]
    train_y = Y[train_idx]
    test_x = X[test_idx]
    test_y = Y[test_idx]

    model = LogisticRegression()
    model.fit(train_x, train_y)
    
    train_acc += [accuracy_score(train_y, model.predict(train_x))]
    test_acc += [accuracy_score(test_y, model.predict(test_x))]

print("Train accuracy is: " + str(np.mean(train_acc)))
print("Test accuracy is: " + str(np.mean(test_acc)))

Train accuracy is: 0.68930753426
Test accuracy is: 0.689540986348


In [124]:
# Correlation matrix

corr_table = df_b2_ri[[label_name] + new_feature_names].corr(method='pearson')
corr_table.style.background_gradient(cmap='RdYlGn', axis = 0)

Unnamed: 0,num_edits_binary,avg_size,latest_size,total_views,change_in_views_v1_7_30,change_in_views_v1_3_7,change_in_views_v1_1_3,change_in_views_v2_7_30,change_in_views_v2_3_7,change_in_views_v2_1_3,total_edits,change_in_edits_v1_7_30,change_in_edits_v1_3_7,change_in_edits_v1_1_3,change_in_edits_v2_7_30,change_in_edits_v2_3_7,change_in_edits_v2_1_3,total_minor_edits,change_in_minor_edits_v1_7_30,change_in_minor_edits_v1_3_7,change_in_minor_edits_v1_1_3,change_in_minor_edits_v2_7_30,change_in_minor_edits_v2_3_7,change_in_minor_edits_v2_1_3,change_in_avg_size_v1_7_30,change_in_avg_size_v1_3_7,change_in_avg_size_v1_1_3,change_in_avg_size_v2_7_30,change_in_avg_size_v2_3_7,change_in_avg_size_v2_1_3,total_talk_views,change_in_talk_views_v1_7_30,change_in_talk_views_v1_3_7,change_in_talk_views_v1_1_3,change_in_talk_views_v2_7_30,change_in_talk_views_v2_3_7,change_in_talk_views_v2_1_3,total_talk_edits,change_in_talk_edits_v1_7_30,change_in_talk_edits_v1_3_7,change_in_talk_edits_v1_1_3,change_in_talk_edits_v2_7_30,change_in_talk_edits_v2_3_7,change_in_talk_edits_v2_1_3,total_talk_minor_edits,change_in_talk_minor_edits_v1_7_30,change_in_talk_minor_edits_v1_3_7,change_in_talk_minor_edits_v1_1_3,change_in_talk_minor_edits_v2_7_30,change_in_talk_minor_edits_v2_3_7,change_in_talk_minor_edits_v2_1_3
num_edits_binary,1.0,0.0507942,0.0826828,0.145285,-0.00203237,-0.00420249,-0.0583603,0.0298807,0.0260819,0.0130398,0.131936,0.0371839,0.0174869,0.0375127,0.0623681,0.0653028,0.0890131,0.0970404,0.0269559,0.0865043,-0.078285,0.0384213,0.170451,-0.106197,0.0547463,0.0771349,0.0543555,0.175816,0.00574561,0.00848077,0.118306,0.0036297,-0.00593569,-0.0139582,0.02571,-0.000236713,0.0174245,0.0347055,-0.00439297,0.0192871,-0.00072673,0.00699159,0.0280088,0.0202235,0.0648829,-0.00966539,0.010082,0.0013324,-0.00667399,0.0133011,0.0129309
avg_size,0.0507942,1.0,0.75158,0.68124,-0.229888,-0.0812255,-0.230777,-0.00110768,1.39923e-05,-0.00558356,0.138727,0.0218929,-0.020068,0.00898631,0.0392892,0.0294112,0.00489402,0.149707,0.0180065,0.140883,-0.141702,0.0508658,0.153513,-0.0891102,0.191987,0.581759,0.626884,0.0242578,-1.6259e-05,0.000190685,0.410696,-0.0411149,0.0546427,-0.00374793,0.0287263,-0.00113175,-0.00746936,0.167706,0.0161545,0.1566,0.110719,0.0786536,0.134439,0.0592978,0.231143,0.0413042,0.0626006,0.0502,0.0541136,0.0764843,0.0736667
latest_size,0.0826828,0.75158,1.0,0.783047,0.107027,-0.223232,-0.326744,0.0200562,0.0413459,0.00637063,0.235236,0.0804817,-0.0211564,0.0399879,0.106196,0.0528728,0.0355305,0.222729,0.08289,0.198123,-0.196013,0.139708,0.21497,-0.100964,0.6301,0.65209,0.816608,0.0923436,0.000257012,0.000122533,0.478641,0.00101262,0.0162527,-0.0430659,0.10325,0.011906,-0.00294116,0.216404,-0.0299792,0.156669,0.052118,0.078986,0.121441,0.0524078,0.310003,-0.00260448,0.117242,-0.00438668,0.0112698,0.112169,0.04543
total_views,0.145285,0.68124,0.783047,1.0,-0.0811068,-0.244871,-0.396136,0.0181982,0.00509069,-0.0152318,0.437699,0.0188853,-0.100983,0.0689132,0.0979071,0.0476824,0.0300714,0.440999,-0.0220606,0.427886,-0.423224,0.0976701,0.382598,-0.195777,0.492831,0.561118,0.567219,0.121181,3.99692e-06,-0.000650688,0.679313,-0.194366,-0.0342158,-0.0356121,0.0913678,0.0027779,-0.00678231,0.341974,-0.124958,0.0910928,0.0872353,0.0597745,0.109819,0.0570574,0.415647,-0.0225978,0.0453421,0.0186545,0.00395372,0.0676561,0.0649805
change_in_views_v1_7_30,-0.00203237,-0.229888,0.107027,-0.0811068,1.0,-0.688697,-0.274223,0.10793,-0.00235642,-0.00011794,-0.167456,0.336688,0.0130206,-0.0952582,0.265261,-0.00870002,-0.00221534,-0.236546,0.371229,-0.29393,0.270453,0.313178,-0.135939,0.0491236,0.672353,-0.225024,-0.0389242,0.184333,-7.79072e-06,-7.74088e-05,-0.235898,0.500058,-0.104091,-0.100796,0.297591,-0.00663099,-0.00543234,-0.187304,0.205072,0.0385161,-0.143289,0.137809,-0.00969779,-0.0486341,-0.142087,0.085393,0.10482,-0.112725,0.0713635,0.0623054,-0.0774162
change_in_views_v1_3_7,-0.00420249,-0.0812255,-0.223232,-0.244871,-0.688697,1.0,0.00564644,-0.0743495,0.150314,-0.000295371,-0.0574378,-0.231485,0.182859,-0.0147412,-0.217425,0.146312,-0.00654898,-0.0429946,-0.224019,-0.00398018,0.0297386,-0.252565,-0.00511541,0.0304816,-0.63123,0.309169,-0.163605,-0.131869,0.00297485,9.61148e-06,-0.0668446,-0.198055,0.239192,0.0397213,-0.228484,0.14014,0.00233568,-0.0315067,-0.0547928,-0.0231714,0.0555377,-0.108186,-0.00374202,0.0210055,-0.063411,-0.048901,-0.0473748,0.0441096,-0.0532196,-0.0343934,0.0094211
change_in_views_v1_1_3,-0.0583603,-0.230777,-0.326744,-0.396136,-0.274223,0.00564644,1.0,-0.0511265,-0.137914,0.127542,-0.109169,-0.133235,-0.039107,0.157752,-0.157539,-0.11922,0.0914811,-0.109922,-0.136752,-0.0822794,0.0946344,-0.175829,-0.123234,0.0902528,-0.419584,-0.449822,-0.0193096,-0.130869,-0.00291198,0.00354948,-0.265024,-0.109827,-0.0726619,0.147085,-0.186587,-0.116759,0.0521756,-0.101498,-0.0266727,-0.0878384,0.0237488,-0.107903,-0.0689217,0.0033153,-0.151514,-0.03681,-0.0947459,0.0718768,-0.0455238,-0.0849639,0.0306136
change_in_views_v2_7_30,0.0298807,-0.00110768,0.0200562,0.0181982,0.10793,-0.0743495,-0.0511265,1.0,-0.00488592,-0.000549021,0.0405275,0.198619,-0.0340465,-0.022321,0.281191,-0.0027232,0.0118365,0.0102854,0.122895,-0.0119395,0.00391337,0.146429,-0.0159859,0.0169989,0.0562637,-0.00356684,0.00457077,0.616042,-7.18329e-07,-0.000188328,0.0071826,0.064892,-0.0207853,-0.0120207,0.192124,-0.0076795,-0.000653625,0.00602954,0.0339577,-0.00701821,-0.0141082,0.0650167,-0.00920578,-0.00679071,0.00898002,0.0173702,0.0106733,-0.0136865,0.0186631,0.0104321,-0.00763964
change_in_views_v2_3_7,0.0260819,1.39923e-05,0.0413459,0.00509069,-0.00235642,0.150314,-0.137914,-0.00488592,1.0,-0.00618163,0.018239,-0.00381774,0.166706,-0.0634389,-0.00251491,0.273609,-0.0031039,0.00318828,-0.00382526,7.07167e-05,0.0172635,-0.00375754,0.00164396,0.107769,-0.00348896,0.0587625,0.0275935,-0.00495773,0.0149504,-0.000766188,0.00516586,0.000906142,0.137555,-0.0408081,0.0057261,0.221499,-0.000808044,0.00232862,0.00172513,0.0277665,-0.0132187,0.00311479,0.0410339,-0.00263229,0.000955853,0.00093997,0.00876932,-0.00267813,0.000936405,0.00956658,-0.000379697
change_in_views_v2_1_3,0.0130398,-0.00558356,0.00637063,-0.0152318,-0.00011794,-0.000295371,0.127542,-0.000549021,-0.00618163,1.0,0.00465795,-0.00362387,-0.00271984,0.181774,-0.00570466,-0.00466446,0.271399,-0.00332873,4.23516e-05,-0.00672992,0.0230577,-0.000492698,-0.0125713,0.116844,-0.00611022,-0.0092491,0.0283281,-0.00798513,-0.000991007,0.0193366,-0.0154333,-0.00050042,-0.00082175,0.0475942,-0.00478325,-0.0038207,0.0989152,-0.00158583,0.000264535,-0.00229138,0.0437172,-0.00189752,-0.00329824,0.0772013,-0.00302477,0.00153212,-0.00225676,0.0223214,0.00135744,-0.00254837,0.0282383


In [None]:
# TBD LATER IF HAVE EXTRA TIME

# - Explore class weights
# - Explore different algorithms