Import 

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, ranksums, median_test

Define Helper functions

In [2]:
def get_day_df(df, day, days_column='days_passed'):
    """Return the rows of ``df`` observed at a given age in days.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by ``days_column``.
    day : scalar
        Value that ``days_column`` must equal for a row to be kept.
    days_column : str, default 'days_passed'
        Name of the column compared against ``day``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` where ``df[days_column] == day``; the original
        index labels are preserved.
    """
    is_requested_day = df[days_column] == day
    return df.loc[is_requested_day]

def get_treatment_control_arrays(df, value_column='score', treatment_column='isExperimental'):
    """Split one column of ``df`` into treatment and control NumPy arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``value_column`` and ``treatment_column``.
    value_column : str, default 'score'
        Column whose values are returned.
    treatment_column : str, default 'isExperimental'
        Column used as a row mask: rows where it is true form the treatment
        group, rows where its inverse (``~``) is true form the control group.
        Assumes a boolean column -- TODO confirm against the input files.

    Returns
    -------
    tuple of numpy.ndarray
        ``(treatment_array, control_array)`` in the row order of ``df``.
    """
    is_treatment = df[treatment_column]
    # Filter the rows first, then pick the value column of each group.
    treatment_values = df[is_treatment][value_column]
    control_values = df[~is_treatment][value_column]
    return np.array(treatment_values), np.array(control_values)
    

def calculate_additional_votes_and_score(df, treatment_column='isExperimental', votes_column='nVotes', comments_column='nComments'):
    """Add ``additional_votes`` and ``score`` columns to ``df`` in place.

    ``additional_votes`` is the vote count minus one for every post (all posts
    automatically start with one vote from the author), minus one more for
    experimental posts. ``score`` is ``additional_votes`` plus the number of
    comments.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Any existing ``additional_votes`` / ``score`` columns
        are overwritten, so calling the function again recomputes them from
        the vote and comment columns.
    treatment_column : str, default 'isExperimental'
        Column used as a row mask selecting the experimental posts.
        Assumes a boolean column -- TODO confirm against the input files.
    votes_column : str, default 'nVotes'
        Column holding the total number of votes per post.
    comments_column : str, default 'nComments'
        Column holding the number of comments per post.

    Returns
    -------
    None
        The result is written into ``df``.
    """
    # Every post starts with one vote from its author; do not count it.
    df['additional_votes'] = df[votes_column] - 1
    # Experimental posts have one further vote removed
    # (presumably the vote cast by the experiment itself -- confirm).
    is_experimental = df[treatment_column]
    df.loc[is_experimental, 'additional_votes'] -= 1
    # Score is the sum of additional votes and comments; honour the
    # caller-supplied comments column instead of a hard-coded name.
    df['score'] = df['additional_votes'] + df[comments_column]


Read data files

In [3]:
# Paths of the two batch files; both are read as semicolon-separated CSV.
first_batch_file = 'first_batch_wdays.csv'
second_batch_file = 'second_batch_wdays.csv'
first_df, second_df = (
    pd.read_csv(batch_path, sep=';')
    for batch_path in (first_batch_file, second_batch_file)
)

Calculate the number of additional votes

In [4]:
# Adds the 'additional_votes' and 'score' columns to each batch frame in place.
for batch_df in (first_df, second_df):
    calculate_additional_votes_and_score(batch_df)

Control variables

In [5]:
# Column whose values the statistical tests are run on.
value_column = 'additional_votes'
# Number of days after the first observation at which the tests are performed.
day = 7
# Alternative hypothesis passed to the rank-sum test.
ranksum_alternative = 'greater'
# Alternative hypothesis passed to the Kolmogorov-Smirnov test.
ks_alternative = 'two-sided'

Get data for day number seven after first observation

In [6]:
# Keep only the observations taken `day` days after the first one, per batch.
day_first_df, day_second_df = (
    get_day_df(batch_df, day) for batch_df in (first_df, second_df)
)

Get treatment and control arrays for the first and second batch

In [7]:
# Split the selected value column of each batch into (treatment, control) arrays.
first_batch_arrays = get_treatment_control_arrays(day_first_df, value_column=value_column)
second_batch_arrays = get_treatment_control_arrays(day_second_df, value_column=value_column)
treatment_array_first, control_array_first = first_batch_arrays
treatment_array_second, control_array_second = second_batch_arrays

Calculate mean number of additional votes for the 2 batches

In [8]:
# Mean of the selected value column per group, shown with one decimal place.
print('First batch mean  treatment: {:.1f} control: {:.1f}'.format(
    treatment_array_first.mean(), control_array_first.mean()))
print('Second batch mean treatment: {:.1f} control: {:.1f}'.format(
    treatment_array_second.mean(), control_array_second.mean()))

First batch mean  treatment: 79.2 control: 66.3
Second batch mean treatment: 55.6 control: 66.1


Perform Mood's median test

In [9]:
# Mood's median test: treatment vs. control within each batch.
# The fourth return value is not used and is discarded into `_`.
median_stat_first, median_p_first, median_med_first, _ = median_test(
    treatment_array_first,
    control_array_first,
)
median_stat_second, median_p_second, median_med_second, _ = median_test(
    treatment_array_second,
    control_array_second,
)
# Report the test statistic (scientific notation) and the p-value per batch.
print('First batch  median test {:.2e} p:{:.4f}'.format(
    median_stat_first, median_p_first))
print('Second batch median test {:.2e} p:{:.4f}'.format(
    median_stat_second, median_p_second))

First batch  median test 6.27e+00 p:0.0123
Second batch median test 1.65e+01 p:0.0000


Perform rank-sum test

In [10]:
# Rank-sum test: treatment vs. control within each batch, using the
# alternative hypothesis configured in `ranksum_alternative`.
ranksum_res_first, ranksum_p_first = ranksums(
    treatment_array_first,
    control_array_first,
    alternative=ranksum_alternative,
)
ranksum_res_second, ranksum_p_second = ranksums(
    treatment_array_second,
    control_array_second,
    alternative=ranksum_alternative,
)
# Report the test statistic and the p-value per batch.
print('First batch  rank-sum: {:.2f} p: {:.4f}'.format(
    ranksum_res_first, ranksum_p_first))
print('Second batch rank-sum: {:.2f} p: {:.4f}'.format(
    ranksum_res_second, ranksum_p_second))

First batch  rank-sum: -8.01 p: 1.0000
Second batch rank-sum: -13.79 p: 1.0000


Perform Kolmogorov-Smirnov test

In [11]:


# Kolmogorov-Smirnov test to check whether the treatment groups, and the
# control groups, are similarly distributed across the two batches.
ks_res_treatment, ks_p_treatment = ks_2samp(
    treatment_array_first,
    treatment_array_second,
    alternative=ks_alternative,
)
ks_res_control, ks_p_control = ks_2samp(
    control_array_first,
    control_array_second,
    alternative=ks_alternative,
)
# Report the KS statistic and the p-value for each group.
print('Treatment KS: {:.2f} p: {:.4f}'.format(ks_res_treatment, ks_p_treatment))
print('Control   KS: {:.2f} p: {:.4f}'.format(ks_res_control, ks_p_control))
    

Treatment KS: 0.04 p: 0.1387
Control   KS: 0.03 p: 0.4432
