In [1]:
import pandas as pd 
import numpy as np
import scipy
import sklearn 
from sklearn import linear_model

In [2]:
# Load the CSV into the notebook-wide dataframe `df`; later cells read from it.
df = pd.read_csv('final_data_3.csv')

In [3]:
def compute_correlation(column_name, data=None, target='attendance_count'):
    '''Computes the correlation between a target column and another column.

    Parameters
    ----------
    column_name : label of the column to correlate with the target.
    data : optional dataframe to read both columns from. When omitted, the
        notebook-level ``df`` is used, which is the original behaviour.
    target : label of the reference column; defaults to 'attendance_count'.

    Returns
    -------
    The value of ``Series.corr`` between the two columns (pandas' default
    method is Pearson).
    '''
    # Fall back to the global dataframe only when no frame is supplied, so
    # existing single-argument calls keep working unchanged.
    frame = df if data is None else data
    return frame[target].corr(frame[column_name])

Calculating the correlation between the attendance count and each of the other count columns of the dataframe.

We have omitted the 'declined' column because all of its entries are 0: a constant column has zero variance, so correlation calculations with it are impossible.


In [4]:
# Correlate attendance with each count column; the unpacking order matches the
# order of the column names below.
corr_total, corr_maybe, corr_no_reply, corr_interested_count = (
    compute_correlation(name)
    for name in (
        'total_invited_count',
        'maybe_count',
        'no_reply_count',
        'interested_count',
    )
)

In [18]:
# Display labels, listed in the same order as the correlation values below.
column_title = ['Total Invited', 'Maybe', 'No Replies', 'Interested']
corr_df = (corr_total, corr_maybe, corr_no_reply, corr_interested_count)

# Pair each label with its correlation value; this mapping feeds the results dataframe.
dictionary_correlation = {
    title: value for title, value in zip(column_title, corr_df)
}

In [20]:
# The dictionary holds scalars, so an explicit one-element index is needed to
# build a single-row dataframe from it.
corr_results = pd.DataFrame(data=dictionary_correlation, index=[0])
corr_results.round(decimals=3)  # shown rounded to 3 decimal places; corr_results itself is unchanged

Unnamed: 0,Interested,Maybe,No Replies,Total Invited
0,0.768,0.768,0.676,0.866


In [23]:
# maybe_count is left out of the regressors because it turned out to be exactly
# the same as interested_count.
X = df.loc[:, ['interested_count', 'no_reply_count', 'total_invited_count']]
# Selected with a list so that y stays a one-column dataframe.
y = df.loc[:, ['attendance_count']]

In [24]:
# statsmodels is used instead of sklearn because of its regression summary output.
import statsmodels.api as sm

# Regress attendance on all three regressors in X. No constant column is added
# to X here, so the design matrix is exactly the columns of X.
model1 = sm.OLS(endog=y, exog=X)
results1 = model1.fit()
print(results1.summary())

                            OLS Regression Results                            
Dep. Variable:       attendance_count   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 3.849e+30
Date:                Mon, 12 Feb 2018   Prob (F-statistic):               0.00
Time:                        14:20:10   Log-Likelihood:                 18036.
No. Observations:                 694   AIC:                        -3.607e+04
Df Residuals:                     691   BIC:                        -3.605e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
interested_count       -2.0000   1

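These results seem a little counterintuitive: there is an inverse relationship between the number of interested individuals and the number who have clicked 'going'. Maybe this is because people who click 'interested' mostly do not actually go to the events — they are 'potentials' who ultimately do not make the decision to go. It is also a little strange that the coefficients are whole numbers; we are not sure whether this is by chance, whether it is what we should expect, or whether something weird is going on here. One explanation worth checking: an R-squared of 1.000 together with whole-number coefficients usually means the response is an exact linear combination of the regressors. If `total_invited_count` is simply the sum of the attending, maybe, interested, no-reply and declined counts (with maybe equal to interested and declined always 0), then `attendance_count = total_invited_count - 2 * interested_count - no_reply_count` holds by construction. That would reproduce the -2 coefficient on `interested_count`, and it would mean this fit is an accounting identity rather than evidence about behaviour.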

In [25]:
# Second fit: same response, but with total_invited_count dropped from the regressors.
model2 = sm.OLS(endog=y, exog=df.loc[:, ['interested_count', 'no_reply_count']])
results2 = model2.fit()
print(results2.summary())

                            OLS Regression Results                            
Dep. Variable:       attendance_count   R-squared:                       0.763
Model:                            OLS   Adj. R-squared:                  0.762
Method:                 Least Squares   F-statistic:                     1113.
Date:                Mon, 12 Feb 2018   Prob (F-statistic):          6.33e-217
Time:                        14:20:17   Log-Likelihood:                -4014.8
No. Observations:                 694   AIC:                             8034.
Df Residuals:                     692   BIC:                             8043.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
interested_count     0.3813      0.019  

When we exclude the total invited variable, we see a positive relationship between attendance and both remaining regressors, 'interested' and 'no reply' ('maybe' is not in this model because it is identical to 'interested').
This suggests that it is not the amount of people clicking interested that is driving people to these events; rather, it is,
more broadly, how many people have been invited.