# Extra Project: Logistic Regression

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm

## 1 Reading and merging the data

In [2]:
web = pd.read_csv('logs_web.csv')
web

Unnamed: 0,browser,datetime,ip_anonymized,language,message
0,chrome,2015-09-01 00:02:14,9631420eea21b0458d01f7cc598c3ac2,en,0
1,firefox,2015-09-01 00:04:25,08bd4c63e1abb0e624cde4ac1518de73,ru,2
2,firefox,2015-09-01 00:07:34,07678caba7be8bbadead98d5b530c5ff,de,1
3,chrome,2015-09-01 00:07:47,ba759e7216a6a73d55591aed389ce2ad,en,2
4,chrome,2015-09-01 00:08:41,db166a3780720484f9ca31b093c5b6f0,en,1
...,...,...,...,...,...
6433,firefox,2015-09-07 23:50:47,d2c77d1d3a091fb9ba3b2d32db4444d0,en,2
6434,chrome,2015-09-08 00:07:40,2c6727b05ab7e1eaa2e407ccc08fbde5,en,0
6435,chrome,2015-09-08 00:18:11,be2f0166a60ef72a41a6a406903241d2,cn,2
6436,chrome,2015-09-08 00:19:58,4b2a1f837fa81e90f7e43f82a18ad378,en,1


In [3]:
noti = pd.read_csv('logs_notifications.csv')
noti

Unnamed: 0,browser,datetime,ip_anonymized,language,message
0,firefox,2015-09-01 00:00:00,d588180c3e5c462692c24cffd961de43,ru,2
1,chrome,2015-09-01 00:00:00,330f85e852916008aa80feb339696b2e,cn,0
2,chrome,2015-09-01 00:00:00,b85871b1112cde31be2864927c035ae3,fr,0
3,chrome,2015-09-01 00:00:00,51178bbf1c09a9a709b719fe948b5bd2,en,2
4,firefox,2015-09-01 00:00:01,884eec370b6d6bdf20d625039b9a3e8b,en,1
...,...,...,...,...,...
499995,chrome,2015-09-07 23:59:47,610e9adf09f64640e5715b3cdc5dc651,fr,0
499996,chrome,2015-09-07 23:59:49,62143d060d661e4c4032572207af3007,fr,0
499997,chrome,2015-09-07 23:59:50,3c84701f20eeadf652e9cb9ee25677eb,en,1
499998,chrome,2015-09-07 23:59:53,0e27bc5703ed5d0fd73f22c790368449,en,2


#### Merging the data into one DataFrame

In [4]:
noti.rename(columns = {'datetime': 'datetime_noti'}, inplace = True) 
web.rename(columns = {'datetime': 'datetime_web'}, inplace = True)
logs = pd.merge(noti, web, on = ['browser', 'ip_anonymized', 'language','message'], how = 'left')
logs

Unnamed: 0,browser,datetime_noti,ip_anonymized,language,message,datetime_web
0,firefox,2015-09-01 00:00:00,d588180c3e5c462692c24cffd961de43,ru,2,
1,chrome,2015-09-01 00:00:00,330f85e852916008aa80feb339696b2e,cn,0,
2,chrome,2015-09-01 00:00:00,b85871b1112cde31be2864927c035ae3,fr,0,
3,chrome,2015-09-01 00:00:00,51178bbf1c09a9a709b719fe948b5bd2,en,2,
4,firefox,2015-09-01 00:00:01,884eec370b6d6bdf20d625039b9a3e8b,en,1,
...,...,...,...,...,...,...
499995,chrome,2015-09-07 23:59:47,610e9adf09f64640e5715b3cdc5dc651,fr,0,
499996,chrome,2015-09-07 23:59:49,62143d060d661e4c4032572207af3007,fr,0,
499997,chrome,2015-09-07 23:59:50,3c84701f20eeadf652e9cb9ee25677eb,en,1,
499998,chrome,2015-09-07 23:59:53,0e27bc5703ed5d0fd73f22c790368449,en,2,


#### Creating a column for whether users clicked on the notification

In [6]:
logs['clicked'] = np.where(logs['datetime_web'].isna(), 0, 1)
logs

Unnamed: 0,browser,datetime_noti,ip_anonymized,language,message,datetime_web,clicked
0,firefox,2015-09-01 00:00:00,d588180c3e5c462692c24cffd961de43,ru,2,,0
1,chrome,2015-09-01 00:00:00,330f85e852916008aa80feb339696b2e,cn,0,,0
2,chrome,2015-09-01 00:00:00,b85871b1112cde31be2864927c035ae3,fr,0,,0
3,chrome,2015-09-01 00:00:00,51178bbf1c09a9a709b719fe948b5bd2,en,2,,0
4,firefox,2015-09-01 00:00:01,884eec370b6d6bdf20d625039b9a3e8b,en,1,,0
...,...,...,...,...,...,...,...
499995,chrome,2015-09-07 23:59:47,610e9adf09f64640e5715b3cdc5dc651,fr,0,,0
499996,chrome,2015-09-07 23:59:49,62143d060d661e4c4032572207af3007,fr,0,,0
499997,chrome,2015-09-07 23:59:50,3c84701f20eeadf652e9cb9ee25677eb,en,1,,0
499998,chrome,2015-09-07 23:59:53,0e27bc5703ed5d0fd73f22c790368449,en,2,,0


## 2 Data Analysis

### 2.1 Performance of different translations

In [7]:
pd.crosstab(logs['language'], logs['clicked'])

clicked,0,1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
cn,33423,512
de,117882,2036
en,161285,2716
es,50963,0
fr,50500,823
ru,79563,297


In [8]:
y = logs[['clicked']]
X = pd.concat([pd.get_dummies(logs['browser'], drop_first=True), pd.get_dummies(logs['language'], drop_first=True), pd.get_dummies(logs['message'], drop_first=True)], axis=1) 
X = sm.add_constant(X)
model = sm.Logit(y, X).fit()
model.summary()

         Current function value: 0.065582
         Iterations: 35




0,1,2,3
Dep. Variable:,clicked,No. Observations:,500000.0
Model:,Logit,Df Residuals:,499991.0
Method:,MLE,Df Model:,8.0
Date:,"Tue, 24 Nov 2020",Pseudo R-squ.:,0.04071
Time:,17:52:04,Log-Likelihood:,-32791.0
converged:,False,LL-Null:,-34183.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.5031,0.050,-89.955,0.000,-4.601,-4.405
firefox,0.4495,0.025,17.657,0.000,0.400,0.499
de,0.1192,0.050,2.392,0.017,0.022,0.217
en,0.0952,0.049,1.961,0.050,3.73e-05,0.190
es,-28.5348,5.63e+04,-0.001,1.000,-1.1e+05,1.1e+05
fr,0.0629,0.057,1.109,0.268,-0.048,0.174
ru,-1.4121,0.073,-19.278,0.000,-1.556,-1.269
1,0.2617,0.031,8.341,0.000,0.200,0.323
2,0.1757,0.032,5.495,0.000,0.113,0.238


**In this model specification, we have problem of quasi separation, so one or more categories predict failure perfectly. This is because none of the Spanish translated messages were clicked on. We therefore need to drop this category from the model.**

In [9]:
logs_es = logs.loc[logs['language'] != 'es']
y = logs_es[['clicked']]
X = pd.concat([pd.get_dummies(logs_es['browser'], drop_first=True), pd.get_dummies(logs_es['language'], drop_first=True), pd.get_dummies(logs_es['message'], drop_first=True)], axis=1) 
X = sm.add_constant(X)
model = sm.Logit(y, X).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.073025
         Iterations 10


0,1,2,3
Dep. Variable:,clicked,No. Observations:,449037.0
Model:,Logit,Df Residuals:,449029.0
Method:,MLE,Df Model:,7.0
Date:,"Tue, 24 Nov 2020",Pseudo R-squ.:,0.02092
Time:,17:52:24,Log-Likelihood:,-32791.0
converged:,True,LL-Null:,-33492.0
Covariance Type:,nonrobust,LLR p-value:,1.967e-298

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.5031,0.050,-89.955,0.000,-4.601,-4.405
firefox,0.4495,0.025,17.657,0.000,0.400,0.499
de,0.1192,0.050,2.392,0.017,0.022,0.217
en,0.0952,0.049,1.961,0.050,3.73e-05,0.190
fr,0.0629,0.057,1.109,0.268,-0.048,0.174
ru,-1.4121,0.073,-19.278,0.000,-1.556,-1.269
1,0.2617,0.031,8.341,0.000,0.200,0.323
2,0.1757,0.032,5.495,0.000,0.113,0.238


Looking at the results for the other languages, notifications in the Russian language furthermore perform significantly worse.

### 2.2 Different Browsers

From the coefficient on firefox, we can see that the chances of a user clicking on the notification is significantly higher than for users of Chrome. Thus, it seems like we will get better results for Firefox users.


### 2.3 Performance of Different Messages

In [10]:
r = np.zeros_like(model.params)
r[1] = 1
r[7] = -1
T_test = model.t_test(r)
print(T_test)

# in the regression with dummies we test, if there is a difference from 0 of a dummy in comparison to basline
# this does not anwer if the other dummies are significantly different from each other; therefore t-test
# H0 is here, that difference between these betas equals zero and we want to reject that

                             Test for Constraints                             
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
c0             0.2737      0.041      6.695      0.000       0.194       0.354


We can see that the coefficients on message 1 and 2 are significantly different from each other. Additionally, since the coefficients on each of the individual variables are significant, they are also significant from the base caregory (message 0). Hence, message 1 shows the best performance to users clicking on the link after receiving the notification, followed by message 2 and then message 0.
If we send messages to all users, we should use message 1 for best performance.

**Note: We haven't done this in class yet - we will talk about hypothesis testing next week, so it should become more clear what I'm doing here!**

### Bonus: Number of Installations we can expect

In [11]:
round(np.sum(model.predict(X)) * 100 * 0.5)

319200

If we send notifications to all of the users with the same distribution and translations we have this far, according to this model we should expect a total of 319200 installations of the app.

Below, we run a quick simulation to see what would happen if we sent all of our users message 1.

In [12]:
X_fict = X
X_fict[1] = 1
X_fict[2] = 0
round(np.sum(model.predict(X_fict)) * 100 * 0.5)

355575

Here we could increase the number to about 356,000.

Analyzing the data on notifications sent out to users and who clicks on the link after receiving one, we have found a number of interesting results:
- The messages we have sent out to users differ in their performance. Overall, Message 1 shows the highest performance for users clicking on the link, followed by Message 2 and then Message 0.
- Some of our translations seem to be having an issue. None of our users, who received the notification in Spanish language clicked on the link to install the new app. For users who received the notification in Russian language, fewer than average clicked on the link.
- Users of Firefox were more likely to click on the link than users of Chrome, so it seems like the display mechanism in the Firebox browser is working better for us.
- From our current data, if we send notifications out to all of our users, we could expect roughly 319,000 installations. Keep in mind, however, that this number does not account for fixing the potential issues pointed out above. If we focus on sending out Message 1 for example, we could improve our numbers. A quick simulation of sending everyone message 1 estimated about 355,000 installations.