## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()

# Get data and explore


## first_time_readers

In [2]:
# Load the first-time reader events. The file is ';'-separated and has no
# header row, so the column names are supplied here; my_date stays a string.
first_time_readers = pd.read_csv(
    'first_time_readers.csv',
    sep=';',
    header=None,
    names=['my_date', 'event_type', 'country', 'user_id', 'source', 'continent'],
)

In [3]:
# Preview the first rows to confirm the assigned column names line up with the values.
first_time_readers.head()

Unnamed: 0,my_date,event_type,country,user_id,source,continent
0,2018-01-01 00:01:01,read,country_7,2458151261,SEO,North America
1,2018-01-01 00:03:20,read,country_7,2458151262,SEO,South America
2,2018-01-01 00:04:01,read,country_7,2458151263,AdWords,Africa
3,2018-01-01 00:04:02,read,country_7,2458151264,AdWords,Europe
4,2018-01-01 00:05:03,read,country_8,2458151265,Reddit,North America


In [4]:
# Table size as (rows, columns).
first_time_readers.shape

(210023, 6)

In [5]:
# Dtypes and non-null counts per column. Note that my_date is loaded as a
# plain string (object), not as a datetime.
first_time_readers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210023 entries, 0 to 210022
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   my_date     210023 non-null  object
 1   event_type  210023 non-null  object
 2   country     210023 non-null  object
 3   user_id     210023 non-null  int64 
 4   source      210023 non-null  object
 5   continent   210023 non-null  object
dtypes: int64(1), object(5)
memory usage: 9.6+ MB


In [6]:
# Number of first-time readers per acquisition source.
first_time_readers['source'].value_counts()

Reddit     105216
AdWords     63065
SEO         41742
Name: source, dtype: int64

In [7]:
# Number of first-time readers per country, largest first.
first_time_readers['country'].value_counts()

country_7    51791
country_2    50675
country_5    40349
country_6    31156
country_8    15483
country_4    12751
country_1     5076
country_3     2742
Name: country, dtype: int64

In [8]:
# Number of first-time readers per continent, largest first.
first_time_readers['continent'].value_counts()

Asia             76092
Europe           39561
North America    37567
South America    25095
Africa           15913
Australia        15795
Name: continent, dtype: int64

## returning_readers

In [9]:
# Load the returning reader events. The file is ';'-separated and has no
# header row, so the column names are supplied here; my_date stays a string.
returning_readers = pd.read_csv(
    'returning_readers.csv',
    sep=';',
    header=None,
    names=['my_date', 'event_type', 'country', 'user_id', 'continent'],
)

In [10]:
# Preview the first rows to confirm the assigned column names line up with the values.
returning_readers.head()

Unnamed: 0,my_date,event_type,country,user_id,continent
0,2018-01-01 02:20:18,read,country_5,2458151287,North America
1,2018-01-01 02:53:39,read,country_4,2458151279,Asia
2,2018-01-01 03:34:22,read,country_4,2458151525,Africa
3,2018-01-01 03:36:18,read,country_5,2458151287,Asia
4,2018-01-01 03:40:42,read,country_5,2458151315,South America


In [11]:
# Table size as (rows, columns).
returning_readers.shape

(371854, 5)

In [12]:
# Dtypes and non-null counts per column. Note that my_date is loaded as a
# plain string (object), not as a datetime.
returning_readers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371854 entries, 0 to 371853
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   my_date     371854 non-null  object
 1   event_type  371854 non-null  object
 2   country     371854 non-null  object
 3   user_id     371854 non-null  int64 
 4   continent   371854 non-null  object
dtypes: int64(1), object(4)
memory usage: 14.2+ MB


In [13]:
# Number of returning-read events per country, largest first.
returning_readers['country'].value_counts()

country_5    109383
country_7     80276
country_2     79401
country_4     58507
country_6     32596
country_3      6386
country_8      4150
country_1      1155
Name: country, dtype: int64

In [14]:
# Number of returning-read events per continent, largest first.
returning_readers['continent'].value_counts()

Asia             118833
North America     92767
Europe            54136
South America     51706
Africa            27481
Australia         26931
Name: continent, dtype: int64

In [15]:
# Interesting: read counts per returning user are very uneven - a handful of
# users have 50+ read events, while others appear only once.
returning_readers['user_id'].value_counts()

2458174315    57
2458203397    51
2458280511    51
2458197594    51
2458284001    51
              ..
2458359466     1
2458164754     1
2458326226     1
2458358978     1
2458201594     1
Name: user_id, Length: 66231, dtype: int64

## subscribers

In [16]:
# Load the subscription events. The file is ';'-separated and has no
# header row, so the column names are supplied here; my_date stays a string.
subscribers = pd.read_csv(
    'subscribers.csv',
    sep=';',
    header=None,
    names=['my_date', 'event_type', 'user_id'],
)

In [17]:
# Preview the first rows to confirm the assigned column names line up with the values.
subscribers.head()

Unnamed: 0,my_date,event_type,user_id
0,2018-01-01 00:07:41,subscribe,2458151268
1,2018-01-01 00:07:44,subscribe,2458151267
2,2018-01-01 00:40:55,subscribe,2458151309
3,2018-01-01 01:26:26,subscribe,2458151358
4,2018-01-01 01:30:39,subscribe,2458151361


In [18]:
# Table size as (rows, columns).
subscribers.shape

(7618, 3)

In [19]:
# Dtypes and non-null counts per column. Note that my_date is loaded as a
# plain string (object), not as a datetime.
subscribers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7618 entries, 0 to 7617
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   my_date     7618 non-null   object
 1   event_type  7618 non-null   object
 2   user_id     7618 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 178.7+ KB


## purchases

In [20]:
# Load the purchase events. The file is ';'-separated and has no
# header row, so the column names are supplied here; my_date stays a string.
purchases = pd.read_csv(
    'purchases.csv',
    sep=';',
    header=None,
    names=['my_date', 'event_type', 'user_id', 'paid'],
)

In [21]:
# Preview the first rows to confirm the assigned column names line up with the values.
purchases.head()

Unnamed: 0,my_date,event_type,user_id,paid
0,2018-01-01 04:04:59,buy,2458151555,8
1,2018-01-01 09:28:00,buy,2458151933,8
2,2018-01-01 13:23:16,buy,2458152245,8
3,2018-01-01 14:20:43,buy,2458152315,8
4,2018-01-01 16:56:04,buy,2458152371,8


In [22]:
# Table size as (rows, columns).
purchases.shape

(8407, 4)

In [23]:
# Dtypes and non-null counts per column. Note that my_date is loaded as a
# plain string (object), not as a datetime.
purchases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8407 entries, 0 to 8406
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   my_date     8407 non-null   object
 1   event_type  8407 non-null   object
 2   user_id     8407 non-null   int64 
 3   paid        8407 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 262.8+ KB


In [24]:
# Summary statistics of the amount paid per purchase.
purchases['paid'].describe()

count    8407.000000
mean       23.133103
std        29.337263
min         8.000000
25%         8.000000
50%         8.000000
75%         8.000000
max        80.000000
Name: paid, dtype: float64

In [25]:
# Total revenue across all purchases.
purchases['paid'].sum()

194480

In [26]:
# Number of purchases per user, largest first.
purchases['user_id'].value_counts()

2458251270    2
2458189638    2
2458206125    2
2458194775    2
2458319117    2
             ..
2458249128    1
2458341293    1
2458304877    1
2458189747    1
2458292693    1
Name: user_id, Length: 6648, dtype: int64

In [27]:
# Price tiers: 6640 purchases at 8 (79 %) = 53,120 dollars;
#              1767 purchases at 80 (21 %) = 141,360 dollars.
purchases['paid'].value_counts()

8     6640
80    1767
Name: paid, dtype: int64