In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
%matplotlib inline

In [2]:
# Load the event logging table.
# The CSV location defaults to the original local path, but it can be overridden
# with the EVENTS_LOG_CSV environment variable so the notebook is not tied to one machine.
EVENTS_LOG_CSV = os.environ.get(
    "EVENTS_LOG_CSV",
    r"E:\Data Science\data science projects\events_log.csv\events_log.csv",
)
df = pd.read_csv(EVENTS_LOG_CSV)

In [3]:
# Display the loaded table; the rendered output elides the middle rows.
df

Unnamed: 0,uuid,timestamp,session_id,group,action,checkin,page_id,n_results,result_position
0,00000736167c507e8ec225bd9e71f9e5,2.016030e+13,78245c2c3fba013a,b,searchResultPage,,cbeb66d1bc1f1bc2,5.0,
1,00000c69fe345268935463abbfa5d5b3,2.016031e+13,c559c3be98dca8a4,a,searchResultPage,,eb658e8722aad674,10.0,
2,00003bfdab715ee59077a3670331b787,2.016030e+13,760bf89817ce4b08,a,checkin,30.0,f99a9fc1f7fdd21e,,
3,0000465cd7c35ad2bdeafec953e08c1a,2.016030e+13,fb905603d31b2071,a,checkin,60.0,e5626962a6939a75,,10.0
4,000050cbb4ef5b42b16c4d2cf69e6358,2.016030e+13,c2bf5e5172a892dc,a,checkin,30.0,787dd6a4c371cbf9,,
...,...,...,...,...,...,...,...,...,...
400160,ffffa98532e154efa821aec083f92674,2.016030e+13,50d4c8dce4b959c3,b,checkin,120.0,be6ad202740eef90,,2.0
400161,ffffbe4347aa5c4facc64fa4a26e73e4,2.016031e+13,056fc33556fcd8c0,b,checkin,10.0,c32a6f39204b8c83,,1.0
400162,ffffc4bd7f8e53a1a86b38874c6728b1,2.016030e+13,f8487a76f255ee62,b,searchResultPage,,b24925ad79226f74,20.0,
400163,ffffe6cffc8d510db61c3f3dce918849,2.016031e+13,daa2d7654ca30c15,b,searchResultPage,,c5d5fe35920a8833,20.0,


### Convert the timestamp to datetime format, as per the included documentation.

In [4]:
# The raw timestamp encodes year, month, day, hour, minute and second back to back
# (YYYYMMDDhhmmss); parse it into a proper datetime column.
timestamp_format = '%Y%m%d%H%M%S'
df["timestamp"] = pd.to_datetime(df["timestamp"], format=timestamp_format)

In [5]:
# Look at the first ten rows of the table.
df.head(n=10)

Unnamed: 0,uuid,timestamp,session_id,group,action,checkin,page_id,n_results,result_position
0,00000736167c507e8ec225bd9e71f9e5,2016-03-01 10:38:42,78245c2c3fba013a,b,searchResultPage,,cbeb66d1bc1f1bc2,5.0,
1,00000c69fe345268935463abbfa5d5b3,2016-03-07 00:52:26,c559c3be98dca8a4,a,searchResultPage,,eb658e8722aad674,10.0,
2,00003bfdab715ee59077a3670331b787,2016-03-02 14:53:05,760bf89817ce4b08,a,checkin,30.0,f99a9fc1f7fdd21e,,
3,0000465cd7c35ad2bdeafec953e08c1a,2016-03-02 22:27:01,fb905603d31b2071,a,checkin,60.0,e5626962a6939a75,,10.0
4,000050cbb4ef5b42b16c4d2cf69e6358,2016-03-02 09:50:23,c2bf5e5172a892dc,a,checkin,30.0,787dd6a4c371cbf9,,
5,0000a6af2baa5af1be2431e84cb01da1,2016-03-02 11:29:45,f6840a9614c527ad,a,checkin,180.0,6fb7b9ea87012975,,
6,0000cd61e11d5371adf974703cd4f7e7,2016-03-01 05:54:57,51f4d3b6a8688e56,a,checkin,240.0,8ad97e7c85c58e80,,
7,000104fe220a5675a270420cd6d4d4ac,2016-03-02 01:28:45,485eabe5374f09e4,b,searchResultPage,,4da9a64232ce947b,15.0,
8,00012e37b74c59a8911514c894402406,2016-03-03 11:32:55,91174a537d79a0c7,a,checkin,180.0,dfdff179047ea086,,1.0
9,000145fbe6915c7c8acb3a43537ad048,2016-03-03 22:33:53,a795756dbad3f61f,b,checkin,150.0,ec0bad0075f48b43,,1.0


In [6]:
# Show the dtype pandas assigned to each column.
df.dtypes

uuid                       object
timestamp          datetime64[ns]
session_id                 object
group                      object
action                     object
checkin                   float64
page_id                    object
n_results                 float64
result_position           float64
dtype: object

In [7]:
# Sanity check before the analysis: uuid is meant to identify each row uniquely,
# so count how many rows repeat a uuid that already appeared earlier in the table.
duplicate_uuid_count = df.duplicated(subset='uuid').sum()
print('the number of repeated value in uuid is: ', duplicate_uuid_count)

the number of repeated value in uuid is:  0


In [8]:
# List the distinct group labels; only two groups are expected in the dataset.
pd.unique(df['group'])

array(['b', 'a'], dtype=object)

In [9]:
# Count the missing values in every column. The variables uuid, timestamp,
# session_id, group and action are expected to have none.
for column_name, missing_count in df.isnull().sum().items():
    print(column_name, missing_count)

uuid 0
timestamp 0
session_id 0
group 0
action 0
checkin 176341
page_id 0
n_results 263931
result_position 169683


In [10]:
# Erring on the side of caution, explicitly check that any given session belongs to a unique group.
# Reduce the table to its distinct (session_id, group) pairs, then count the sessions that
# appear in more than one pair. The result should be zero.
# The previous version subtracted the number of pairs from the number of distinct sessions,
# which yields a negative value (not a count) whenever a session spans several groups.
unique_session_group = df[['session_id', 'group']].drop_duplicates()
groups_per_session = unique_session_group['session_id'].value_counts()
k = int((groups_per_session > 1).sum())
print('number of session belong to more than one group is: ' , k)

number of session belong to more than one group is:  0
