# Load & Info

In [1]:
import pandas as pd
import numpy as np

# Load the raw session log and the training-user table
sessions = pd.read_csv('../data/sessions.csv')
train_user = pd.read_csv('../data/train_user.csv')

In [None]:
# Preview the first rows of the session log
print(sessions.head())

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the summary.
sessions.info()

# Count Occurrences

In [None]:
# Row count of the session log (rows with a missing 'action' are included)
total_actions = len(sessions)
print("Total Number of Actions:", total_actions)

In [None]:
# Count the non-null entries in the 'action' column. This is a single number,
# not a per-action breakdown (no grouping happens here).
action_counts = sessions["action"].count()

# Show the count
action_counts

In [None]:
# Sum of the per-action frequencies (value_counts() leaves out NaN by default)
sessions['action'].value_counts().sum()

In [None]:
# Frequencies of the 50 most common actions, most frequent first
sessions['action'].value_counts().head(50)

# Number of Actions

In [None]:
# Number of distinct action values (nunique() ignores NaN by default)
unique_actions = sessions['action'].nunique()
print("Number of Unique Action Types:", unique_actions)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Get the 50 most frequent actions
top_actions = sessions['action'].value_counts().head(50)

# Horizontal bar chart: one bar per action, bar length = number of occurrences
plt.subplots(figsize=(50, 50))
sns.barplot(y=top_actions.index, x=top_actions.values)
plt.title("Top 50 Actions in Sessions", size=15)
plt.xlabel("Count", size=12)
plt.ylabel("Action", size=12)
plt.show()

In [None]:
# Keep only the rows whose action is the single most frequent one (head(1)).
# NOTE(review): this rebinds `top_actions` from a value_counts Series to a
# one-element list -- confirm head(1) rather than head(50) is intended.
top_actions = sessions['action'].value_counts().head(1).index.tolist()
filtered_sessions = sessions[sessions['action'].isin(top_actions)]

filtered_sessions

In [None]:
# filtered_sessions.to_csv('filtered_sessions.csv',index=False)

In [None]:
# Count plot of the actions present in `filtered_sessions`.
# NOTE(review): the title says "Top 50", but `filtered_sessions` is built from
# value_counts().head(1) -- confirm which of the two is intended.
plt.subplots(figsize=(10, 8))
sns.countplot(y='action', data=filtered_sessions)
plt.title("Top 50 Most Frequent Actions", size=13)
plt.show()

# Take the most frequent action of the user

In [2]:
import pandas as pd

def _first_mode(actions):
    """Return the first mode of *actions*, or None when it has no mode.

    Series.mode() may return several values when there is a tie; the first
    one is kept. The result is empty when every value in the group is NaN.
    The mode is computed once per group instead of twice.
    """
    modes = actions.mode()
    return None if modes.empty else modes.iloc[0]


# Group by user_id and pick the most frequent action for each user
most_frequent_actions = (
    sessions.groupby('user_id')['action']
    .apply(_first_mode)
    .reset_index()
)

# Rename columns
most_frequent_actions.columns = ['user_id', 'most_frequent_action']

# Display the result
print(most_frequent_actions)

           user_id most_frequent_action
0       00023iyk9l                 show
1       0010k6l0om                 show
2       001wyh0pz8               search
3       0028jgx1x1                 show
4       002qnbzfs5                 show
...            ...                  ...
135478  zzxox7jnrx       search_results
135479  zzy7t0y9cm          personalize
135480  zzysuoqg6x               create
135481  zzywmcn0jv                 show
135482  zzzlylp57e                 show

[135483 rows x 2 columns]


In [None]:
# Count plot: number of users per "most frequent action" value.
# NOTE(review): figsize is given in inches, so (100, 100) produces a very
# large canvas relative to the size-13 title -- confirm this is intended.
plt.subplots(figsize=(100, 100))
sns.countplot(y='most_frequent_action', data=most_frequent_actions)
plt.title("Most Frequent Actions", size=13)
plt.show()

In [3]:
# Persist the per-user most-frequent-action table (index column omitted)
most_frequent_actions.to_csv('../data/most_frequent_actions.csv',index=False)

# Cleaning

In [None]:
# Shape before cleaning
print(sessions.shape)

# Drop rows with no user_id: they cannot be attributed to any user
sessions = sessions.dropna(subset = ['user_id'])

# Shape after cleaning; the difference in rows is the number dropped
print(sessions.shape)

In [None]:
# Number of distinct user ids in the session log
sessions['user_id'].nunique()

In [None]:
# Distinct device_type values present in the session log
sessions['device_type'].unique()

In [None]:
#https://stackoverflow.com/questions/34776651/concatenate-rows-of-pandas-dataframe-with-same-id

# Collapse the log to one row per user_id: every other column becomes a
# Python list holding all of that user's values for the column
session_df_concat = sessions.groupby('user_id', as_index=False).agg(lambda x: x.tolist())

print(session_df_concat.shape)

session_df_concat.head()

In [None]:
# Function to join a list of values into one comma-separated string

import re

def abcd(action):
    """
    Join one user's list of values into a single comma-separated string.

    Every element is converted with ``str``. A missing value (NaN, whose text
    form is exactly 'nan') becomes an empty string so that positions in the
    sequence are preserved. Values that merely contain the letters 'nan'
    (for example 'tenant' or 'maintenance') are kept unchanged.

    parameters: action -- iterable of values collected for one user

    returns : str -- the values joined with ',' in their original order
              ('' for an empty input)
    """
    # Compare the whole token rather than substituting 'nan' as a substring,
    # which would silently corrupt legitimate values such as 'tenant'.
    tokens = ('' if text == 'nan' else text for text in map(str, action))

    return ','.join(tokens)

In [None]:
# Flatten each user's list of actions into one comma-separated string
session_df_concat['action'] = session_df_concat['action'].apply(abcd)

session_df_concat['action'].head()

In [None]:
# Flatten each user's list of action types into one comma-separated string
session_df_concat['action_type'] = session_df_concat['action_type'].apply(abcd)

session_df_concat['action_type'].head()

In [None]:
# Flatten each user's list of action details into one comma-separated string
session_df_concat['action_detail'] = session_df_concat['action_detail'].apply(abcd)

session_df_concat['action_detail'].head()

In [None]:
# Function to join the distinct values of a list into one comma-separated string

def efgh(device):
    """
    Join the distinct values of one user's list into a comma-separated string.

    Every element is converted with ``str``. A missing value (NaN, whose text
    form is exactly 'nan') becomes an empty string. Values that merely contain
    the letters 'nan' are kept unchanged. Duplicates are removed and the
    remaining values appear in first-seen order, so the result is the same on
    every run.

    parameters: device -- iterable of values collected for one user

    returns : str -- the distinct values joined with ',' ('' for an empty
              input)
    """
    # Compare the whole token rather than substituting 'nan' as a substring,
    # which would corrupt any value that happens to contain those letters.
    tokens = ['' if text == 'nan' else text for text in map(str, device)]

    # dict.fromkeys de-duplicates while keeping insertion order. Joining a
    # set instead would make the order depend on string hashing, which is
    # randomised per interpreter run.
    return ','.join(dict.fromkeys(tokens))

In [None]:
# Reduce each user's device list to a comma-separated string of distinct devices
session_df_concat['device_type'] = session_df_concat['device_type'].apply(efgh)

session_df_concat['device_type'].head()

In [None]:
# Function to sum a list of elapsed-time values, skipping missing entries

def ijkl(time):
    """
    Add up the numeric entries of a list, ignoring anything non-numeric.

    Each element is turned into text, every occurrence of 'nan' is removed
    from that text, and the remainder is parsed with ``float``. Elements that
    fail to parse (missing values included) are skipped.

    parameters: time -- iterable of values collected for one user

    returns : the sum of the parsed values (0 when nothing could be parsed)
    """
    parsed = []

    for raw in time:
        # A NaN entry becomes '' here, which float() rejects below
        cleaned = str(raw).replace('nan', '')

        try:
            value = float(cleaned)
        except ValueError:
            continue

        parsed.append(value)

    return sum(parsed)

In [None]:
# Replace each user's list of elapsed seconds with a single summed value
session_df_concat['secs_elapsed'] = session_df_concat['secs_elapsed'].apply(ijkl)

session_df_concat['secs_elapsed'].head()

In [None]:
# Shape and preview of the per-user session table
print(session_df_concat.shape)

session_df_concat.head()

## Join train and session df

In [None]:
# Inner join: keep only the training users that also have a row in the
# per-user session table (train `id` matched against session `user_id`)
train_merge = pd.merge(
    train_user,
    session_df_concat,
    how='inner',
    left_on='id',
    right_on='user_id',
)

# Shapes of both inputs and of the joined result
print("Train  :", train_user.shape)
print("Session:", session_df_concat.shape)
print("Merge  :", train_merge.shape)

# Share of training users that have session information
print("No of users in Train Data with session info:", len(train_merge))
print(f"{len(train_merge)} / {len(train_user)} = {np.round(len(train_merge) / len(train_user), 2)}")

In [None]:
# List the merged columns, then preview the first rows
print(train_merge.columns)

train_merge.head()

In [None]:
# Display the merged table
train_merge

In [None]:
# Count the missing values in each column of the merged table
train_merge.isnull().sum()

In [None]:
# Save the merged table (index column omitted) to the current working directory.
# NOTE(review): the inputs and most_frequent_actions.csv live under ../data/ --
# confirm that writing this file next to the notebook is intended.
train_merge.to_csv('train_merge_raw.csv',index=False)