# Load & Info

In [None]:
import pandas as pd
import numpy as np

# Read the training-user table and the session log from the data folder
train_user = pd.read_csv('../data/train_user1.csv')
sessions = pd.read_csv('../data/sessions.csv')

In [None]:
# Explore the dataset: print the first rows of the sessions table
print(sessions.head())

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
sessions.info()

# Count Occurrences

In [None]:
# The row count of the sessions table is reported as the total number of actions
total_actions = sessions.shape[0]
print("Total Number of Actions:", total_actions)

In [None]:
# Count the non-null entries of the 'action' column.
# This is a single number, not a per-action breakdown (no grouping happens here).
action_counts = sessions["action"].count()

# Show the count
action_counts

In [None]:
# Add up the per-value frequencies of the 'action' column
sessions['action'].value_counts().sum()

In [None]:
# Show the first 50 entries of the per-action frequency table
sessions['action'].value_counts().head(50)

# Number of Unique Actions

In [None]:
# Count the distinct values in the 'action' column and report the number
unique_actions = sessions['action'].nunique()
print("Number of Unique Action Types:", unique_actions)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# # Get the top 15 actions
# top_actions = sessions['action'].value_counts()

# plt.subplots(figsize=(50, 50))
# sns.barplot(y=top_actions.index, x=top_actions.values)
# plt.title("Top 50 Actions in Sessions", size=15)
# plt.xlabel("Count", size=12)
# plt.ylabel("Action", size=12)
# plt.show()

In [None]:
# Label(s) of the first entry of the action frequency table, as a plain list
top_actions = (
    sessions['action']
    .value_counts()
    .head(1)
    .index
    .tolist()
)

# Keep only the session rows whose action is in that list
filtered_sessions = sessions.loc[sessions['action'].isin(top_actions)]

filtered_sessions

In [None]:
# filtered_sessions.to_csv('filtered_sessions.csv',index=False)

# Take the most frequent action of the user

In [None]:
# import pandas as pd

# # Group by user_id and calculate the mode of action for each user
# most_frequent_actions = sessions.groupby('user_id')['action'].apply(lambda x: x.mode()[0] if len(x.mode()) > 0 else None).reset_index()

# # Rename columns
# most_frequent_actions.columns = ['user_id', 'most_frequent_action']

# # Display the result
# print(most_frequent_actions)

# Take the most frequent action of the user (1)

In [None]:
import pandas as pd

def _mode_or_none(values):
    """Return the first entry of values.mode(), or None when mode() is empty."""
    modes = values.mode()  # computed once per group instead of twice
    return modes[0] if len(modes) > 0 else None


def _most_frequent_per_user(column):
    """Return a frame with user_id and the per-user mode of `column`.

    The second column is named 'most_frequent_<column>'.
    """
    per_user = sessions.groupby('user_id')[column].apply(_mode_or_none).reset_index()
    per_user.columns = ['user_id', 'most_frequent_' + column]
    return per_user


# Group by user_id and calculate the mode of each session attribute for each user
most_frequent_action = _most_frequent_per_user('action')
most_frequent_action_type = _most_frequent_per_user('action_type')
most_frequent_action_detail = _most_frequent_per_user('action_detail')
most_frequent_device_type = _most_frequent_per_user('device_type')
most_frequent_secs_elapsed = _most_frequent_per_user('secs_elapsed')

# Chain inner joins on user_id so all five attributes end up in one table
data1 = most_frequent_action.merge(most_frequent_action_type, left_on='user_id', right_on='user_id', how='inner')
data2 = data1.merge(most_frequent_action_detail, left_on='user_id', right_on='user_id', how='inner')
data3 = data2.merge(most_frequent_device_type, left_on='user_id', right_on='user_id', how='inner')
most_frequent_actions = data3.merge(most_frequent_secs_elapsed, left_on='user_id', right_on='user_id', how='inner')

# Display the result
print(most_frequent_actions)

In [None]:
# Checking null values: number of missing entries in each column of most_frequent_actions
most_frequent_actions.isnull().sum()

In [None]:
# Most common value of each per-user column, used as the fill value below
mode_value1 = most_frequent_actions['most_frequent_action'].mode()[0]
mode_value2 = most_frequent_actions['most_frequent_action_type'].mode()[0]
mode_value3 = most_frequent_actions['most_frequent_action_detail'].mode()[0]
mode_value4 = most_frequent_actions['most_frequent_secs_elapsed'].mode()[0]

# Impute missing values with the mode.
# The filled column is assigned back instead of calling
# most_frequent_actions[col].fillna(..., inplace=True): that chained in-place
# form acts on an intermediate object, is deprecated in pandas 2.x and does
# not update the DataFrame when Copy-on-Write is enabled.
most_frequent_actions['most_frequent_action'] = most_frequent_actions['most_frequent_action'].fillna(mode_value1)
most_frequent_actions['most_frequent_action_type'] = most_frequent_actions['most_frequent_action_type'].fillna(mode_value2)
most_frequent_actions['most_frequent_action_detail'] = most_frequent_actions['most_frequent_action_detail'].fillna(mode_value3)
most_frequent_actions['most_frequent_secs_elapsed'] = most_frequent_actions['most_frequent_secs_elapsed'].fillna(mode_value4)
# NOTE(review): most_frequent_device_type is not imputed here - confirm it has no missing values.

In [None]:
# # Assuming most_frequent_actions is the DataFrame containing user_id and most_frequent_action
# # Calculate the frequency of each action type
# action_frequency = most_frequent_actions['most_frequent_action'].value_counts()

# # Sort actions by frequency in descending order
# sorted_actions = action_frequency.index

# plt.subplots(figsize=(100, 100))
# sns.countplot(y='most_frequent_action', data=most_frequent_actions, order=sorted_actions)
# plt.title("Most Frequent Actions", size=13)
# plt.show()

In [None]:
# Save the per-user feature table to CSV, leaving out the index column
most_frequent_actions.to_csv('../data/most_frequent_actions.csv',index=False)

# Cleaning

In [None]:
# Shape before removing rows that have no user_id
print(sessions.shape)

has_user_id = sessions['user_id'].notna()
sessions = sessions.loc[has_user_id]

# Shape after the removal
print(sessions.shape)

In [None]:
# Number of distinct user_id values in the sessions table
sessions['user_id'].nunique()

In [None]:
# List the distinct values of the device_type column
sessions['device_type'].unique()

In [None]:
#https://stackoverflow.com/questions/34776651/concatenate-rows-of-pandas-dataframe-with-same-id

# One row per user_id; every other cell holds the list of that user's values for the column
grouped_by_user = sessions.groupby('user_id', as_index=False)
session_df_concat = grouped_by_user.agg(lambda column: column.tolist())

print(session_df_concat.shape)

session_df_concat.head()

In [None]:
# Function to join a list of values into one comma-separated string

import re

def abcd(action):
    """
    Join the values of a list into one comma-separated string.

    Each value is converted with str(). A value whose text is exactly 'nan'
    (the string form of a float NaN) becomes an empty string, so it leaves an
    empty slot between commas. Only whole values are blanked: text that merely
    contains the letters "nan" (for example 'finance') is kept intact.

    parameters: action -- iterable of values to join

    returns : str -- the values joined by ',' in their original order

    """
    texts = [str(value) for value in action]

    # Blank out missing values only; a substring replace would corrupt
    # legitimate values such as 'finance' -> 'fice'.
    return ','.join('' if text == 'nan' else text for text in texts)

In [None]:
# Turn each user's list of actions into one comma-separated string
session_df_concat['action'] = session_df_concat['action'].map(abcd)

session_df_concat['action'].head()

In [None]:
# Turn each user's list of action types into one comma-separated string
session_df_concat['action_type'] = session_df_concat['action_type'].map(abcd)

session_df_concat['action_type'].head()

In [None]:
# Turn each user's list of action details into one comma-separated string
session_df_concat['action_detail'] = session_df_concat['action_detail'].map(abcd)

session_df_concat['action_detail'].head()

In [None]:
# Function to join the distinct values of a list into one comma-separated string

def efgh(device):
    """
    Join the distinct values of a list into one comma-separated string.

    Each value is converted with str(). A value whose text is exactly 'nan'
    (the string form of a float NaN) becomes an empty string; text that merely
    contains the letters "nan" is kept intact. Repeated values are kept once,
    in the order they first appear, so the result is the same on every run.

    parameters: device -- iterable of values to join

    returns : str -- the distinct values joined by ','

    """
    texts = [str(value) for value in device]

    # Blank out missing values only; a substring replace would corrupt
    # legitimate values that happen to contain "nan".
    blanked = ['' if text == 'nan' else text for text in texts]

    # dict.fromkeys drops repeats while keeping first-seen order; joining a
    # set gives an order that can differ from one run to the next.
    return ','.join(dict.fromkeys(blanked))

In [None]:
# Reduce each user's list of device types to a comma-separated string of distinct values
session_df_concat['device_type'] = session_df_concat['device_type'].map(efgh)

session_df_concat['device_type'].head()

In [None]:
# Function to add up the numeric values of a list

def ijkl(time):
    """
    Add up the entries of a list that can be read as numbers.

    Each entry is converted to text, every occurrence of 'nan' is removed
    from that text, and the remainder is parsed with float(). Entries that
    fail to parse (including missing values, which become an empty string)
    are left out of the total.

    parameters: time -- iterable of values to add up

    returns : sum of the parsed values (0 when nothing could be parsed)

    """

    def _parse(entry):
        # Same clean-up as the text helpers: strip 'nan', then try to parse
        stripped = re.sub('nan', '', str(entry))
        try:
            return float(stripped)
        except ValueError:
            return None

    parsed = (_parse(entry) for entry in time)

    return sum(value for value in parsed if value is not None)

In [None]:
# Replace each user's list of secs_elapsed values with their total
session_df_concat['secs_elapsed'] = session_df_concat['secs_elapsed'].map(ijkl)

session_df_concat['secs_elapsed'].head()

In [None]:
# Size of the per-user table after the list columns were converted
print(session_df_concat.shape)

# Preview the first rows
session_df_concat.head()

## Join train and session df

In [None]:
# Inner join: keep only training users (id) that also appear in the per-user session features (user_id)
train_merge = pd.merge(train_user, most_frequent_actions, left_on='id', right_on='user_id', how='inner')

# Shapes of both inputs and of the joined table
print("Train  :", train_user.shape)
print("Session:", most_frequent_actions.shape)
print("Merge  :", train_merge.shape)

# Matched row count, then matched / total rounded to two decimals
print("No of users in Train Data with session info:", len(train_merge))
print("{} / {} = {}".format(len(train_merge), len(train_user), np.round(len(train_merge) / len(train_user), 2)))

In [None]:
# Column names of the merged table
print(train_merge.columns)

# Preview the first rows
train_merge.head()

In [None]:
# Display the merged table
train_merge

In [None]:
# Checking null values: number of missing entries in each column of train_merge
train_merge.isnull().sum()

## Delete Missing Values

In [None]:
# Remove rows whose most_frequent_action is missing (train_merge is modified in place)
train_merge.dropna(subset=['most_frequent_action'], inplace=True)

In [None]:
# Checking null values again after the rows were dropped
train_merge.isnull().sum()

## Drop columns

In [None]:
# Remove the two identifier columns (the join keys) from the merged table
train_merge = train_merge.drop(['id', 'user_id'], axis=1)

In [None]:
# Display the table after the identifier columns were removed
train_merge

In [None]:
# train_merge = train_merge.drop(['date_account_created','timestamp_first_active','date_first_booking'], axis=1)

In [None]:
# Save the merged training table to CSV, leaving out the index column
train_merge.to_csv('../data/train_merge.csv',index=False)