# EDA on Customer Activity Data

### This notebook will focus on the Exploratory Data Analysis of the Customer Activity Dataset

## Necessary Imports

In [1]:
import pandas as pandas
import numpy as np
import yaml
from sqlalchemy import create_engine
from db_utils import RDSDatabaseConnector
from transformations import DataTransform, DataFrameInfo

## Loading in the dataset and connecting to the database

The `customer_activity_data` CSV file was previously extracted from the AWS RDS database and saved locally. It is now loaded into a pandas DataFrame so that EDA techniques can be applied.

In [2]:
def loads_credentials(credentials_path='credentials.yaml'):
    """Load database connection settings from a YAML file.

    Parameters
    ----------
    credentials_path : str or os.PathLike, optional
        Location of the YAML file. Defaults to ``'credentials.yaml'``,
        resolved against the current working directory; this is the path
        that was previously hard-coded.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` parses from the file. The result is passed
        straight to ``RDSDatabaseConnector``, which presumably expects a
        mapping of connection settings -- TODO confirm against db_utils.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    yaml.YAMLError
        If the file content is not valid YAML.
    """
    # Explicit UTF-8 so parsing does not depend on the platform's default
    # text encoding (which differs between Windows and Linux/macOS).
    with open(credentials_path, 'r', encoding='utf-8') as file:
        # safe_load only builds plain Python objects, never arbitrary ones.
        return yaml.safe_load(file)

# Read the connection settings using the default credentials file.
credentials = loads_credentials()
# Hand the loaded settings to the project's connector class. `connector` is
# reused by the following cells to extract, save and reload the dataset.
connector = RDSDatabaseConnector(credentials)

## Extract the data

In [3]:
# Pull the dataset through the connector. The result is kept as `df`, which
# the save cell further down writes out to CSV.
df = connector.extract_data()
print("Data extracted successfully")
# Plain-text preview of the first rows as a quick sanity check.
print(df.head())

Data extracted successfully
   administrative  administrative_duration  informational  \
0             0.0                      0.0              0   
1             0.0                      0.0              0   
2             2.0                     99.4              0   
3             0.0                      0.0              0   
4             0.0                      0.0              0   

   informational_duration  product_related  product_related_duration  \
0                     0.0              4.0                       0.0   
1                     0.0             26.0                     876.0   
2                     0.0             19.0                     368.0   
3                     0.0             20.0                    1432.0   
4                     0.0             33.0                     694.0   

   bounce_rates  exit_rates  page_values month operating_systems  \
0        0.2000       0.200          0.0   May           Android   
1        0.0000       0.026         

## Save the data to a CSV file

In [4]:
# Relative path of the local CSV copy; the load cell below reads it back
# through the same `file_path` variable.
file_path = 'customer_activity_data.csv'
# NOTE(review): `_save_date` reads like a typo for `_save_data`. The recorded
# output shows this call succeeding, so the method presumably exists under
# this name in db_utils -- rename it there and here together, never only here.
# The name is also underscore-prefixed (private by convention) yet is called
# from outside the class; consider exposing a public method instead.
connector._save_date(df, file_path )
print(f"Data saved to {file_path}")

Data saved to customer_activity_data.csv


## Load the CSV file from your local machine into a pandas DataFrame

In [5]:
# Read the CSV back from `file_path` (set in the save cell above). The result
# is kept under a separate name, `loaded_df`, so the extracted `df` stays
# untouched; all later steps work on `loaded_df`.
# NOTE(review): `_load_data` is underscore-prefixed (private by convention)
# but is called from outside the class.
loaded_df = connector._load_data(file_path)
print("Data loaded from CSV successfully")
# Plain-text preview to confirm the round trip through CSV.
print(loaded_df.head())

Data loaded from CSV successfully
   administrative  administrative_duration  informational  \
0             0.0                      0.0              0   
1             0.0                      0.0              0   
2             2.0                     99.4              0   
3             0.0                      0.0              0   
4             0.0                      0.0              0   

   informational_duration  product_related  product_related_duration  \
0                     0.0              4.0                       0.0   
1                     0.0             26.0                     876.0   
2                     0.0             19.0                     368.0   
3                     0.0             20.0                    1432.0   
4                     0.0             33.0                     694.0   

   bounce_rates  exit_rates  page_values month operating_systems  \
0        0.2000       0.200          0.0   May           Android   
1        0.0000       0.026   

## Initialise DataTransform and DataFrameInfo and apply transformations

In [6]:
# Wrap the reloaded frame in the project's transformer. The return value of
# _convert_to_category() is discarded, so the conversion presumably mutates
# `loaded_df` in place -- TODO confirm in transformations.py.
transformer = DataTransform(loaded_df)
transformer._convert_to_category()
print("Columns converted to categorical")

# Summary helper bound to the same `loaded_df` object passed to the transformer.
info = DataFrameInfo(loaded_df)

# Every report below follows the same order: print a label, call the helper,
# print what it returned. Keep each label ahead of its call -- any text a
# helper prints by itself would otherwise show up above its own label.
print("Describing columns:")
describe_columns = info._describe_columns()
print(describe_columns)

print("Display statistics:")
stats = info._extract_stats()
print(stats)

print("Count distinct values in categorical columns:")
distinct_values = info._count_distinct_values()
print(distinct_values)

print("Shape of the dataframe:")
shape_of_data = info._shape_of_df()
print(shape_of_data)

# NOTE(review): the label says "Percentage" while the helper is named
# `_count_nulls` -- confirm which of the two it actually returns.
print("Percentage of nulls:")
nulls = info._count_nulls()
print(nulls)




Columns converted to categorical
Describing columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   administrative            11760 non-null  category
 1   administrative_duration   11418 non-null  float64 
 2   informational             12330 non-null  category
 3   informational_duration    11994 non-null  float64 
 4   product_related           11751 non-null  category
 5   product_related_duration  12129 non-null  float64 
 6   bounce_rates              12330 non-null  float64 
 7   exit_rates                12330 non-null  float64 
 8   page_values               12330 non-null  float64 
 9   month                     12330 non-null  object  
 10  operating_systems         12319 non-null  object  
 11  browser                   12330 non-null  object  
 12  region                    12330 non-null  object 

## Nulls