In [1]:
# import necessary libraries
import numpy as np
import pandas as pd

In [None]:
# Read the customer transactions CSV from the Kaggle input directory into a DataFrame
DATA_PATH = "/kaggle/input/online-store-customer-data/online_store_customer_data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# load 1st 5 rows
df.head()

In [None]:
# load last 5 rows
df.tail()

In [None]:
# determine the shape of data
df.shape

In [None]:
# determine the size of data
df.size

In [None]:
# check column names
df.columns

In [None]:
# checking null values
df.isnull().sum()

In [None]:
# dropping null values
df.dropna(inplace=True)

In [None]:
# checking change in shape after dropping null values
df.shape

In [None]:
# checking change in size after dropping null values
df.size

In [None]:
# checking duplicate values
df.duplicated().value_counts()

In [None]:
# dropping duplicate values
df.drop_duplicates(inplace=True)

In [None]:
# checking the count after dropping duplicate values
df.duplicated().value_counts()

In [None]:
# checking the change in shape after dropping the duplicate values
df.shape

In [None]:
# checking the change in size after dropping the duplicate values
df.size

In [None]:
# determining the datatypes
df.info()

In [None]:
# changing the datatype of date from object to datetime
df['Transaction_date']=df['Transaction_date'].apply(pd.to_datetime)

In [None]:
# cast Age and Referal to integers with a single astype call
df = df.astype({'Age': int, 'Referal': int})

In [None]:
# checking the change in datatypes
df.info()

In [None]:
# checking the change in values of dataset
df.head()

In [None]:
# statistical summary of age and amount_spent
df.drop(columns=['Transaction_ID', 'Referal']).describe()

In [None]:
# checking the unique values in each column
df.nunique()

In [None]:
# correcting the index
df.reset_index(inplace=True)

In [None]:
df.drop(columns=['index', 'Transaction_ID'], inplace=True)

In [None]:
# data visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Univariate analysis for Transaction_date: 30-bin histogram drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Transaction_date", color='red', bins=30, ax=ax)
ax.set_xlabel("Transaction Date")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Transactions by Date")

In [None]:
# Univariate analysis for Age: 30-bin histogram drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Age", color='green', bins=30, ax=ax)
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Customers by Age")

In [None]:
# Univariate analysis for Marital_status: one bar per category
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x="Marital_status", ax=ax)
ax.set_xlabel("Marital Status")
ax.set_ylabel("Count")
ax.set_title("Distribution of Customers by Marital Status")

In [None]:
# Univariate analysis for State_names: one bar per state, tick labels rotated 90 degrees
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="State_names", ax=ax)
ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("State")
ax.set_ylabel("Count")
ax.set_title("Distribution of Customers by State")

In [None]:
# Univariate analysis for Gender: one bar per category
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x="Gender", ax=ax)
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
ax.set_title("Distribution of Customers by Gender")

In [None]:
# Univariate analysis for Segment: one bar per category
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x="Segment", ax=ax)
ax.set_xlabel("Segment")
ax.set_ylabel("Count")
ax.set_title("Distribution of Customers by Segment")

In [None]:
# Univariate analysis for Employees_status: one bar per category
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x="Employees_status", ax=ax)
ax.set_xlabel("Employment Status")
ax.set_ylabel("Count")
ax.set_title("Distribution of Customers by Employment Status")

In [None]:
# Univariate analysis for Payment_method: one bar per category
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x="Payment_method", ax=ax)
ax.set_xlabel("Payment Method")
ax.set_ylabel("Count")
ax.set_title("Distribution of Transactions by Payment Method")

In [None]:
# Univariate analysis for Referal: one bar per distinct value
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x="Referal", ax=ax)
ax.set_xlabel("Referral")
ax.set_ylabel("Count")
ax.set_title("Distribution of Transactions by Referral")

In [None]:
# Univariate analysis for Amount_spent: 30-bin histogram drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Amount_spent", color='violet', bins=30, ax=ax)
ax.set_xlabel("Amount Spent")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Transactions by Amount Spent")

In [None]:
# BIVARIATE ANALYSIS OF segment and marital status
# Group the data by segment, and calculate the count of each marital status group
grouped = df.groupby(['Segment'])['Marital_status'].value_counts().unstack(fill_value=0)

# Plot the stacked bar chart
grouped.plot(kind='bar', stacked=True)

# Add axis labels and title
plt.xlabel('Segment')
plt.ylabel('Count')
plt.title('Segment Distribution by Marital Status')

# Show the plot
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF segment and gender
# Group the data by segment, and calculate the count of each gender group
grouped = df.groupby(['Segment'])['Gender'].value_counts().unstack(fill_value=0)

# Plot the stacked bar chart
grouped.plot(kind='bar', color=['orange', 'green'], stacked=True)

# Add axis labels and title
plt.xlabel('Segment')
plt.ylabel('Count')
plt.title('Segment Distribution by Gender')

# Show the plot
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF segment and referal
# Group the data by segment, and calculate the count of each referal group
grouped = df.groupby(['Segment'])['Referal'].value_counts().unstack(fill_value=0)

# Plot the stacked bar chart
grouped.plot(kind='bar', color=['magenta', 'purple'], stacked=True)

# Add axis labels and title
plt.xlabel('Segment')
plt.ylabel('Count')
plt.title('Segment Distribution by Referal')

# Show the plot
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF employees status and marital status
# Group the data by employees status, and calculate the count of each marital status
grouped = df.groupby(['Employees_status'])['Marital_status'].value_counts().unstack(fill_value=0)

# Plot the stacked bar chart
grouped.plot(kind='bar', color=['red', 'yellow'], stacked=True)

# Add axis labels and title
plt.xlabel('Employees Status')
plt.ylabel('Count')
plt.title('Employees Distribution by Marital Status')

# Show the plot
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF employees status and gender
# Group the data by employees status, and calculate the count of each gender group
grouped = df.groupby(['Employees_status'])['Gender'].value_counts().unstack(fill_value=0)

# Plot the stacked bar chart
grouped.plot(kind='bar', color=['pink', 'grey'], stacked=True)

# Add axis labels and title
plt.xlabel('Employees status')
plt.ylabel('Count')
plt.title('Employees Distribution by Gender')

# Show the plot
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF employee status and referal
# Group the data by employees status, and calculate the count of each referal group
grouped = df.groupby(['Employees_status'])['Referal'].value_counts().unstack(fill_value=0)

# Plot the stacked bar chart
grouped.plot(kind='bar', color=['indigo', 'violet'], stacked=True)

# Add axis labels and title
plt.xlabel('Employees status')
plt.ylabel('Count')
plt.title('Employees Distribution by Referal')

# Show the plot
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF payment method and marital status
# Group the data by payment method, and calculate the count of each marital status
grouped = df.groupby(['Payment_method'])['Marital_status'].value_counts().unstack(fill_value=0)

# Plot the stacked bar chart
grouped.plot(kind='bar', color=['cyan', 'blue'], stacked=True)

# Add axis labels and title
plt.xlabel('Payment Method')
plt.ylabel('Count')
plt.title('Payment Distribution by Marital Status')

# Show the plot
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF payment method and gender
# Group the data by payment, and calculate the count of each gender group
grouped = df.groupby(['Payment_method'])['Gender'].value_counts().unstack(fill_value=0)

# Plot the stacked bar chart
grouped.plot(kind='bar', color=['brown', 'yellow'], stacked=True)

# Add axis labels and title
plt.xlabel('Payment Method')
plt.ylabel('Count')
plt.title('Payment method Distribution by Gender')

# Show the plot
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF payment method and referal
# Group the data by payment, and calculate the count of each referal group
grouped = df.groupby(['Payment_method'])['Referal'].value_counts().unstack(fill_value=0)

# Plot the stacked bar chart
grouped.plot(kind='bar', color=['grey', 'black'], stacked=True)

# Add axis labels and title
plt.xlabel('Payment Method')
plt.ylabel('Count')
plt.title('Payment method Distribution by Referal')

# Show the plot
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF segment and amount spent
# Total Amount_spent per Segment, drawn as a bar chart
spend_by_segment = df.groupby('Segment')['Amount_spent']
segment_amounts = spend_by_segment.sum()
segment_amounts.plot.bar(
    xlabel='Segment',
    ylabel='Amount Spent',
    title='Amount Spent by Segment',
    color='purple',
)
plt.show()

In [None]:
# BIVARIATE ANALYSIS OF state and amount spent
# Total Amount_spent per state, largest first, drawn as a bar chart
# NOTE(review): the result is stored under `segment_amounts`, which replaces the
# per-segment totals assigned to that name in the previous cell; the name is kept
# so that any existing reference keeps working.
state_totals = df.groupby('State_names')['Amount_spent'].sum()
segment_amounts = state_totals.sort_values(ascending=False)
segment_amounts.plot.bar(
    xlabel='State',
    ylabel='Amount Spent',
    figsize=(10, 6),
    title='Amount Spent by State',
    color='cyan',
)
plt.show()