### Import libraries

In [1]:
import pandas as pd

## Load the Data

In [2]:
# Load the CSV file
df = pd.read_csv("./Data/website_wata.csv") 
df

Unnamed: 0,Page Views,Session Duration,Bounce Rate,Traffic Source,Time on Page,Previous Visits,Conversion Rate
0,5,11.051381,0.230652,Organic,3.890460,3,1.0
1,4,3.429316,0.391001,Social,8.478174,0,1.0
2,4,1.621052,0.397986,Organic,9.636170,2,1.0
3,5,3.629279,0.180458,Organic,2.071925,3,1.0
4,5,4.235843,0.291541,Paid,1.960654,5,1.0
...,...,...,...,...,...,...,...
1995,1,2.724513,0.207187,Referral,1.324206,2,1.0
1996,3,0.392856,0.095559,Organic,3.824416,1,1.0
1997,4,9.899823,0.446622,Organic,1.288675,1,1.0
1998,3,0.393319,0.278340,Paid,5.037584,2,1.0


In [3]:
df.columns

Index(['Page Views', 'Session Duration', 'Bounce Rate', 'Traffic Source',
       'Time on Page', 'Previous Visits', 'Conversion Rate'],
      dtype='object')

## Inspect the Data

In [4]:
print(df.head()) # First few rows

   Page Views  Session Duration  Bounce Rate Traffic Source  Time on Page  \
0           5         11.051381     0.230652        Organic      3.890460   
1           4          3.429316     0.391001         Social      8.478174   
2           4          1.621052     0.397986        Organic      9.636170   
3           5          3.629279     0.180458        Organic      2.071925   
4           5          4.235843     0.291541           Paid      1.960654   

   Previous Visits  Conversion Rate  
0                3              1.0  
1                0              1.0  
2                2              1.0  
3                3              1.0  
4                5              1.0  


In [5]:
print(df.info()) # Data types and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Page Views        2000 non-null   int64  
 1   Session Duration  2000 non-null   float64
 2   Bounce Rate       2000 non-null   float64
 3   Traffic Source    2000 non-null   object 
 4   Time on Page      2000 non-null   float64
 5   Previous Visits   2000 non-null   int64  
 6   Conversion Rate   2000 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 109.5+ KB
None


In [6]:
print(df.describe()) # Summary statistics

        Page Views  Session Duration  Bounce Rate  Time on Page  \
count  2000.000000       2000.000000  2000.000000   2000.000000   
mean      4.950500          3.022045     0.284767      4.027439   
std       2.183903          3.104518     0.159781      2.887422   
min       0.000000          0.003613     0.007868      0.068515   
25%       3.000000          0.815828     0.161986      1.935037   
50%       5.000000          1.993983     0.266375      3.315316   
75%       6.000000          4.197569     0.388551      5.414627   
max      14.000000         20.290516     0.844939     24.796182   

       Previous Visits  Conversion Rate  
count      2000.000000      2000.000000  
mean          1.978500         0.982065  
std           1.432852         0.065680  
min           0.000000         0.343665  
25%           1.000000         1.000000  
50%           2.000000         1.000000  
75%           3.000000         1.000000  
max           9.000000         1.000000  


## Handle Missing Values

In [7]:
print(df.isnull().sum()) # Count missing values per column

# Example: Impute missing 'Session Duration' with the median
df['Session Duration'].fillna(df['Session Duration'].median(), inplace=True)

# Example: Remove rows with missing 'Conversion Rate'
df.dropna(subset=['Conversion Rate'], inplace=True)

Page Views          0
Session Duration    0
Bounce Rate         0
Traffic Source      0
Time on Page        0
Previous Visits     0
Conversion Rate     0
dtype: int64


## Data type Conversion

In [8]:
# Example: Convert 'Page Views' to integer
df['Page Views'] = pd.to_numeric(df['Page Views'], errors='coerce').fillna(0).astype(int)

# Example: Convert 'Bounce Rate' and 'Conversion Rate' to float, after removing '%' symbol if present.
# df['Bounce Rate'] = df['Bounce Rate'].str.replace('%','', regex=False).astype(float) / 100
# df['Conversion Rate'] = df['Conversion Rate'].str.replace('%','', regex=False).astype(float) / 100

## Handle Outliers

In [9]:
# Example: Remove outliers in 'Session Duration' using IQR (Interquartile Range)
Q1 = df['Session Duration'].quantile(0.25)
Q3 = df['Session Duration'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['Session Duration'] < (Q1 - 1.5 * IQR)) | (df['Session Duration'] > (Q3 + 1.5 * IQR)))]

## Clean categorical Data

In [10]:
# Example: Remove extra spaces and convert to lowercase
df['Traffic Source'] = df['Traffic Source'].str.strip().str.lower()

# Example: Replace variations of "google" with "google"
# df['Traffic Source'] = df['Traffic Source'].replace(['google search', 'google ads'], 'google')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Traffic Source'] = df['Traffic Source'].str.strip().str.lower()


## More cleaning

In [11]:
#Example of cleaning all percentage columns.
percentage_columns = ['Bounce Rate', 'Conversion Rate']
for col in percentage_columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.replace('%','', regex=False).astype(float) / 100
    else:
        df[col] = df[col]/100

#Ensure that all numerical columns are numerical.
numerical_columns = ['Page Views', 'Session Duration', 'Time on Page', 'Previous Visits']
for col in numerical_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

#Remove all rows that contain a NaN after the numerical conversion.
df = df.dropna()

#Ensure that all numerical columns are positive.
for col in numerical_columns:
    df = df[df[col] >= 0]

#Ensure that page views and previous visits are integers.
df['Page Views'] = df['Page Views'].astype(int)
df['Previous Visits'] = df['Previous Visits'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col]/100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')


## Save the clean data

In [12]:
df.to_csv("./Data/clean_website_wata.csv", index=False)