In [None]:
import pandas as pd

# Read the Google Analytics sample export into a DataFrame.
# NOTE(review): absolute, user-specific path — it only resolves on the original author's machine.
# NOTE(review): the file name may be missing a separator after "mini_proj" — confirm.
file_path = r'C:\Users\AKHILA\OneDrive\Desktop\mini_projsample_google_analytics_data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Function for noise reduction
def reduce_noise(data, pageviews_threshold=20, sessions_threshold=15, users_threshold=10):
    """
    Drop low-traffic rows from a web analytics table and rank what remains.

    A row survives only when all three of its metrics reach their minimum
    (the comparisons are inclusive). The input frame is not modified.

    Parameters:
        data (pd.DataFrame): Table with 'pageviews', 'sessions' and 'users' columns.
        pageviews_threshold (int): Smallest 'pageviews' value that is kept.
        sessions_threshold (int): Smallest 'sessions' value that is kept.
        users_threshold (int): Smallest 'users' value that is kept.

    Returns:
        pd.DataFrame: The surviving rows, ordered by 'pageviews' from highest
        to lowest, with a fresh 0..n-1 index.
    """
    # One boolean mask per metric; a row must satisfy all of them
    enough_pageviews = data['pageviews'] >= pageviews_threshold
    enough_sessions = data['sessions'] >= sessions_threshold
    enough_users = data['users'] >= users_threshold
    kept_rows = data[enough_pageviews & enough_sessions & enough_users]

    # Busiest pages first, then discard the original row labels
    ranked_rows = kept_rows.sort_values(by='pageviews', ascending=False)
    return ranked_rows.reset_index(drop=True)

# Run the threshold filter with its default minimums
cleaned_data = reduce_noise(data)

# Show a heading followed by the filtered table
print("Cleaned Data:", cleaned_data, sep="\n")


  from pandas.core import (


Cleaned Data:
    pagePath previousPagePath  pageviews  sessions  users
0      /home        /products        120        90     85
1      /home                /        100        80     70
2  /products           /about         80        70     65
3     /about                /         60        50     45
4     /about            /home         50        40     35
5  /products            /home         40        35     30
6   /contact            /home         30        25     20


In [3]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

# Read the Google Analytics sample export into a DataFrame.
# NOTE(review): absolute, user-specific path — it only resolves on the original author's machine.
file_path = r'C:\Users\ssp1_\Downloads\sample_google_analytics_data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Function for outlier detection using Isolation Forest
def detect_outliers(data, contamination=0.1):
    """
    Removes rows flagged as outliers by an Isolation Forest.

    The model is fitted on the 'pageviews', 'sessions' and 'users' columns.
    The input DataFrame is left unmodified: the per-row labels are kept in a
    local array instead of being written into a temporary 'outlier' column on
    the caller's frame.

    Parameters:
        data (pd.DataFrame): The input dataset; must contain the 'pageviews',
            'sessions' and 'users' columns.
        contamination (float): Proportion of outliers in the data, passed
            straight to IsolationForest.

    Returns:
        pd.DataFrame: A new frame holding only the rows labelled as inliers,
        with the same columns as the input and a fresh 0..n-1 index.
    """
    # Selecting numerical columns
    numerical_data = data[['pageviews', 'sessions', 'users']]

    # Fixed random_state keeps the fitted forest reproducible between runs
    model = IsolationForest(contamination=contamination, random_state=42)

    # fit_predict labels each row: 1 = inlier, -1 = outlier
    labels = model.fit_predict(numerical_data)

    # Boolean-mask selection builds a new frame, so `data` is never touched
    inlier_mask = labels == 1
    filtered_data = data.loc[inlier_mask].reset_index(drop=True)
    return filtered_data

# Function for clustering using KMeans
def cluster_data(data, n_clusters=2):
    """
    Groups similar rows using KMeans clustering.

    The model is fitted on the 'pageviews', 'sessions' and 'users' columns.
    The input DataFrame is left unmodified: the labels are attached to a copy
    via DataFrame.assign, so the caller's frame does not silently gain a
    'cluster' column and no SettingWithCopyWarning is raised when a slice of
    another frame is passed in.

    Parameters:
        data (pd.DataFrame): The input dataset; must contain the 'pageviews',
            'sessions' and 'users' columns.
        n_clusters (int): Number of clusters.

    Returns:
        pd.DataFrame: A new frame equal to the input plus a 'cluster' column
        holding the cluster label assigned to each row.
    """
    # Selecting numerical columns
    numerical_data = data[['pageviews', 'sessions', 'users']]

    # Fixed random_state keeps the centroid initialisation reproducible
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(numerical_data)

    # assign() returns a new DataFrame; labels are matched to rows by position
    return data.assign(cluster=cluster_labels)

# Stage 1: drop the rows that the Isolation Forest labels as outliers
data_no_outliers = detect_outliers(data)

# Stage 2: assign a KMeans cluster label to every remaining row
clustered_data = cluster_data(data_no_outliers)

# Show a heading followed by the clustered table
print("Final Processed Data:", clustered_data, sep="\n")


Final Processed Data:
    pagePath previousPagePath  pageviews  sessions  users  cluster
0      /home                /        100        80     70        1
1     /about            /home         50        40     35        0
2   /contact            /home         30        25     20        0
3  /products            /home         40        35     30        0
4  /products           /about         80        70     65        1
5     /about                /         60        50     45        1


