<a href="https://colab.research.google.com/github/Anu-jo/cross_selling/blob/main/clustering_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

# Read the raw dataset from disk
file_path = 'Test.csv'
df = pd.read_csv(file_path)

# Partition the column labels by dtype: numeric vs. everything else
numeric_cols = df.select_dtypes(include='number').columns
non_numeric_cols = df.select_dtypes(exclude='number').columns

# --- Baseline diagnostics, captured before any cleaning is applied ---

# Number of missing entries in every column
missing_values_before = df.isnull().sum()

# Skewness of every numeric column
skewness_before = df.loc[:, numeric_cols].skew()

# Detect outliers using the IQR method
def detect_outliers_iqr(data):
    """Flag the values that fall outside the 1.5 * IQR fences.

    Args:
        data: Object providing ``quantile`` and elementwise comparisons; in
            this file it is called with one DataFrame column at a time.

    Returns:
        Boolean mask shaped like ``data``: True where a value is below
        ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """
    lower_quartile, upper_quartile = data.quantile(0.25), data.quantile(0.75)
    spread = upper_quartile - lower_quartile
    too_low = data < (lower_quartile - 1.5 * spread)
    too_high = data > (upper_quartile + 1.5 * spread)
    return too_low | too_high

# Per-column count of values outside the 1.5 * IQR fences
outliers_iqr_before = df[numeric_cols].apply(detect_outliers_iqr, axis=0)
outliers_count_iqr_before = outliers_iqr_before.sum()

# Detect outliers using Z-score (|z| > 3), counted per column.
# nan_policy='omit' computes each column's mean/std from its own observed
# values. Dropping rows with dropna() first would discard every row that has a
# gap in *any* column, so one sparse column could distort (or empty) the counts
# of all the others. Missing entries yield NaN z-scores, which never exceed 3.
z_scores_before = np.abs(
    stats.zscore(
        df[numeric_cols].to_numpy(dtype=float, na_value=np.nan),
        axis=0,
        nan_policy='omit',
    )
)
# Label the counts by column so they line up with the IQR-based counts.
outliers_zscore_before = pd.Series(
    (z_scores_before > 3).sum(axis=0), index=numeric_cols
)

# Handling Missing Values

# Technique 1: Imputation with Mean/Median/Mode
# The filled column is assigned back explicitly. Calling
# df[col].fillna(..., inplace=True) is chained assignment on a temporary
# object: it warns on pandas 2.x and leaves df untouched under copy-on-write.
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())  # Mean imputation for numeric columns

for col in non_numeric_cols:
    modes = df[col].mode()
    # mode() is empty when the column has no observed values at all; there is
    # nothing to impute with in that case, so the column is left as it is.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])  # Mode imputation for non-numeric columns

# Technique 2: Model-Based Imputation (KNN)
# KNNImputer drops columns that contain no observed values, which would make
# the result narrower than the columns it is assigned back to. Only columns
# with at least one observed value are passed through it.
# NOTE(review): numeric gaps were already mean-filled above, so this step has
# nothing left to impute and effectively only casts the columns to float.
# Run it before the mean fill if KNN estimates are actually wanted.
knn_imputer = KNNImputer(n_neighbors=5)
knn_cols = [col for col in numeric_cols if df[col].notna().any()]
if knn_cols:
    df[knn_cols] = knn_imputer.fit_transform(df[knn_cols])

# Fill whatever is still missing from neighbouring rows.
# (fillna(method=...) is deprecated; ffill()/bfill() are the direct equivalents.)
# Columns with no observed values at all remain NaN after this step.
df.ffill(inplace=True)  # Forward fill as a last resort
df.bfill(inplace=True)  # Backward fill as a last resort

# Handling Outliers

# Technique 1: Clipping
def clip_outliers(data):
    """Clamp values to the 1.5 * IQR fences.

    Args:
        data: Object providing ``quantile`` and ``clip``; in this file it is
            called with one DataFrame column at a time.

    Returns:
        Copy of ``data`` in which values below ``Q1 - 1.5 * IQR`` are raised
        to that fence and values above ``Q3 + 1.5 * IQR`` are lowered to it.
    """
    q25 = data.quantile(0.25)
    q75 = data.quantile(0.75)
    whisker = 1.5 * (q75 - q25)
    return data.clip(lower=q25 - whisker, upper=q75 + whisker)

# Clamp every numeric column to its own 1.5 * IQR fences
df[numeric_cols] = df[numeric_cols].apply(clip_outliers)

# Technique 2: Transformation (Log Transformation)
# Apply log transformation to reduce the effect of outliers
def _log_compress(column):
    """Log-compress one numeric column without producing NaN or -inf.

    ``np.log1p`` is only defined for values greater than -1. Columns that stay
    inside that domain are transformed with plain ``log1p``. Columns that
    contain values <= -1 use the sign-preserving form
    ``sign(x) * log1p(|x|)``, which is monotonic and defined for every real x.

    Args:
        column: Numeric pandas Series.

    Returns:
        Series of the same shape holding the compressed values.
    """
    if column.min() > -1:
        return np.log1p(column)
    return np.sign(column) * np.log1p(np.abs(column))


df[numeric_cols] = df[numeric_cols].apply(_log_compress)

# After Treatment: Summary Checks

# Check for missing values
missing_values_after = df.isna().sum()

# Check for skewness
skewness_after = df[numeric_cols].skew()

# Detect outliers using the IQR method after treatment
outliers_iqr_after = df[numeric_cols].apply(detect_outliers_iqr, axis=0)
outliers_count_iqr_after = outliers_iqr_after.sum()

# Detect outliers using Z-score after treatment (|z| > 3), counted per column.
# nan_policy='omit' computes each column's mean/std from its own observed
# values. Dropping rows with dropna() first would discard every row that has a
# gap in *any* column, so one sparse column could distort (or empty) the counts
# of all the others. Missing entries yield NaN z-scores, which never exceed 3.
z_scores_after = np.abs(
    stats.zscore(
        df[numeric_cols].to_numpy(dtype=float, na_value=np.nan),
        axis=0,
        nan_policy='omit',
    )
)
# Label the counts by column so they line up with the IQR-based counts.
outliers_zscore_after = pd.Series(
    (z_scores_after > 3).sum(axis=0), index=numeric_cols
)

# NOTE: the before/after summary comparison below is wrapped in a triple-quoted
# string literal, so it is parsed but never executed. Remove the surrounding
# triple quotes to build and print the summary tables.
"""# Summary Comparison

# Combine the before and after summaries into DataFrames
summary_missing_values = pd.DataFrame({
    'Before Treatment': missing_values_before,
    'After Treatment': missing_values_after
})
summary_skewness = pd.DataFrame({
    'Before Treatment': skewness_before,
    'After Treatment': skewness_after
})
summary_outliers_iqr = pd.DataFrame({
    'Before Treatment': outliers_count_iqr_before,
    'After Treatment': outliers_count_iqr_after
})
summary_outliers_zscore = pd.DataFrame({
    'Before Treatment': outliers_zscore_before,
    'After Treatment': outliers_zscore_after
})

# Print the summaries
print("\nMissing Values Summary:\n", summary_missing_values[summary_missing_values.sum(axis=1) > 0])
print("\nSkewness Summary:\n", summary_skewness[(summary_skewness['Before Treatment'].abs() > 1) | (summary_skewness['After Treatment'].abs() > 1)])
print("\nOutliers (IQR method) Summary:\n", summary_outliers_iqr[summary_outliers_iqr.sum(axis=1) > 0])
print("\nOutliers (Z-score method) Summary:\n", summary_outliers_zscore[summary_outliers_zscore.sum(axis=1) > 0])
"""
# Clustering
from sklearn.impute import SimpleImputer

# Identify numerical and categorical columns again after preprocessing
numerical_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(exclude=[np.number]).columns

# Preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps.
# sparse_threshold=0 forces a dense array. With the default threshold the
# one-hot block can turn the whole result into a scipy sparse matrix, and
# AgglomerativeClustering only accepts dense input.
# NOTE(review): very high-cardinality categorical columns make the dense
# matrix wide; drop or re-encode such columns if memory becomes a problem.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    sparse_threshold=0
)

# Apply the preprocessing pipeline to the data
X_preprocessed = preprocessor.fit_transform(df)

# Safety net: replace any NaN that survived preprocessing with the column mean
# so the clustering estimators receive a complete matrix.
imputer = SimpleImputer(strategy='mean')  # Or 'median' or 'most_frequent' if appropriate
X_preprocessed = imputer.fit_transform(X_preprocessed)

# --- K-Means: fit five clusters with a fixed seed, then score the partition ---
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_labels = kmeans.fit(X_preprocessed).labels_
kmeans_silhouette = silhouette_score(X_preprocessed, kmeans_labels)
print(f'K-Means Silhouette Score: {kmeans_silhouette}')

# --- Agglomerative (hierarchical): same cluster count, scored the same way ---
agglo = AgglomerativeClustering(n_clusters=5)
agglo_labels = agglo.fit(X_preprocessed).labels_
agglo_silhouette = silhouette_score(X_preprocessed, agglo_labels)
print(f'Agglomerative Clustering Silhouette Score: {agglo_silhouette}')

# DBSCAN Clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_preprocessed)

# Evaluate only non-noise points (DBSCAN labels noise as -1).
# The silhouette score needs at least two clusters among the points it is
# given, so the clusters are counted *after* excluding noise. Counting the raw
# labels would treat "one cluster plus noise" as two groups and make
# silhouette_score raise once the noise points are filtered out.
dbscan_clustered = dbscan_labels != -1
dbscan_n_clusters = len(set(dbscan_labels[dbscan_clustered]))
if dbscan_n_clusters >= 2:
    dbscan_silhouette = silhouette_score(
        X_preprocessed[dbscan_clustered], dbscan_labels[dbscan_clustered]
    )
    print(f'DBSCAN Silhouette Score: {dbscan_silhouette}')
elif dbscan_n_clusters == 1:
    print('DBSCAN resulted in only one cluster.')
else:
    print('DBSCAN found no clusters (every point was labelled as noise).')


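# Captured notebook output (presumably source-line echoes from warnings raised
# while the cell ran; the warning messages themselves are not in this export):
#   df = pd.read_csv(file_path)
#   result = getattr(ufunc, method)(*inputs, **kwargs)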
