In [None]:
# =============================================================================
# CELL 1: Install Required Libraries
# =============================================================================
!pip install streamlit flask flask-ngrok pyngrok pandas numpy matplotlib seaborn scikit-learn -q

# =============================================================================
# CELL 2: Setup ngrok for tunneling (Required for Colab)
# =============================================================================
# You need to get a free authtoken from https://ngrok.com/
# Sign up and get your authtoken, then run:
from pyngrok import ngrok

# Replace 'YOUR_NGROK_AUTHTOKEN' with your actual token
# ngrok.set_auth_token("YOUR_NGROK_AUTHTOKEN")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.0/9.0 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# =============================================================================
# CELL 3: Create the Main Application File
# =============================================================================
%%writefile duplicate_remover_core.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import warnings
import io
import base64

warnings.filterwarnings('ignore')


class TextDuplicateRemover:
    """Detect near-duplicate texts from simple character/word statistics.

    Each text is turned into a numeric feature vector, standardised, projected
    with PCA and clustered with K-means. Two texts of the same cluster are
    considered duplicates when ``1 / (1 + distance)`` between their PCA
    vectors is greater than ``similarity_threshold``.
    """

    def __init__(self, n_clusters=10, similarity_threshold=0.8):
        """
        Args:
            n_clusters: Upper bound on the number of K-means clusters.
            similarity_threshold: Minimum similarity (exclusive) for two texts
                to be reported as duplicates.
        """
        self.n_clusters = n_clusters
        self.similarity_threshold = similarity_threshold
        self.vectorizer = None
        self.kmeans = None
        self.pca = None

    @staticmethod
    def _group_pairs(pairs):
        """Merge ``(i, j)`` index pairs into disjoint groups (connected components).

        Returns a list of groups; every group is a sorted list of plain ``int``
        with at least two members, and groups are ordered by their smallest
        member.
        """
        parent = {}

        def find(node):
            parent.setdefault(node, node)
            root = node
            while parent[root] != root:
                root = parent[root]
            # Path compression: point every visited node straight at the root.
            while parent[node] != root:
                parent[node], node = root, parent[node]
            return root

        for first, second in pairs:
            root_a, root_b = find(int(first)), find(int(second))
            if root_a != root_b:
                # Keep the smallest index as representative of the component.
                parent[max(root_a, root_b)] = min(root_a, root_b)

        groups = {}
        for node in sorted(parent):
            groups.setdefault(find(node), []).append(node)
        return [members for members in groups.values() if len(members) > 1]

    def create_text_features(self, texts):
        """
        Create numerical features from text using simple character and word counts

        Every item is converted with ``str()``. The 19 features per text are:
        length, spaces, upper-case, lower-case, digit and punctuation counts,
        word count, unique-word count, characters per word, and the relative
        frequency of each letter ``a``-``j`` (case-insensitive).

        Returns:
            ``np.ndarray`` with one row per text.
        """
        features = []
        for text in texts:
            text = str(text)
            length = len(text)
            words = text.split()
            lowered = text.lower()
            feature_vector = [
                length,
                text.count(' '),
                sum(c.isupper() for c in text),
                sum(c.islower() for c in text),
                sum(c.isdigit() for c in text),
                sum(c in '.,;:!?()[]{}' for c in text),
                len(words),
                len(set(words)),
                length / max(1, len(words)),
            ]
            # max(1, ...) keeps the ratio defined for empty strings.
            feature_vector.extend(lowered.count(char) / max(1, length) for char in 'abcdefghij')
            features.append(feature_vector)
        return np.array(features)

    def find_duplicates(self, texts):
        """
        Find duplicate texts using K-means clustering

        Returns:
            List of groups; each group is a sorted list of positions in
            ``texts`` that are mutually connected by a similar pair. Groups
            are disjoint. An empty list is returned for fewer than two texts.
        """
        if len(texts) < 2:
            return []

        features = self.create_text_features(texts)
        features_scaled = StandardScaler().fit_transform(features)

        n_components = min(features_scaled.shape[0], features_scaled.shape[1], 10)
        if n_components < 1:
            return []

        self.pca = PCA(n_components=n_components)
        features_pca = self.pca.fit_transform(features_scaled)

        actual_n_clusters = min(self.n_clusters, len(texts))
        if actual_n_clusters < 2:
            return []

        self.kmeans = KMeans(n_clusters=actual_n_clusters, random_state=42, n_init=10)
        clusters = self.kmeans.fit_predict(features_pca)

        # Only pairs inside one cluster are compared; the clustering acts as a
        # pre-filter that avoids comparing every text with every other text.
        similar_pairs = []
        for cluster_id in range(actual_n_clusters):
            members = np.where(clusters == cluster_id)[0]
            for a in range(len(members)):
                for b in range(a + 1, len(members)):
                    idx1, idx2 = members[a], members[b]
                    dist = np.linalg.norm(features_pca[idx1] - features_pca[idx2])
                    if 1 / (1 + dist) > self.similarity_threshold:
                        similar_pairs.append((idx1, idx2))

        # Connected components guarantee that no index ends up in two groups.
        return self._group_pairs(similar_pairs)

    def visualize_clusters(self, texts):
        """
        Visualize text clusters and return figure

        Returns:
            A matplotlib figure with the texts projected on two principal
            components and coloured by K-means cluster, or ``None`` when there
            are fewer than two texts, components or clusters.
        """
        if len(texts) < 2:
            return None

        features = self.create_text_features(texts)
        features_scaled = StandardScaler().fit_transform(features)

        n_components = min(2, features_scaled.shape[0], features_scaled.shape[1])
        if n_components < 2:
            return None

        pca = PCA(n_components=n_components)
        features_2d = pca.fit_transform(features_scaled)

        actual_n_clusters = min(self.n_clusters, len(texts))
        if actual_n_clusters < 2:
            return None

        kmeans = KMeans(n_clusters=actual_n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(features_2d)

        fig, ax = plt.subplots(figsize=(10, 6))
        scatter = ax.scatter(features_2d[:, 0], features_2d[:, 1], c=clusters, cmap='viridis', alpha=0.6)
        plt.colorbar(scatter, ax=ax, label='Cluster')
        ax.set_title('Text Clusters Visualization')
        ax.set_xlabel('Principal Component 1')
        ax.set_ylabel('Principal Component 2')
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        return fig


class TabularDuplicateRemover:
    """Find exact and near-duplicate rows of a DataFrame.

    Near-duplicate detection works on numeric columns only: missing values are
    replaced by 0, the data is standardised and projected with PCA, and rows
    are then compared either inside K-means clusters or among nearest
    neighbours.
    """

    def __init__(self, similarity_threshold=0.9):
        """
        Args:
            similarity_threshold: Threshold used by both near-duplicate
                methods (see each method for how it is applied).
        """
        self.similarity_threshold = similarity_threshold
        self.scaler = StandardScaler()
        self.pca = None
        self.kmeans = None
        self.knn = None

    @staticmethod
    def _group_pairs(pairs):
        """Merge ``(i, j)`` index pairs into disjoint groups (connected components).

        Returns a list of groups; every group is a sorted list of plain ``int``
        with at least two members, and groups are ordered by their smallest
        member.
        """
        parent = {}

        def find(node):
            parent.setdefault(node, node)
            root = node
            while parent[root] != root:
                root = parent[root]
            # Path compression: point every visited node straight at the root.
            while parent[node] != root:
                parent[node], node = root, parent[node]
            return root

        for first, second in pairs:
            root_a, root_b = find(int(first)), find(int(second))
            if root_a != root_b:
                parent[max(root_a, root_b)] = min(root_a, root_b)

        groups = {}
        for node in sorted(parent):
            groups.setdefault(find(node), []).append(node)
        return [members for members in groups.values() if len(members) > 1]

    def find_exact_duplicates(self, df):
        """
        Find exact duplicate rows

        Rows are grouped with the same equality rules as
        ``DataFrame.duplicated``: missing values compare equal to each other,
        so duplicated rows that contain NaN/None are grouped too.

        Returns:
            List of groups of index *labels* (not positions). Groups appear in
            order of first occurrence and labels keep the frame's row order.
        """
        if df.empty:
            return []

        dup_mask = df.duplicated(keep=False)
        if not dup_mask.any():
            return []

        dup_rows = df[dup_mask]
        # Encode every column as integer codes: equal values share a code and
        # all missing values map to -1. Two rows are identical exactly when
        # their code tuples are identical. Columns are addressed by position
        # so that duplicated column names are handled as well.
        column_codes = [
            pd.factorize(dup_rows.iloc[:, position])[0]
            for position in range(dup_rows.shape[1])
        ]

        groups = {}
        for label, row_key in zip(dup_rows.index.tolist(), zip(*column_codes)):
            groups.setdefault(row_key, []).append(label)

        return [members for members in groups.values() if len(members) > 1]

    def find_near_duplicates_kmeans(self, df, numeric_columns=None):
        """
        Find near-duplicate rows using K-means clustering

        Two rows of the same cluster are near-duplicates when
        ``1 / (1 + distance)`` between their PCA vectors exceeds
        ``similarity_threshold``.

        Args:
            df: Input frame.
            numeric_columns: Columns to compare; defaults to every numeric
                column of ``df``.

        Returns:
            Disjoint groups of row *positions* (sorted ints).
        """
        if numeric_columns is None:
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

        if not numeric_columns or len(df) < 2:
            return []

        data = df[numeric_columns].fillna(0)
        data_scaled = self.scaler.fit_transform(data)

        n_components = min(10, len(numeric_columns), len(df))
        if n_components < 1:
            return []

        self.pca = PCA(n_components=n_components)
        data_pca = self.pca.fit_transform(data_scaled)

        # Roughly five rows per cluster, bounded to the range [2, 20].
        n_clusters = min(20, max(2, len(df) // 5))
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = self.kmeans.fit_predict(data_pca)

        similar_pairs = []
        for cluster_id in range(n_clusters):
            members = np.where(clusters == cluster_id)[0]
            for a in range(len(members)):
                for b in range(a + 1, len(members)):
                    idx1, idx2 = members[a], members[b]
                    dist = np.linalg.norm(data_pca[idx1] - data_pca[idx2])
                    if 1 / (1 + dist) > self.similarity_threshold:
                        similar_pairs.append((idx1, idx2))

        # Connected components guarantee that no row ends up in two groups.
        return self._group_pairs(similar_pairs)

    def find_near_duplicates_knn(self, df, numeric_columns=None, n_neighbors=5):
        """
        Find near-duplicates using K-Nearest Neighbors

        A neighbour is a near-duplicate when its distance in PCA space is
        below ``1 - similarity_threshold``. Note that this criterion differs
        from the ``1 / (1 + distance)`` similarity used by the K-means method.

        Args:
            df: Input frame.
            numeric_columns: Columns to compare; defaults to every numeric
                column of ``df``.
            n_neighbors: Neighbours queried per row (capped at the row count).

        Returns:
            Disjoint groups of row *positions*; the first entry of a group is
            the row whose neighbourhood was scanned.
        """
        if numeric_columns is None:
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

        if not numeric_columns or len(df) < 2:
            return []

        data = df[numeric_columns].fillna(0)
        data_scaled = self.scaler.fit_transform(data)

        n_components = min(10, len(numeric_columns), len(df))
        if n_components < 1:
            return []

        self.pca = PCA(n_components=n_components)
        data_pca = self.pca.fit_transform(data_scaled)

        actual_n_neighbors = min(n_neighbors, len(data_pca))
        self.knn = NearestNeighbors(n_neighbors=actual_n_neighbors)
        self.knn.fit(data_pca)

        distances, indices = self.knn.kneighbors(data_pca)
        max_distance = 1 - self.similarity_threshold
        duplicate_groups = []
        processed = set()

        for i in range(len(data_pca)):
            if i in processed:
                continue
            group = [i]
            for j, dist in zip(indices[i], distances[i]):
                j = int(j)  # plain ints keep the returned groups homogeneous
                if j != i and j not in processed and dist < max_distance:
                    group.append(j)
                    processed.add(j)
            if len(group) > 1:
                duplicate_groups.append(group)
                processed.add(i)

        return duplicate_groups

    def visualize_clusters(self, df, numeric_columns=None):
        """
        Visualize data clusters and return figure

        Returns:
            A matplotlib figure with the rows projected on two principal
            components and coloured by K-means cluster, or ``None`` when there
            are fewer than two numeric columns or fewer than two rows.
        """
        if numeric_columns is None:
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

        if len(numeric_columns) < 2 or len(df) < 2:
            return None

        data = df[numeric_columns].fillna(0)
        data_scaled = self.scaler.fit_transform(data)

        pca = PCA(n_components=2)
        data_2d = pca.fit_transform(data_scaled)

        n_clusters = min(10, max(2, len(df) // 5))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(data_2d)

        fig, ax = plt.subplots(figsize=(10, 6))
        scatter = ax.scatter(data_2d[:, 0], data_2d[:, 1], c=clusters, cmap='viridis', alpha=0.6)
        plt.colorbar(scatter, ax=ax, label='Cluster')
        ax.set_title('Data Clusters Visualization')
        ax.set_xlabel('Principal Component 1')
        ax.set_ylabel('Principal Component 2')
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        return fig


class ClassificationDuplicateDetector:
    """Learn a pairwise "is duplicate" classifier from a DataFrame's numeric columns.

    A training sample is the concatenation of two rows; it is labelled 1 when
    both rows are identical and 0 otherwise. The fitted model is then applied
    to every pair of rows to find duplicates.
    """

    def __init__(self, method='random_forest'):
        """
        Args:
            method: ``'logistic_regression'``, ``'svm'``, ``'decision_tree'``;
                any other value selects a random forest.
        """
        self.method = method
        self.model = None
        self.scaler = StandardScaler()
        self.pca = None
        self.label_encoder = LabelEncoder()
        self.is_trained = False

    @staticmethod
    def _group_pairs(pairs):
        """Merge ``(i, j)`` index pairs into disjoint groups (connected components).

        Returns a list of groups; every group is a sorted list of plain ``int``
        with at least two members, and groups are ordered by their smallest
        member.
        """
        parent = {}

        def find(node):
            parent.setdefault(node, node)
            root = node
            while parent[root] != root:
                root = parent[root]
            # Path compression: point every visited node straight at the root.
            while parent[node] != root:
                parent[node], node = root, parent[node]
            return root

        for first, second in pairs:
            root_a, root_b = find(int(first)), find(int(second))
            if root_a != root_b:
                parent[max(root_a, root_b)] = min(root_a, root_b)

        groups = {}
        for node in sorted(parent):
            groups.setdefault(find(node), []).append(node)
        return [members for members in groups.values() if len(members) > 1]

    def create_training_data(self, df, numeric_columns=None):
        """
        Create training data for duplicate detection

        Positive samples are all pairs of identical rows (after replacing
        missing values by 0). Negative samples are the non-identical pairs
        formed by each row and up to the nine rows that follow it.

        Returns:
            ``(X, y)`` as numpy arrays, or ``(None, None)`` when there is no
            numeric column, fewer than two rows, or no pair at all.
        """
        if numeric_columns is None:
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

        if len(numeric_columns) < 1 or len(df) < 2:
            return None, None

        # One array conversion up front avoids a pandas row lookup per pair.
        values = df[numeric_columns].fillna(0).to_numpy()
        n_rows = len(values)
        pairs = []
        labels = []

        for i in range(n_rows):
            for j in range(i + 1, n_rows):
                if np.array_equal(values[i], values[j]):
                    pairs.append(np.concatenate([values[i], values[j]]))
                    labels.append(1)

        # Negatives are sampled from a sliding window only, which bounds their
        # number to about nine per row.
        for i in range(n_rows):
            for j in range(i + 1, min(i + 10, n_rows)):
                if not np.array_equal(values[i], values[j]):
                    pairs.append(np.concatenate([values[i], values[j]]))
                    labels.append(0)

        if len(pairs) == 0:
            return None, None

        return np.array(pairs), np.array(labels)

    def train_model(self, X, y):
        """
        Train classification model

        The data is standardised, reduced with PCA and split into stratified
        train/test parts. When a stratified split is impossible (a class with
        a single sample, or too few samples for every class to appear on both
        sides) the model is fitted and scored on the full data set instead of
        raising; the returned value is then a training accuracy.

        Returns:
            Accuracy as a float; ``0.0`` when no model could be trained
            (missing data, fewer than two samples, or a single class).
        """
        if X is None or y is None or len(X) < 2:
            return 0.0

        X = np.asarray(X)
        y = np.asarray(y)
        _, class_counts = np.unique(y, return_counts=True)
        n_classes = len(class_counts)
        if n_classes < 2:
            return 0.0

        X_scaled = self.scaler.fit_transform(X)
        n_components = min(20, X.shape[1], X.shape[0])
        self.pca = PCA(n_components=n_components)
        X_pca = self.pca.fit_transform(X_scaled)

        n_samples = len(X_pca)
        test_size = min(0.2, max(0.1, 2 / n_samples))
        # Mirror the split sizes scikit-learn derives from a float test_size.
        n_test = int(np.ceil(test_size * n_samples))
        n_train = n_samples - n_test
        can_stratify = (
            class_counts.min() >= 2
            and n_test >= n_classes
            and n_train >= n_classes
        )

        if can_stratify:
            X_train, X_test, y_train, y_test = train_test_split(
                X_pca, y, test_size=test_size, random_state=42, stratify=y
            )
        else:
            # Too little data for a held-out set that contains both classes.
            X_train, X_test, y_train, y_test = X_pca, X_pca, y, y

        if self.method == 'logistic_regression':
            self.model = LogisticRegression(random_state=42, max_iter=1000)
        elif self.method == 'svm':
            self.model = SVC(random_state=42, probability=True)
        elif self.method == 'decision_tree':
            self.model = DecisionTreeClassifier(random_state=42)
        else:
            self.model = RandomForestClassifier(random_state=42, n_estimators=100)

        self.model.fit(X_train, y_train)
        self.is_trained = True

        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        return accuracy

    def find_duplicates(self, df, numeric_columns=None, threshold=0.5):
        """
        Find duplicates using trained model

        Every pair of rows is scored; pairs whose predicted duplicate
        probability exceeds ``threshold`` are merged into connected groups, so
        three or more mutually duplicated rows are reported together.

        Returns:
            Disjoint groups of row *positions* (sorted ints). An empty list is
            returned when the model is not trained, there is nothing to
            compare, or the pairs cannot be scored with the fitted pipeline.
        """
        if not self.is_trained or self.model is None:
            return []

        if numeric_columns is None:
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

        if not numeric_columns or len(df) < 2:
            return []

        values = df[numeric_columns].fillna(0).to_numpy()
        # All (i, j) with i < j, in the same row-major order as a nested loop.
        left, right = np.triu_indices(len(values), k=1)
        pairs = np.hstack([values[left], values[right]])

        try:
            pairs_scaled = self.scaler.transform(pairs)
            pairs_pca = self.pca.transform(pairs_scaled)
            # Column 1 is the probability of label 1 ("duplicate").
            probabilities = self.model.predict_proba(pairs_pca)[:, 1]
        except Exception:
            # Best effort by design: e.g. a column set that does not match the
            # one used for training yields "no duplicates" instead of an error.
            return []

        matched_pairs = [
            (i, j)
            for i, j, prob in zip(left, right, probabilities)
            if prob > threshold
        ]
        return self._group_pairs(matched_pairs)

    def compare_models(self, df, numeric_columns=None):
        """
        Compare different classification models

        Trains each supported model on the same pair data. As a side effect
        ``self.method`` and ``self.model`` are left at the last model trained
        (random forest).

        Returns:
            Dict mapping a display name to the accuracy returned by
            ``train_model``; empty when no training data could be built.
        """
        if numeric_columns is None:
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

        X, y = self.create_training_data(df, numeric_columns)
        if X is None or y is None:
            return {}

        models = {
            'Logistic Regression': 'logistic_regression',
            'SVM': 'svm',
            'Decision Tree': 'decision_tree',
            'Random Forest': 'random_forest'
        }

        results = {}
        for name, method in models.items():
            self.method = method
            accuracy = self.train_model(X, y)
            results[name] = accuracy

        return results

    def plot_model_comparison(self, results):
        """
        Plot model comparison results

        Args:
            results: Dict of model name -> accuracy, as returned by
                ``compare_models``.

        Returns:
            A matplotlib bar-chart figure, or ``None`` for empty results.
        """
        if not results:
            return None

        # Concrete lists: dict views are not sequences for plotting purposes.
        names = list(results.keys())
        scores = list(results.values())

        fig, ax = plt.subplots(figsize=(10, 6))
        bars = ax.bar(names, scores, color=['#3498db', '#e74c3c', '#2ecc71', '#9b59b6'])
        ax.set_title('Model Comparison for Duplicate Detection')
        ax.set_ylabel('Accuracy')
        ax.set_ylim(0, 1)
        # Restyle the labels the bar chart already created instead of
        # replacing them, which does not require fixing tick positions first.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.grid(True, axis='y', alpha=0.3)

        for bar, val in zip(bars, scores):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                    f'{val:.2f}', ha='center', va='bottom', fontsize=10)

        plt.tight_layout()
        return fig


class CombinedDuplicateRemover:
    """Facade that runs the text, tabular and classification detectors together."""

    def __init__(self, similarity_threshold=0.9):
        """
        Args:
            similarity_threshold: Passed to the text and tabular detectors.
        """
        self.similarity_threshold = similarity_threshold
        self.text_remover = TextDuplicateRemover(n_clusters=10, similarity_threshold=similarity_threshold)
        self.tabular_remover = TabularDuplicateRemover(similarity_threshold=similarity_threshold)
        self.classification_detector = ClassificationDuplicateDetector()

    @staticmethod
    def _drop_positions(frame, groups):
        """Drop every row position of each group except the group's first entry.

        Returns ``(new_frame, n_removed)``. Rows are removed by position, so
        the result does not depend on the frame's index labels, and a position
        listed in several groups is removed and counted only once. The index
        is reset only when something was removed.
        """
        positions = sorted({int(pos) for group in groups for pos in list(group)[1:]})
        if not positions:
            return frame, 0

        keep_mask = np.ones(len(frame), dtype=bool)
        keep_mask[positions] = False
        return frame.iloc[keep_mask].reset_index(drop=True), len(positions)

    def analyze_dataset(self, df, text_columns=None, numeric_columns=None):
        """
        Analyze entire dataset for duplicates

        Args:
            df: Input frame.
            text_columns: Columns checked with the text detector (skipped when
                falsy; names missing from ``df`` are ignored).
            numeric_columns: Columns used by the K-means, KNN and
                classification detectors (all three are skipped when falsy).

        Returns:
            Dict with the row count, the number of duplicate *groups* found by
            each method, and the groups themselves under
            ``'duplicate_details'``.
        """
        analysis = {
            'total_rows': len(df),
            'exact_duplicates': 0,
            'text_duplicates': 0,
            'near_duplicates_kmeans': 0,
            'near_duplicates_knn': 0,
            'classification_duplicates': 0,
            'duplicate_details': {}
        }

        if df.empty:
            return analysis

        exact_groups = self.tabular_remover.find_exact_duplicates(df)
        analysis['exact_duplicates'] = len(exact_groups)
        analysis['duplicate_details']['exact'] = exact_groups

        if text_columns:
            for col in text_columns:
                if col in df.columns:
                    # Never ask for more clusters than there are rows.
                    self.text_remover.n_clusters = min(10, len(df))
                    text_groups = self.text_remover.find_duplicates(df[col].tolist())
                    analysis['text_duplicates'] += len(text_groups)
                    analysis['duplicate_details'][f'text_{col}'] = text_groups

        if numeric_columns:
            near_groups_kmeans = self.tabular_remover.find_near_duplicates_kmeans(df, numeric_columns)
            analysis['near_duplicates_kmeans'] = len(near_groups_kmeans)
            analysis['duplicate_details']['near_kmeans'] = near_groups_kmeans

            near_groups_knn = self.tabular_remover.find_near_duplicates_knn(df, numeric_columns)
            analysis['near_duplicates_knn'] = len(near_groups_knn)
            analysis['duplicate_details']['near_knn'] = near_groups_knn

            training_data = self.classification_detector.create_training_data(df, numeric_columns)
            if training_data[0] is not None:
                self.classification_detector.train_model(*training_data)
                class_groups = self.classification_detector.find_duplicates(df, numeric_columns)
                analysis['classification_duplicates'] = len(class_groups)
                analysis['duplicate_details']['classification'] = class_groups

        return analysis

    def clean_dataset(self, df, text_columns=None, numeric_columns=None, method='kmeans'):
        """
        Clean dataset by removing duplicates

        Runs three passes in order -- exact duplicates, near duplicates on
        ``numeric_columns`` with ``method`` (``'kmeans'``, ``'knn'`` or
        ``'classification'``; other values skip the pass), then text
        duplicates per column of ``text_columns`` -- and keeps the first row
        of every duplicate group.

        Returns:
            ``(cleaned_df, removed_count)``. ``df`` itself is not modified.
            The index of the result is reset whenever a pass removed rows.
        """
        if df.empty:
            return df, 0

        cleaned_df = df.copy()
        removed_count = 0

        # Exact duplicates are reported as index labels, so drop by label and
        # count what actually disappeared.
        exact_groups = self.tabular_remover.find_exact_duplicates(cleaned_df)
        labels_to_remove = []
        for group in exact_groups:
            labels_to_remove.extend(group[1:])

        if labels_to_remove:
            rows_before = len(cleaned_df)
            cleaned_df = cleaned_df.drop(labels_to_remove).reset_index(drop=True)
            removed_count += rows_before - len(cleaned_df)

        if numeric_columns and method and len(cleaned_df) > 1:
            if method == 'kmeans':
                near_groups = self.tabular_remover.find_near_duplicates_kmeans(cleaned_df, numeric_columns)
            elif method == 'knn':
                near_groups = self.tabular_remover.find_near_duplicates_knn(cleaned_df, numeric_columns)
            elif method == 'classification':
                training_data = self.classification_detector.create_training_data(cleaned_df, numeric_columns)
                if training_data[0] is not None:
                    self.classification_detector.train_model(*training_data)
                    near_groups = self.classification_detector.find_duplicates(cleaned_df, numeric_columns)
                else:
                    near_groups = []
            else:
                near_groups = []

            # These detectors report row positions, not index labels; the
            # frame may still carry the caller's original index at this point.
            cleaned_df, dropped = self._drop_positions(cleaned_df, near_groups)
            removed_count += dropped

        if text_columns and len(cleaned_df) > 1:
            for col in text_columns:
                if col in cleaned_df.columns:
                    self.text_remover.n_clusters = min(10, len(cleaned_df))
                    text_groups = self.text_remover.find_duplicates(cleaned_df[col].tolist())
                    cleaned_df, dropped = self._drop_positions(cleaned_df, text_groups)
                    removed_count += dropped

        return cleaned_df, removed_count

    def generate_report(self, analysis):
        """
        Generate a report of duplicate analysis

        Args:
            analysis: Dict as returned by ``analyze_dataset``.

        Returns:
            Multi-line text report. The total is the sum of the group counts
            of all methods, so a group found by two methods is counted twice.
        """
        total_duplicates = sum([
            analysis['exact_duplicates'],
            analysis['text_duplicates'],
            analysis['near_duplicates_kmeans'],
            analysis['near_duplicates_knn'],
            analysis['classification_duplicates']
        ])

        report = f"""
📊 Dataset Duplicate Analysis Report
=====================================

Total Rows: {analysis['total_rows']}

🔄 Duplicate Types Found:
• Exact Duplicates: {analysis['exact_duplicates']} groups
• Text Duplicates: {analysis['text_duplicates']} groups
• Near Duplicates (K-means): {analysis['near_duplicates_kmeans']} groups
• Near Duplicates (KNN): {analysis['near_duplicates_knn']} groups
• Classification Duplicates: {analysis['classification_duplicates']} groups

📈 Summary:
Total Duplicate Groups Found: {total_duplicates}
        """
        return report


def fig_to_base64(fig):
    """Convert matplotlib figure to base64 string

    The figure is rendered as a PNG (100 dpi, tight bounding box) and closed
    afterwards -- also when rendering fails -- so it must not be reused.

    Args:
        fig: Figure to render, or ``None``.

    Returns:
        The base64-encoded PNG as a ``str``, or ``None`` when ``fig`` is ``None``.
    """
    if fig is None:
        return None
    try:
        with io.BytesIO() as buf:
            fig.savefig(buf, format='png', bbox_inches='tight', dpi=100)
            return base64.b64encode(buf.getvalue()).decode('utf-8')
    finally:
        # Release the figure even if savefig raised, otherwise it stays
        # registered with pyplot and keeps its memory.
        plt.close(fig)

Writing duplicate_remover_core.py


In [None]:
# =============================================================================
# CELL 4: Create Streamlit Application
# =============================================================================
%%writefile streamlit_app.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import sys

# Import core functionality
from duplicate_remover_core import (
    TextDuplicateRemover,
    TabularDuplicateRemover,
    ClassificationDuplicateDetector,
    CombinedDuplicateRemover
)

# Page configuration (must run before any other Streamlit command renders).
# The title and icon use the magnifying-glass emoji; the literal must be stored
# as real UTF-8, otherwise the browser tab shows garbled characters.
st.set_page_config(
    page_title="🔍 Duplicate Remover Tool",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS: inject one <style> block that defines the classes used by the
# raw-HTML snippets of this app (.main-header and .sub-header for headings,
# plus .metric-card and the .success-box / .warning-box / .info-box message
# styles). unsafe_allow_html=True is required for the tag to be emitted as
# HTML instead of being shown as text.
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .sub-header {
        font-size: 1.5rem;
        color: #2c3e50;
        margin-top: 1.5rem;
        margin-bottom: 1rem;
    }
    .metric-card {
        background-color: #f8f9fa;
        border-radius: 10px;
        padding: 1rem;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .success-box {
        background-color: #d4edda;
        border: 1px solid #c3e6cb;
        border-radius: 5px;
        padding: 1rem;
        color: #155724;
    }
    .warning-box {
        background-color: #fff3cd;
        border: 1px solid #ffeeba;
        border-radius: 5px;
        padding: 1rem;
        color: #856404;
    }
    .info-box {
        background-color: #d1ecf1;
        border: 1px solid #bee5eb;
        border-radius: 5px;
        padding: 1rem;
        color: #0c5460;
    }
</style>
""", unsafe_allow_html=True)


def _init_session_state():
    """Make sure every session-state slot read by the tabs exists.

    ``df`` is the active dataset, ``analysis`` and ``cleaned_df`` hold results
    derived from it, and ``upload_token`` remembers which uploaded file has
    already been parsed so it is not re-read on every rerun.
    """
    for slot in ('df', 'analysis', 'cleaned_df', 'upload_token'):
        if slot not in st.session_state:
            st.session_state[slot] = None


def _store_dataset(df):
    """Make ``df`` the active dataset and discard results of the previous one.

    Without the reset, the Analyze/Clean tabs would keep showing metrics,
    duplicate groups and a cleaned frame that belong to a different dataset.
    """
    st.session_state.df = df
    st.session_state.analysis = None
    st.session_state.cleaned_df = None


def _render_sidebar():
    """Draw the settings sidebar.

    Returns:
        tuple: ``(similarity_threshold, n_clusters, detection_method)`` as
        currently selected by the user.
    """
    with st.sidebar:
        st.header("‚öôÔ∏è Settings")

        similarity_threshold = st.slider(
            "Similarity Threshold",
            min_value=0.5,
            max_value=1.0,
            value=0.9,
            step=0.05,
            help="Higher values mean stricter duplicate detection"
        )

        n_clusters = st.slider(
            "Number of Clusters (for K-means)",
            min_value=2,
            max_value=20,
            value=10,
            help="Number of clusters for K-means algorithm"
        )

        detection_method = st.selectbox(
            "Detection Method",
            options=["K-means", "KNN", "Classification", "All Methods"],
            help="Select the duplicate detection method"
        )

        st.markdown("---")
        st.header("üìñ About")
        st.info("""
        This tool helps you find and remove duplicate data using various machine learning techniques:

        - **Exact Duplicates**: Identical rows
        - **Text Duplicates**: Similar text content
        - **Near Duplicates (K-means)**: Clustering-based detection
        - **Near Duplicates (KNN)**: Nearest neighbor detection
        - **Classification**: ML-based detection
        """)

    return similarity_threshold, n_clusters, detection_method


def _render_upload_tab():
    """Render the upload tab: CSV uploader, sample-data button and data preview."""
    st.markdown('<h2 class="sub-header">üì§ Upload Your Dataset</h2>', unsafe_allow_html=True)

    col1, col2 = st.columns([2, 1])

    with col1:
        uploaded_file = st.file_uploader(
            "Choose a CSV file",
            type=['csv'],
            help="Upload a CSV file to analyze for duplicates"
        )

        if uploaded_file is None:
            # Uploader is empty: forget the last file so that re-adding the
            # same file later is treated as a fresh upload.
            st.session_state.upload_token = None
        else:
            # The uploader keeps returning the file on every rerun. Parse it
            # only when it differs from the one already loaded; otherwise each
            # rerun would re-read the CSV and overwrite sample data or results.
            # NOTE(review): name+size is a cheap identity check; two different
            # files with identical name and size would not be told apart.
            upload_token = (uploaded_file.name, uploaded_file.size)
            if st.session_state.upload_token != upload_token:
                try:
                    uploaded_file.seek(0)  # always parse from the start of the buffer
                    df = pd.read_csv(uploaded_file)
                except Exception as e:
                    # Token is left untouched so the next rerun retries.
                    st.error(f"‚ùå Error loading file: {str(e)}")
                else:
                    _store_dataset(df)
                    st.session_state.upload_token = upload_token
                    st.success(f"‚úÖ Successfully loaded {len(df)} rows and {len(df.columns)} columns!")

    with col2:
        st.markdown("### Or use sample data")
        if st.button("üìä Load Sample Data", use_container_width=True):
            # Small frame with deliberate repeats (rows 0, 2 and 1, 4 match).
            sample_data = {
                'Name': ['John', 'Jane', 'John', 'Bob', 'Jane', 'Alice', 'Charlie', 'John'],
                'Age': [25, 30, 25, 35, 30, 28, 40, 25],
                'City': ['NYC', 'LA', 'NYC', 'Chicago', 'LA', 'Boston', 'Seattle', 'NYC'],
                'Salary': [50000, 60000, 50000, 70000, 60000, 55000, 80000, 50000],
                'Department': ['IT', 'HR', 'IT', 'Finance', 'HR', 'Marketing', 'IT', 'IT']
            }
            _store_dataset(pd.DataFrame(sample_data))
            st.success("‚úÖ Sample data loaded!")

    df = st.session_state.df
    if df is not None:
        st.markdown("### üìã Data Preview")
        st.dataframe(df.head(20), use_container_width=True)

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", len(df))
        with col2:
            st.metric("Total Columns", len(df.columns))
        with col3:
            st.metric("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024:.2f} KB")

        st.markdown("### üìä Column Information")
        # Every entry is a plain array so the summary gets a simple 0..n-1
        # index instead of repeating the column names as index labels.
        col_info = pd.DataFrame({
            'Column': df.columns,
            'Type': df.dtypes.astype(str).values,
            'Non-Null Count': df.count().values,
            'Unique Values': df.nunique().values
        })
        st.dataframe(col_info, use_container_width=True)


def _render_analyze_tab(similarity_threshold, n_clusters):
    """Render the analysis tab and store the analysis result in session state.

    Args:
        similarity_threshold: value forwarded to ``CombinedDuplicateRemover``.
        n_clusters: requested cluster count; capped at the number of rows.
    """
    st.markdown('<h2 class="sub-header">üîç Analyze Duplicates</h2>', unsafe_allow_html=True)

    if st.session_state.df is None:
        st.warning("‚ö†Ô∏è Please upload a dataset first!")
        return

    df = st.session_state.df

    col1, col2 = st.columns(2)

    with col1:
        text_columns = st.multiselect(
            "Select Text Columns",
            options=df.select_dtypes(include=['object']).columns.tolist(),
            help="Select columns containing text data"
        )

    with col2:
        numeric_options = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_columns = st.multiselect(
            "Select Numeric Columns",
            options=numeric_options,
            default=numeric_options,
            help="Select columns containing numeric data"
        )

    if st.button("üîç Analyze Dataset", type="primary", use_container_width=True):
        with st.spinner("Analyzing dataset for duplicates..."):
            remover = CombinedDuplicateRemover(similarity_threshold=similarity_threshold)
            # Never ask for more clusters than there are rows.
            remover.text_remover.n_clusters = min(n_clusters, len(df))

            # Empty selections are passed as None rather than as an empty list.
            analysis = remover.analyze_dataset(
                df,
                text_columns=text_columns if text_columns else None,
                numeric_columns=numeric_columns if numeric_columns else None
            )
            st.session_state.analysis = analysis

            st.success("‚úÖ Analysis complete!")

    if st.session_state.analysis is not None:
        analysis = st.session_state.analysis

        st.markdown("### üìä Analysis Results")

        metric_specs = [
            ("Exact Duplicates", 'exact_duplicates'),
            ("Text Duplicates", 'text_duplicates'),
            ("K-means Duplicates", 'near_duplicates_kmeans'),
            ("KNN Duplicates", 'near_duplicates_knn'),
            ("Classification Duplicates", 'classification_duplicates'),
        ]
        for column, (label, key) in zip(st.columns(len(metric_specs)), metric_specs):
            with column:
                st.metric(label, analysis[key])

        # A default-configured instance is enough here: it is only used to
        # format the already-computed analysis dict.
        remover = CombinedDuplicateRemover()
        st.markdown("### üìù Detailed Report")
        st.text(remover.generate_report(analysis))

        if analysis['duplicate_details'].get('exact'):
            st.markdown("### üîÑ Exact Duplicate Groups")
            # Only the first five groups are shown.
            # NOTE(review): iloc assumes each group holds positional row
            # numbers of df - confirm against duplicate_remover_core.
            for i, group in enumerate(analysis['duplicate_details']['exact'][:5]):
                with st.expander(f"Group {i + 1} ({len(group)} rows)"):
                    st.dataframe(df.iloc[group], use_container_width=True)


def _render_clean_tab(similarity_threshold, n_clusters, detection_method):
    """Render the cleaning tab, run the cleaning and offer the result as CSV.

    Args:
        similarity_threshold: value forwarded to ``CombinedDuplicateRemover``.
        n_clusters: requested cluster count; capped at the number of rows.
        detection_method: sidebar label; mapped to the ``method`` argument of
            ``clean_dataset``.
    """
    st.markdown('<h2 class="sub-header">üßπ Clean Dataset</h2>', unsafe_allow_html=True)

    if st.session_state.df is None:
        st.warning("‚ö†Ô∏è Please upload a dataset first!")
        return

    df = st.session_state.df

    col1, col2 = st.columns(2)

    with col1:
        clean_text_columns = st.multiselect(
            "Text Columns to Clean",
            options=df.select_dtypes(include=['object']).columns.tolist(),
            key="clean_text"
        )

    with col2:
        numeric_options = df.select_dtypes(include=[np.number]).columns.tolist()
        clean_numeric_columns = st.multiselect(
            "Numeric Columns to Clean",
            options=numeric_options,
            default=numeric_options,
            key="clean_numeric"
        )

    method_map = {
        "K-means": "kmeans",
        "KNN": "knn",
        "Classification": "classification"
    }
    clean_method = method_map.get(detection_method)
    if clean_method is None:
        # "All Methods" has no cleaning counterpart; keep the k-means fallback
        # but tell the user instead of switching silently.
        clean_method = "kmeans"
        st.info(f"'{detection_method}' is not available for cleaning; K-means will be used instead.")

    if st.button("üßπ Clean Dataset", type="primary", use_container_width=True):
        with st.spinner("Cleaning dataset..."):
            remover = CombinedDuplicateRemover(similarity_threshold=similarity_threshold)
            remover.text_remover.n_clusters = min(n_clusters, len(df))

            cleaned_df, removed_count = remover.clean_dataset(
                df,
                text_columns=clean_text_columns if clean_text_columns else None,
                numeric_columns=clean_numeric_columns if clean_numeric_columns else None,
                method=clean_method
            )
            st.session_state.cleaned_df = cleaned_df

            st.success(f"‚úÖ Cleaning complete! Removed {removed_count} duplicate entries.")

    if st.session_state.cleaned_df is not None:
        cleaned_df = st.session_state.cleaned_df

        col1, col2 = st.columns(2)
        with col1:
            st.metric("Original Rows", len(df))
        with col2:
            st.metric("Cleaned Rows", len(cleaned_df))

        st.markdown("### üìã Cleaned Data Preview")
        st.dataframe(cleaned_df.head(20), use_container_width=True)

        csv = cleaned_df.to_csv(index=False)
        st.download_button(
            label="üì• Download Cleaned Dataset",
            data=csv,
            file_name="cleaned_dataset.csv",
            mime="text/csv",
            use_container_width=True
        )


def _render_visualize_tab(similarity_threshold):
    """Render the visualisation tab: cluster plot, model comparison, distributions.

    Args:
        similarity_threshold: value forwarded to ``TabularDuplicateRemover``.
    """
    st.markdown('<h2 class="sub-header">üìä Visualizations</h2>', unsafe_allow_html=True)

    if st.session_state.df is None:
        st.warning("‚ö†Ô∏è Please upload a dataset first!")
        return

    df = st.session_state.df
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if len(numeric_cols) < 2:
        st.warning("Need at least 2 numeric columns for visualizations.")
        return

    col1, col2 = st.columns(2)

    with col1:
        if st.button("üìä Show Cluster Visualization", use_container_width=True):
            with st.spinner("Generating cluster visualization..."):
                tabular_remover = TabularDuplicateRemover(similarity_threshold=similarity_threshold)
                fig = tabular_remover.visualize_clusters(df, numeric_cols)
                if fig:
                    st.pyplot(fig)
                    plt.close(fig)  # release the figure once it has been rendered
                else:
                    st.warning("Could not generate visualization with current data.")

    with col2:
        if st.button("üìà Compare ML Models", use_container_width=True):
            with st.spinner("Comparing models..."):
                detector = ClassificationDuplicateDetector()
                results = detector.compare_models(df, numeric_cols)
                if results:
                    fig = detector.plot_model_comparison(results)
                    if fig:
                        st.pyplot(fig)
                        plt.close(fig)

                    st.markdown("### Model Accuracy Results")
                    # Materialise the dict views so the frame is built from
                    # plain lists.
                    results_df = pd.DataFrame({
                        'Model': list(results.keys()),
                        'Accuracy': list(results.values())
                    })
                    st.dataframe(results_df, use_container_width=True)
                else:
                    st.warning("Could not compare models with current data.")

    st.markdown("### üìä Data Distribution")
    selected_col = st.selectbox("Select column to visualize", numeric_cols)
    values = df[selected_col].dropna()
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    axes[0].hist(values, bins=30, edgecolor='black', alpha=0.7)
    axes[0].set_title(f'Distribution of {selected_col}')
    axes[0].set_xlabel(selected_col)
    axes[0].set_ylabel('Frequency')

    axes[1].boxplot(values)
    axes[1].set_title(f'Box Plot of {selected_col}')
    axes[1].set_ylabel(selected_col)

    # Lay out this figure explicitly rather than whatever pyplot considers current.
    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)


def main():
    """Entry point of the Streamlit app: header, sidebar settings and four tabs.

    Streamlit re-executes this function from the top on every interaction;
    state that must survive a rerun (dataset, analysis, cleaned frame) is kept
    in ``st.session_state``.
    """
    st.markdown('<h1 class="main-header">üîç Duplicate Remover Tool</h1>', unsafe_allow_html=True)
    st.markdown("---")

    similarity_threshold, n_clusters, detection_method = _render_sidebar()

    # Main content tabs
    tab1, tab2, tab3, tab4 = st.tabs(["üì§ Upload Data", "üîç Analyze", "üßπ Clean Data", "üìä Visualize"])

    _init_session_state()

    with tab1:
        _render_upload_tab()

    with tab2:
        _render_analyze_tab(similarity_threshold, n_clusters)

    with tab3:
        _render_clean_tab(similarity_threshold, n_clusters, detection_method)

    with tab4:
        _render_visualize_tab(similarity_threshold)


if __name__ == "__main__":
    main()

Writing streamlit_app.py


In [None]:
# =============================================================================
# CELL 5: Create Flask Application
# =============================================================================
%%writefile flask_app.py

from flask import Flask, render_template_string, request, jsonify, send_file
import pandas as pd
import numpy as np
import io
import base64
import json
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from duplicate_remover_core import (
    TextDuplicateRemover,
    TabularDuplicateRemover,
    ClassificationDuplicateDetector,
    CombinedDuplicateRemover,
    fig_to_base64
)

# Flask application object for this module.
app = Flask(__name__)

# Module-level, in-memory store with three slots that all start out as None.
# Being a plain module global, it is shared by everything running in this
# process. Presumably the slots hold the loaded DataFrame, its analysis result
# and the cleaned DataFrame - confirm against the code that fills them, which
# is not visible in this part of the file.
app_data = dict.fromkeys(('df', 'analysis', 'cleaned_df'))

HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>üîç Duplicate Remover Tool</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
    <style>
        :root {
            --primary-color: #3498db;
            --secondary-color: #2ecc71;
            --accent-color: #9b59b6;
            --background-color: #f8f9fa;
            --card-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        }
        body {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }
        .main-container {
            background: rgba(255, 255, 255, 0.95);
            border-radius: 20px;
            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
            margin: 20px auto;
            max-width: 1400px;
            padding: 30px;
        }
        .header {
            text-align: center;
            margin-bottom: 30px;
            padding: 20px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            border-radius: 15px;
            color: white;
        }
        .header h1 {
            font-size: 2.5rem;
            font-weight: bold;
            margin-bottom: 10px;
        }
        .header p {
            font-size: 1.1rem;
            opacity: 0.9;
        }
        .card {
            border: none;
            border-radius: 15px;
            box-shadow: var(--card-shadow);
            margin-bottom: 20px;
            transition: transform 0.3s ease, box-shadow 0.3s ease;
        }
        .card:hover {
            transform: translateY(-5px);
            box-shadow: 0 10px 30px rgba(0, 0, 0, 0.15);
        }
        .card-header {
            background: linear-gradient(135deg, var(--primary-color), var(--accent-color));
            color: white;
            border-radius: 15px 15px 0 0 !important;
            font-weight: bold;
            padding: 15px 20px;
        }
        .metric-card {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border-radius: 15px;
            padding: 20px;
            text-align: center;
            margin-bottom: 15px;
        }
        .metric-card h3 {
            font-size: 2rem;
            font-weight: bold;
            margin-bottom: 5px;
        }
        .metric-card p {
            margin: 0;
            opacity: 0.9;
        }
        .btn-primary {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            border: none;
            border-radius: 10px;
            padding: 12px 30px;
            font-weight: bold;
            transition: transform 0.3s ease, box-shadow 0.3s ease;
        }
        .btn-primary:hover {
            transform: translateY(-2px);
            box-shadow: 0 5px 20px rgba(102, 126, 234, 0.4);
            background: linear-gradient(135deg, #764ba2 0%, #667eea 100%);
        }
        .btn-success {
            background: linear-gradient(135deg, #2ecc71 0%, #27ae60 100%);
            border: none;
            border-radius: 10px;
            padding: 12px 30px;
            font-weight: bold;
        }
        .nav-tabs {
            border: none;
            margin-bottom: 20px;
        }
        .nav-tabs .nav-link {
            border: none;
            border-radius: 10px;
            padding: 15px 25px;
            margin-right: 10px;
            background: #e9ecef;
            color: #495057;
            font-weight: 600;
            transition: all 0.3s ease;
        }
        .nav-tabs .nav-link.active {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }
        .nav-tabs .nav-link:hover:not(.active) {
            background: #dee2e6;
        }
        .table {
            border-radius: 10px;
            overflow: hidden;
        }
        .table thead {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }
        .table-striped tbody tr:nth-of-type(odd) {
            background-color: rgba(102, 126, 234, 0.05);
        }
        .form-control, .form-select {
            border-radius: 10px;
            border: 2px solid #e9ecef;
            padding: 12px 15px;
            transition: border-color 0.3s ease, box-shadow 0.3s ease;
        }
        .form-control:focus, .form-select:focus {
            border-color: #667eea;
            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.2);
        }
        .alert {
            border-radius: 10px;
            border: none;
        }
        .alert-success {
            background: linear-gradient(135deg, rgba(46, 204, 113, 0.2), rgba(39, 174, 96, 0.2));
            color: #155724;
        }
        .alert-warning {
            background: linear-gradient(135deg, rgba(241, 196, 15, 0.2), rgba(243, 156, 18, 0.2));
            color: #856404;
        }
        .alert-info {
            background: linear-gradient(135deg, rgba(52, 152, 219, 0.2), rgba(41, 128, 185, 0.2));
            color: #0c5460;
        }
        .spinner-border {
            width: 3rem;
            height: 3rem;
        }
        .loading-overlay {
            position: fixed;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            background: rgba(255, 255, 255, 0.9);
            display: none;
            justify-content: center;
            align-items: center;
            z-index: 9999;
            flex-direction: column;
        }
        .loading-overlay.show {
            display: flex;
        }
        .file-upload {
            border: 3px dashed #667eea;
            border-radius: 15px;
            padding: 40px;
            text-align: center;
            transition: all 0.3s ease;
            cursor: pointer;
        }
        .file-upload:hover {
            border-color: #764ba2;
            background: rgba(102, 126, 234, 0.05);
        }
        .file-upload i {
            font-size: 3rem;
            color: #667eea;
            margin-bottom: 15px;
        }
        .visualization-container {
            background: white;
            border-radius: 15px;
            padding: 20px;
            box-shadow: var(--card-shadow);
        }
        .visualization-container img {
            max-width: 100%;
            border-radius: 10px;
        }
        .report-box {
            background: #f8f9fa;
            border-radius: 15px;
            padding: 20px;
            font-family: 'Courier New', monospace;
            white-space: pre-wrap;
            border-left: 5px solid #667eea;
        }
        .settings-panel {
            background: linear-gradient(135deg, rgba(102, 126, 234, 0.1), rgba(118, 75, 162, 0.1));
            border-radius: 15px;
            padding: 20px;
            margin-bottom: 20px;
        }
        .footer {
            text-align: center;
            padding: 20px;
            color: #6c757d;
            margin-top: 30px;
        }
    </style>
</head>
<body>
    <div class="loading-overlay" id="loadingOverlay">
        <div class="spinner-border text-primary" role="status"></div>
        <p class="mt-3 text-primary fw-bold">Processing...</p>
    </div>

    <div class="main-container">
        <div class="header">
            <h1><i class="fas fa-search"></i> Duplicate Remover Tool</h1>
            <p>Find and remove duplicate data using Machine Learning techniques</p>
        </div>

        <ul class="nav nav-tabs" id="mainTabs" role="tablist">
            <li class="nav-item" role="presentation">
                <button class="nav-link active" id="upload-tab" data-bs-toggle="tab" data-bs-target="#upload" type="button" role="tab">
                    <i class="fas fa-upload me-2"></i>Upload Data
                </button>
            </li>
            <li class="nav-item" role="presentation">
                <button class="nav-link" id="analyze-tab" data-bs-toggle="tab" data-bs-target="#analyze" type="button" role="tab">
                    <i class="fas fa-search me-2"></i>Analyze
                </button>
            </li>
            <li class="nav-item" role="presentation">
                <button class="nav-link" id="clean-tab" data-bs-toggle="tab" data-bs-target="#clean" type="button" role="tab">
                    <i class="fas fa-broom me-2"></i>Clean Data
                </button>
            </li>
            <li class="nav-item" role="presentation">
                <button class="nav-link" id="visualize-tab" data-bs-toggle="tab" data-bs-target="#visualize" type="button" role="tab">
                    <i class="fas fa-chart-bar me-2"></i>Visualize
                </button>
            </li>
        </ul>

        <div class="tab-content" id="mainTabsContent">
            <!-- Upload Tab -->
            <div class="tab-pane fade show active" id="upload" role="tabpanel">
                <div class="row">
                    <div class="col-md-8">
                        <div class="card">
                            <div class="card-header">
                                <i class="fas fa-file-csv me-2"></i>Upload Dataset
                            </div>
                            <div class="card-body">
                                <form id="uploadForm" enctype="multipart/form-data">
                                    <div class="file-upload" onclick="document.getElementById('fileInput').click()">
                                        <i class="fas fa-cloud-upload-alt"></i>
                                        <h4>Drop your CSV file here</h4>
                                        <p class="text-muted">or click to browse</p>
                                        <input type="file" id="fileInput" name="file" accept=".csv" style="display: none;">
                                    </div>
                                    <div id="fileName" class="mt-3 text-center"></div>
                                    <button type="submit" class="btn btn-primary w-100 mt-3">
                                        <i class="fas fa-upload me-2"></i>Upload File
                                    </button>
                                </form>
                            </div>
                        </div>
                    </div>
                    <div class="col-md-4">
                        <div class="card">
                            <div class="card-header">
                                <i class="fas fa-database me-2"></i>Sample Data
                            </div>
                            <div class="card-body">
                                <p>Don't have a dataset? Try our sample data!</p>
                                <button class="btn btn-success w-100" onclick="loadSampleData()">
                                    <i class="fas fa-table me-2"></i>Load Sample Data
                                </button>
                            </div>
                        </div>
                        <div class="card">
                            <div class="card-header">
                                <i class="fas fa-info-circle me-2"></i>Info
                            </div>
                            <div class="card-body">
                                <div id="dataInfo">
                                    <p class="text-muted">No data loaded yet.</p>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
                <div id="dataPreview" class="mt-4" style="display: none;">
                    <div class="card">
                        <div class="card-header">
                            <i class="fas fa-table me-2"></i>Data Preview
                        </div>
                        <div class="card-body">
                            <div class="table-responsive" id="previewTable"></div>
                        </div>
                    </div>
                </div>
            </div>

            <!-- Analyze Tab -->
            <div class="tab-pane fade" id="analyze" role="tabpanel">
                <div class="settings-panel">
                    <h5><i class="fas fa-cog me-2"></i>Analysis Settings</h5>
                    <div class="row">
                        <div class="col-md-3">
                            <label class="form-label">Similarity Threshold</label>
                            <input type="range" class="form-range" id="similarityThreshold" min="0.5" max="1" step="0.05" value="0.9">
                            <span id="thresholdValue">0.9</span>
                        </div>
                        <div class="col-md-3">
                            <label class="form-label">Number of Clusters</label>
                            <input type="number" class="form-control" id="nClusters" min="2" max="20" value="10">
                        </div>
                        <div class="col-md-3">
                            <label class="form-label">Text Columns</label>
                            <select class="form-select" id="textColumns" multiple></select>
                        </div>
                        <div class="col-md-3">
                            <label class="form-label">Numeric Columns</label>
                            <select class="form-select" id="numericColumns" multiple></select>
                        </div>
                    </div>
                </div>
                <button class="btn btn-primary mb-4" onclick="analyzeData()">
                    <i class="fas fa-search me-2"></i>Analyze Dataset
                </button>
                <div id="analysisResults"></div>
            </div>

            <!-- Clean Tab -->
            <div class="tab-pane fade" id="clean" role="tabpanel">
                <div class="settings-panel">
                    <h5><i class="fas fa-cog me-2"></i>Cleaning Settings</h5>
                    <div class="row">
                        <div class="col-md-4">
                            <label class="form-label">Detection Method</label>
                            <select class="form-select" id="cleanMethod">
                                <option value="kmeans">K-means Clustering</option>
                                <option value="knn">K-Nearest Neighbors</option>
                                <option value="classification">Classification</option>
                            </select>
                        </div>
                        <div class="col-md-4">
                            <label class="form-label">Text Columns to Clean</label>
                            <select class="form-select" id="cleanTextColumns" multiple></select>
                        </div>
                        <div class="col-md-4">
                            <label class="form-label">Numeric Columns to Clean</label>
                            <select class="form-select" id="cleanNumericColumns" multiple></select>
                        </div>
                    </div>
                </div>
                <button class="btn btn-success mb-4" onclick="cleanData()">
                    <i class="fas fa-broom me-2"></i>Clean Dataset
                </button>
                <div id="cleanResults"></div>
            </div>

            <!-- Visualize Tab -->
            <div class="tab-pane fade" id="visualize" role="tabpanel">
                <div class="row">
                    <div class="col-md-6">
                        <button class="btn btn-primary w-100 mb-3" onclick="showClusterVisualization()">
                            <i class="fas fa-project-diagram me-2"></i>Show Cluster Visualization
                        </button>
                        <div id="clusterViz" class="visualization-container"></div>
                    </div>
                    <div class="col-md-6">
                        <button class="btn btn-primary w-100 mb-3" onclick="compareModels()">
                            <i class="fas fa-chart-bar me-2"></i>Compare ML Models
                        </button>
                        <div id="modelComparison" class="visualization-container"></div>
                    </div>
                </div>
            </div>
        </div>

        <div class="footer">
            <p><i class="fas fa-code me-2"></i>Duplicate Remover Tool | Built with Flask & Machine Learning</p>
        </div>
    </div>

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
    <script>
        // File input handling
        document.getElementById('fileInput').addEventListener('change', function(e) {
            var fileName = e.target.files[0] ? e.target.files[0].name : '';
            document.getElementById('fileName').innerHTML = fileName ?
                '<span class="badge bg-success"><i class="fas fa-check me-2"></i>' + fileName + '</span>' : '';
        });

        // Slider value display
        document.getElementById('similarityThreshold').addEventListener('input', function(e) {
            document.getElementById('thresholdValue').textContent = e.target.value;
        });

        // Show loading overlay
        function showLoading() {
            document.getElementById('loadingOverlay').classList.add('show');
        }

        // Hide loading overlay
        function hideLoading() {
            document.getElementById('loadingOverlay').classList.remove('show');
        }

        // Upload form handling
        document.getElementById('uploadForm').addEventListener('submit', function(e) {
            e.preventDefault();
            var formData = new FormData();
            var fileInput = document.getElementById('fileInput');

            if (!fileInput.files[0]) {
                alert('Please select a file first!');
                return;
            }

            formData.append('file', fileInput.files[0]);
            showLoading();

            fetch('/upload', {
                method: 'POST',
                body: formData
            })
            .then(response => response.json())
            .then(data => {
                hideLoading();
                if (data.success) {
                    updateDataInfo(data);
                    updateColumnSelects(data.columns);
                    showDataPreview(data.preview);
                } else {
                    alert('Error: ' + data.error);
                }
            })
            .catch(error => {
                hideLoading();
                alert('Error uploading file: ' + error);
            });
        });

        // Load sample data
        function loadSampleData() {
            showLoading();
            fetch('/load_sample')
            .then(response => response.json())
            .then(data => {
                hideLoading();
                if (data.success) {
                    updateDataInfo(data);
                    updateColumnSelects(data.columns);
                    showDataPreview(data.preview);
                } else {
                    alert('Error: ' + data.error);
                }
            })
            .catch(error => {
                hideLoading();
                alert('Error loading sample data: ' + error);
            });
        }

        // Update data info display
        function updateDataInfo(data) {
            document.getElementById('dataInfo').innerHTML = `
                <div class="metric-card mb-2">
                    <h3>${data.rows}</h3>
                    <p>Total Rows</p>
                </div>
                <div class="metric-card mb-2">
                    <h3>${data.cols}</h3>
                    <p>Total Columns</p>
                </div>
            `;
        }

        // Update column select dropdowns
        function updateColumnSelects(columns) {
            // Rebuild the four column pickers from the server's column lists.
            //   columns.text    -> 'textColumns', 'cleanTextColumns' (nothing pre-selected)
            //   columns.numeric -> 'numericColumns', 'cleanNumericColumns' (all pre-selected)
            // Options are created through the DOM rather than an HTML string, so
            // column names containing quotes, "<" or "&" cannot break the markup.
            columns = columns || {};
            var textCols = columns.text || [];
            var numCols = columns.numeric || [];

            function fillSelects(ids, names, preselect) {
                ids.forEach(function(id) {
                    var select = document.getElementById(id);
                    select.innerHTML = '';
                    names.forEach(function(name) {
                        // Option(text, value, defaultSelected, selected)
                        select.add(new Option(name, name, preselect, preselect));
                    });
                });
            }

            fillSelects(['textColumns', 'cleanTextColumns'], textCols, false);
            fillSelects(['numericColumns', 'cleanNumericColumns'], numCols, true);
        }

        // Show data preview
        function showDataPreview(preview) {
            document.getElementById('dataPreview').style.display = 'block';
            document.getElementById('previewTable').innerHTML = preview;
        }

        // Analyze data
        function analyzeData() {
            showLoading();
            var params = {
                similarity_threshold: document.getElementById('similarityThreshold').value,
                n_clusters: document.getElementById('nClusters').value,
                text_columns: Array.from(document.getElementById('textColumns').selectedOptions).map(o => o.value),
                numeric_columns: Array.from(document.getElementById('numericColumns').selectedOptions).map(o => o.value)
            };

            fetch('/analyze', {
                method: 'POST',
                headers: {'Content-Type': 'application/json'},
                body: JSON.stringify(params)
            })
            .then(response => response.json())
            .then(data => {
                hideLoading();
                if (data.success) {
                    displayAnalysisResults(data);
                } else {
                    alert('Error: ' + data.error);
                }
            })
            .catch(error => {
                hideLoading();
                alert('Error analyzing data: ' + error);
            });
        }

        // Display analysis results
        function displayAnalysisResults(data) {
            var html = `
                <div class="row mb-4">
                    <div class="col">
                        <div class="metric-card">
                            <h3>${data.analysis.exact_duplicates}</h3>
                            <p>Exact Duplicates</p>
                        </div>
                    </div>
                    <div class="col">
                        <div class="metric-card">
                            <h3>${data.analysis.text_duplicates}</h3>
                            <p>Text Duplicates</p>
                        </div>
                    </div>
                    <div class="col">
                        <div class="metric-card">
                            <h3>${data.analysis.near_duplicates_kmeans}</h3>
                            <p>K-means Duplicates</p>
                        </div>
                    </div>
                    <div class="col">
                        <div class="metric-card">
                            <h3>${data.analysis.near_duplicates_knn}</h3>
                            <p>KNN Duplicates</p>
                        </div>
                    </div>
                    <div class="col">
                        <div class="metric-card">
                            <h3>${data.analysis.classification_duplicates}</h3>
                            <p>Classification Duplicates</p>
                        </div>
                    </div>
                </div>
                <div class="card">
                    <div class="card-header">
                        <i class="fas fa-file-alt me-2"></i>Detailed Report
                    </div>
                    <div class="card-body">
                        <div class="report-box">${data.report}</div>
                    </div>
                </div>
            `;
            document.getElementById('analysisResults').innerHTML = html;
        }

        // Clean data
        function cleanData() {
            showLoading();
            var params = {
                method: document.getElementById('cleanMethod').value,
                similarity_threshold: document.getElementById('similarityThreshold').value,
                text_columns: Array.from(document.getElementById('cleanTextColumns').selectedOptions).map(o => o.value),
                numeric_columns: Array.from(document.getElementById('cleanNumericColumns').selectedOptions).map(o => o.value)
            };

            fetch('/clean', {
                method: 'POST',
                headers: {'Content-Type': 'application/json'},
                body: JSON.stringify(params)
            })
            .then(response => response.json())
            .then(data => {
                hideLoading();
                if (data.success) {
                    displayCleanResults(data);
                } else {
                    alert('Error: ' + data.error);
                }
            })
            .catch(error => {
                hideLoading();
                alert('Error cleaning data: ' + error);
            });
        }

        // Display clean results
        function displayCleanResults(data) {
            var html = `
                <div class="alert alert-success">
                    <i class="fas fa-check-circle me-2"></i>
                    Successfully removed ${data.removed_count} duplicate entries!
                </div>
                <div class="row mb-4">
                    <div class="col-md-6">
                        <div class="metric-card">
                            <h3>${data.original_rows}</h3>
                            <p>Original Rows</p>
                        </div>
                    </div>
                    <div class="col-md-6">
                        <div class="metric-card">
                            <h3>${data.cleaned_rows}</h3>
                            <p>Cleaned Rows</p>
                        </div>
                    </div>
                </div>
                <div class="card">
                    <div class="card-header">
                        <i class="fas fa-table me-2"></i>Cleaned Data Preview
                    </div>
                    <div class="card-body">
                        <div class="table-responsive">${data.preview}</div>
                    </div>
                </div>
                <a href="/download" class="btn btn-success mt-3">
                    <i class="fas fa-download me-2"></i>Download Cleaned Dataset
                </a>
            `;
            document.getElementById('cleanResults').innerHTML = html;
        }

        // Show cluster visualization
        function showClusterVisualization() {
            showLoading();
            fetch('/visualize_clusters', {
                method: 'POST',
                headers: {'Content-Type': 'application/json'},
                body: JSON.stringify({
                    numeric_columns: Array.from(document.getElementById('numericColumns').selectedOptions).map(o => o.value)
                })
            })
            .then(response => response.json())
            .then(data => {
                hideLoading();
                if (data.success && data.image) {
                    document.getElementById('clusterViz').innerHTML =
                        '<img src="data:image/png;base64,' + data.image + '" alt="Cluster Visualization">';
                } else {
                    document.getElementById('clusterViz').innerHTML =
                        '<div class="alert alert-warning">' + (data.error || 'Could not generate visualization') + '</div>';
                }
            })
            .catch(error => {
                hideLoading();
                alert('Error: ' + error);
            });
        }

        // Compare models
        function compareModels() {
            showLoading();
            fetch('/compare_models', {
                method: 'POST',
                headers: {'Content-Type': 'application/json'},
                body: JSON.stringify({
                    numeric_columns: Array.from(document.getElementById('numericColumns').selectedOptions).map(o => o.value)
                })
            })
            .then(response => response.json())
            .then(data => {
                hideLoading();
                if (data.success && data.image) {
                    var html = '<img src="data:image/png;base64,' + data.image + '" alt="Model Comparison">';
                    if (data.results) {
                        html += '<table class="table mt-3"><thead><tr><th>Model</th><th>Accuracy</th></tr></thead><tbody>';
                        for (var model in data.results) {
                            html += '<tr><td>' + model + '</td><td>' + data.results[model].toFixed(4) + '</td></tr>';
                        }
                        html += '</tbody></table>';
                    }
                    document.getElementById('modelComparison').innerHTML = html;
                } else {
                    document.getElementById('modelComparison').innerHTML =
                        '<div class="alert alert-warning">' + (data.error || 'Could not compare models') + '</div>';
                }
            })
            .catch(error => {
                hideLoading();
                alert('Error: ' + error);
            });
        }
    </script>
</body>
</html>
'''


@app.route('/')
def index():
    """Serve the single-page UI by rendering the inline ``HTML_TEMPLATE``."""
    page = render_template_string(HTML_TEMPLATE)
    return page


@app.route('/upload', methods=['POST'])
def upload_file():
    """Load an uploaded CSV file as the active dataset.

    Expects a multipart form field named ``file``. On success the parsed
    DataFrame is stored in ``app_data['df']`` and any previously cleaned
    dataset is discarded.

    Returns:
        A JSON response. On success: ``success: True``, the row and column
        counts, the object-dtype (``text``) and numeric column names, and an
        HTML preview of the first 10 rows. Otherwise ``success: False`` with an
        ``error`` message (no file sent, not a ``.csv`` file, or a parsing
        failure).
    """
    try:
        # .get() lets a missing form field produce a specific message instead
        # of the generic Bad Request text raised by request.files['file'].
        file = request.files.get('file')
        if file is None or not file.filename:
            return jsonify({'success': False, 'error': 'No file provided'})

        # Compare case-insensitively so names such as "DATA.CSV" are accepted.
        if not file.filename.lower().endswith('.csv'):
            return jsonify({'success': False, 'error': 'Invalid file format'})

        df = pd.read_csv(file)
        app_data['df'] = df
        # A cleaned dataset derived from the previous upload must not remain
        # downloadable once a different dataset is active.
        app_data['cleaned_df'] = None

        preview_html = df.head(10).to_html(classes='table table-striped table-hover', index=False)

        return jsonify({
            'success': True,
            'rows': len(df),
            'cols': len(df.columns),
            'columns': {
                'text': df.select_dtypes(include=['object']).columns.tolist(),
                'numeric': df.select_dtypes(include=[np.number]).columns.tolist()
            },
            'preview': preview_html
        })
    except Exception as e:
        # Parsing problems (empty file, bad encoding, ...) are reported to the
        # client as a JSON error rather than an HTTP failure.
        return jsonify({'success': False, 'error': str(e)})


@app.route('/load_sample', methods=['GET'])
def load_sample():
    """Install a small built-in demo dataset as the active dataset.

    The sample has 8 rows and 5 columns (Name, Age, City, Salary, Department)
    and deliberately contains repeated records. It is stored in
    ``app_data['df']`` and any previously cleaned dataset is discarded.

    Returns:
        A JSON response with ``success: True``, the row and column counts, the
        object-dtype (``text``) and numeric column names, and an HTML preview
        of the whole sample; or ``success: False`` with an ``error`` message.
    """
    try:
        sample_data = {
            'Name': ['John', 'Jane', 'John', 'Bob', 'Jane', 'Alice', 'Charlie', 'John'],
            'Age': [25, 30, 25, 35, 30, 28, 40, 25],
            'City': ['NYC', 'LA', 'NYC', 'Chicago', 'LA', 'Boston', 'Seattle', 'NYC'],
            'Salary': [50000, 60000, 50000, 70000, 60000, 55000, 80000, 50000],
            'Department': ['IT', 'HR', 'IT', 'Finance', 'HR', 'Marketing', 'IT', 'IT']
        }
        df = pd.DataFrame(sample_data)
        app_data['df'] = df
        # A cleaned dataset derived from whatever was loaded before must not
        # remain downloadable once the sample becomes the active dataset.
        app_data['cleaned_df'] = None

        # The sample is tiny, so the preview shows every row.
        preview_html = df.to_html(classes='table table-striped table-hover', index=False)

        return jsonify({
            'success': True,
            'rows': len(df),
            'cols': len(df.columns),
            'columns': {
                'text': df.select_dtypes(include=['object']).columns.tolist(),
                'numeric': df.select_dtypes(include=[np.number]).columns.tolist()
            },
            'preview': preview_html
        })
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)})


@app.route('/analyze', methods=['POST'])
def analyze():
    """Run duplicate analysis on the active dataset.

    Reads a JSON body with optional keys ``similarity_threshold`` (default
    0.9), ``n_clusters`` (default 10), ``text_columns`` and
    ``numeric_columns``. Empty column lists are passed on as ``None``. The
    full analysis result is kept in ``app_data['analysis']``.

    Returns:
        A JSON response with ``success: True``, an ``analysis`` summary
        (total rows plus one count per detection strategy) and the generated
        ``report``; or ``success: False`` with an ``error`` message when no
        dataset is loaded or the analysis fails.
    """
    try:
        if app_data['df'] is None:
            return jsonify({'success': False, 'error': 'No data loaded'})

        params = request.json
        df = app_data['df']

        remover = CombinedDuplicateRemover(
            similarity_threshold=float(params.get('similarity_threshold', 0.9))
        )
        # Never ask for more clusters than there are rows, and never for fewer
        # than one (a requested 0/negative value or an empty frame would
        # otherwise yield an invalid cluster count).
        requested_clusters = int(params.get('n_clusters', 10))
        remover.text_remover.n_clusters = max(1, min(requested_clusters, len(df)))

        text_cols = params.get('text_columns', [])
        numeric_cols = params.get('numeric_columns', [])

        analysis = remover.analyze_dataset(
            df,
            text_columns=text_cols if text_cols else None,
            numeric_columns=numeric_cols if numeric_cols else None
        )
        app_data['analysis'] = analysis

        report = remover.generate_report(analysis)

        def _to_builtin(value):
            # Defensive: numpy scalars (e.g. an int64 produced by a pandas
            # ``.sum()``) are not JSON-serializable, so convert them to plain
            # Python values. Other values pass through untouched.
            return value.item() if isinstance(value, np.generic) else value

        summary_keys = (
            'total_rows',
            'exact_duplicates',
            'text_duplicates',
            'near_duplicates_kmeans',
            'near_duplicates_knn',
            'classification_duplicates',
        )

        return jsonify({
            'success': True,
            'analysis': {key: _to_builtin(analysis[key]) for key in summary_keys},
            'report': report
        })
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)})


@app.route('/clean', methods=['POST'])
def clean():
    """Remove duplicates from the active dataset and summarise the outcome.

    Reads a JSON body with optional keys ``similarity_threshold`` (default
    0.9), ``method`` (default ``'kmeans'``), ``text_columns`` and
    ``numeric_columns``; empty column lists are forwarded as ``None``. The
    cleaned frame is stored in ``app_data['cleaned_df']``.

    Returns:
        A JSON response with ``success: True``, the row counts before and
        after cleaning, the number of removed rows and an HTML preview of the
        first 10 cleaned rows; or ``success: False`` with an ``error`` message.
    """
    try:
        if app_data['df'] is None:
            return jsonify({'success': False, 'error': 'No data loaded'})

        payload = request.json
        source_df = app_data['df']
        rows_before = len(source_df)

        threshold = float(payload.get('similarity_threshold', 0.9))
        remover = CombinedDuplicateRemover(similarity_threshold=threshold)

        # An empty or missing selection means "let the remover decide".
        selected_text = payload.get('text_columns', []) or None
        selected_numeric = payload.get('numeric_columns', []) or None

        cleaned_df, removed_count = remover.clean_dataset(
            source_df,
            text_columns=selected_text,
            numeric_columns=selected_numeric,
            method=payload.get('method', 'kmeans')
        )
        app_data['cleaned_df'] = cleaned_df

        preview_html = cleaned_df.head(10).to_html(
            classes='table table-striped table-hover',
            index=False
        )

        response = {
            'success': True,
            'original_rows': rows_before,
            'cleaned_rows': len(cleaned_df),
            'removed_count': removed_count,
            'preview': preview_html
        }
        return jsonify(response)
    except Exception as err:
        return jsonify({'success': False, 'error': str(err)})


@app.route('/download')
def download():
    """Send the most recently cleaned dataset as ``cleaned_dataset.csv``.

    Returns:
        A CSV attachment built from ``app_data['cleaned_df']``; a plain-text
        404 when no cleaned dataset exists; a plain-text 500 carrying the
        exception message if serialisation fails.
    """
    try:
        cleaned = app_data['cleaned_df']
        if cleaned is None:
            return "No cleaned data available", 404

        # Serialise into memory and rewind so send_file reads from the start.
        csv_stream = io.BytesIO()
        cleaned.to_csv(csv_stream, index=False)
        csv_stream.seek(0)

        attachment = send_file(
            csv_stream,
            mimetype='text/csv',
            as_attachment=True,
            download_name='cleaned_dataset.csv'
        )
        return attachment
    except Exception as err:
        return str(err), 500


@app.route('/visualize_clusters', methods=['POST'])
def visualize_clusters():
    """Render a cluster plot of the active dataset as a base64-encoded image.

    Reads ``numeric_columns`` from the JSON body; when it is empty or absent,
    every numeric column of the dataset is used. At least two columns are
    required.

    Returns:
        A JSON response with ``success: True`` and ``image`` (the output of
        ``fig_to_base64``), or ``success: False`` with an ``error`` message.
    """
    try:
        dataset = app_data['df']
        if dataset is None:
            return jsonify({'success': False, 'error': 'No data loaded'})

        payload = request.json
        columns = payload.get('numeric_columns', [])
        if not columns:
            # Fall back to every numeric column in the dataset.
            columns = app_data['df'].select_dtypes(include=[np.number]).columns.tolist()

        if len(columns) < 2:
            return jsonify({'success': False, 'error': 'Need at least 2 numeric columns'})

        figure = TabularDuplicateRemover().visualize_clusters(app_data['df'], columns)
        if not figure:
            return jsonify({'success': False, 'error': 'Could not generate visualization'})

        encoded = fig_to_base64(figure)
        return jsonify({'success': True, 'image': encoded})
    except Exception as err:
        return jsonify({'success': False, 'error': str(err)})


@app.route('/compare_models', methods=['POST'])
def compare_models():
    """Compare the classification models on the active dataset.

    Reads ``numeric_columns`` from the JSON body; when it is empty or absent,
    every numeric column of the dataset is used.

    Returns:
        A JSON response with ``success: True``, the ``results`` returned by
        the detector and ``image`` (a base64 plot, or ``None`` when no figure
        was produced); or ``success: False`` with an ``error`` message.
    """
    try:
        dataset = app_data['df']
        if dataset is None:
            return jsonify({'success': False, 'error': 'No data loaded'})

        payload = request.json
        columns = payload.get('numeric_columns', [])
        if not columns:
            # Fall back to every numeric column in the dataset.
            columns = app_data['df'].select_dtypes(include=[np.number]).columns.tolist()

        detector = ClassificationDuplicateDetector()
        results = detector.compare_models(app_data['df'], columns)
        if not results:
            return jsonify({'success': False, 'error': 'Could not compare models'})

        figure = detector.plot_model_comparison(results)
        if figure:
            encoded = fig_to_base64(figure)
        else:
            encoded = None

        return jsonify({
            'success': True,
            'results': results,
            'image': encoded
        })
    except Exception as err:
        return jsonify({'success': False, 'error': str(err)})


if __name__ == '__main__':
    # Script entry point: serve the app on port 5000 with debug mode off.
    app.run(port=5000, debug=False)

Writing flask_app.py


In [None]:
# =============================================================================
# CELL 6: Alternative - Run Streamlit with ngrok (Requires free signup)
# =============================================================================


from pyngrok import ngrok
import subprocess
import time

# Set your ngrok auth token (get free token from https://ngrok.com/)
ngrok.set_auth_token("YOUR NGROK TOKEN")

# Kill any existing processes
!pkill -f streamlit
!pkill -f ngrok

# # Start Streamlit
streamlit_process = subprocess.Popen(
     ['streamlit', 'run', 'streamlit_app.py', '--server.port', '8501', '--server.headless', 'true'],
     stdout=subprocess.PIPE,
     stderr=subprocess.PIPE
 )

time.sleep(5)

 # Create ngrok tunnel
public_url = ngrok.connect(8501)
print(f"\n{'=' * 50}")
print(f"üöÄ Streamlit App is running at: {public_url}")
print(f"{'=' * 50}\n")


üöÄ Streamlit App is running at: NgrokTunnel: "https://unprodded-yevette-philosophically.ngrok-free.dev" -> "http://localhost:8501"



In [None]:
# =============================================================================
# CELL 7: Run Flask Application
# =============================================================================

# Expose the Flask app through a pyngrok tunnel and serve it from this cell.
import getpass
import os
import threading

from pyngrok import ngrok

print("Starting Flask application...")
print("=" * 50)

# SECURITY: never hardcode the ngrok auth token in the notebook. The literal
# token that used to live in this cell must be treated as leaked and revoked in
# the ngrok dashboard. Read it from the environment or prompt for it instead.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

# Import Flask app. flask_app.py is produced by the earlier %%writefile cell;
# if that cell has not been run in this session the import fails, so turn the
# bare ModuleNotFoundError into an actionable message.
try:
    from flask_app import app
except ModuleNotFoundError as exc:
    if exc.name != 'flask_app':
        # A dependency of flask_app is missing; surface that error unchanged.
        raise
    raise ModuleNotFoundError(
        "flask_app.py was not found - run the cell that writes flask_app.py before this one",
        name='flask_app',
    ) from exc

# Start ngrok tunnel
public_url = ngrok.connect(5000)
print(f"\n🚀 Flask App is running at: {public_url}")
print("=" * 50)

# Run Flask app (blocks this cell until the kernel is interrupted)
app.run(port=5000)

Starting Flask application...


ModuleNotFoundError: No module named 'flask_app'