<a href="https://colab.research.google.com/github/Bimalv01/python/blob/main/corrupt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset paths below resolve.
# This only works inside a Google Colab runtime (uses google.colab.drive).
drive.mount('/content/drive')

# Input dataset and destination of the corrupted copy, both on the mounted
# Drive. These module-level names are passed to corrupt_dataset() below.
# NOTE(review): "cuurrupted" in the output file name looks like a typo for
# "corrupted"; it is left untouched here because it is the name actually
# written to Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/feedback_dataset.csv'
output_path = '/content/drive/MyDrive/Colab Notebooks/cuurrupted_feedback_dataset.csv'

# Output formats used when rewriting dates; the first one is plain ISO.
_DATE_FORMATS = ('%Y-%m-%d', '%d/%m/%Y', '%m-%d-%Y')


def _sample_row_mask(df, fraction):
    """Return a boolean array flagging a random ``fraction`` of ``df``'s rows.

    Rows are drawn with ``DataFrame.sample``, so the selection follows NumPy's
    global random state (seeded by ``corrupt_dataset``).
    """
    return df.index.isin(df.sample(frac=fraction).index)


def _introduce_missing_values(df, fraction=0.1):
    """Return a copy of ``df`` with a random ``fraction`` of each column blanked."""
    corrupted = df.copy()
    for col in corrupted.columns:
        # Series.mask upcasts the column as needed (e.g. int -> float) instead
        # of assigning NaN into a column whose dtype cannot hold it.
        corrupted[col] = corrupted[col].mask(_sample_row_mask(corrupted, fraction))
    return corrupted


def _introduce_duplicates(df, fraction=0.1):
    """Return ``df`` with ``int(len(df) * fraction)`` randomly repeated rows appended."""
    extra_rows = df.sample(n=int(len(df) * fraction), replace=True)
    return pd.concat([df, extra_rows], ignore_index=True)


def _scale_numeric_cells(df, fraction, low, high, rng):
    """Return a copy of ``df`` with some numeric cells multiplied by a random factor.

    For every numeric column, a random ``fraction`` of rows is multiplied by a
    single factor drawn uniformly from ``[low, high]`` using ``rng``.
    """
    corrupted = df.copy()
    for col in corrupted.select_dtypes(include=[np.number]).columns:
        selected = _sample_row_mask(corrupted, fraction)
        factor = rng.uniform(low, high)
        if selected.any():
            # where() builds a new column with a dtype able to hold the scaled
            # values, so integer columns are upcast explicitly rather than via
            # an incompatible in-place assignment.
            corrupted[col] = corrupted[col].where(~selected, corrupted[col] * factor)
    return corrupted


def _introduce_incorrect_datatypes(df, fraction=0.05):
    """Return a copy of ``df`` where some numeric cells become ``"<value>_error"`` strings."""
    corrupted = df.copy()
    for col in corrupted.select_dtypes(include=[np.number]).columns:
        selected = _sample_row_mask(corrupted, fraction)
        if selected.any():
            tagged = corrupted[col].astype(str) + "_error"
            # Convert to object first so the column can legitimately hold a mix
            # of numbers and strings.
            corrupted[col] = corrupted[col].astype(object).where(~selected, tagged)
    return corrupted


def _introduce_date_format_errors(df, date_col, rng, fraction=0.1):
    """Return a copy of ``df`` with some dates shifted and re-written in mixed formats.

    A random ``fraction`` of rows in ``date_col`` is replaced by the parsed
    date plus 0-365 days, rendered with a format picked from ``_DATE_FORMATS``.
    Rows that are not selected, and selected rows whose value cannot be parsed
    (missing values become NaT), keep their current value.
    """
    corrupted = df.copy()
    parsed = pd.to_datetime(corrupted[date_col], errors='coerce')
    # Work on an object column: assigning a date string into a datetime64
    # column would parse it straight back into a timestamp and discard the
    # deliberately inconsistent formatting.
    values = corrupted[date_col].astype(object)
    for pos in np.flatnonzero(_sample_row_mask(corrupted, fraction)):
        stamp = parsed.iloc[pos]
        if pd.isna(stamp):
            # NaT does not support strftime; leave missing dates untouched.
            continue
        shifted = stamp + timedelta(days=rng.randint(0, 365))
        values.iloc[pos] = shifted.strftime(rng.choice(_DATE_FORMATS))
    corrupted[date_col] = values
    return corrupted


def _find_date_column(df):
    """Return the first datetime/string column of ``df`` that parses as dates, else None."""
    for col in df.columns:
        series = df[col]
        if not (pd.api.types.is_datetime64_any_dtype(series)
                or pd.api.types.is_string_dtype(series)):
            continue
        try:
            pd.to_datetime(series)
        except (ValueError, TypeError, OverflowError):
            # Not a date column; keep looking.
            continue
        return col
    return None


def corrupt_dataset(file_path, output_path):
    """Read a tab-separated dataset, inject data-quality problems, and save it.

    The following corruptions are applied in order: missing values (10% of
    each column), duplicated rows (10%), scaled numeric errors (5%, factor
    1.5-2.0), numeric outliers (1%, factor 5-10), numeric cells turned into
    ``"<value>_error"`` strings (5%) and, if a date column is detected in the
    original data, shifted dates written in inconsistent formats (10%).

    Both NumPy's global random state and a local ``random.Random`` are seeded
    with 42, so repeated runs on the same input produce the same output.

    Args:
        file_path: Path of the tab-delimited input file.
        output_path: Path the corrupted dataset is written to as CSV.

    Returns:
        None. If the input cannot be parsed, an error is printed and nothing
        is written.
    """
    # Input is parsed as tab-separated; malformed lines are skipped.
    try:
        df = pd.read_csv(file_path, delimiter='\t', on_bad_lines='skip')
    except pd.errors.ParserError as e:
        print(f"Error reading the file: {e}")
        return

    # DataFrame.sample draws from NumPy's global state; the scaling factors,
    # day offsets and format choices come from the stdlib generator. Seed both
    # so the whole pipeline is reproducible.
    np.random.seed(42)
    rng = random.Random(42)

    # Apply corruptions
    df_corrupted = _introduce_missing_values(df)
    df_corrupted = _introduce_duplicates(df_corrupted)
    df_corrupted = _scale_numeric_cells(df_corrupted, 0.05, 1.5, 2.0, rng)
    df_corrupted = _scale_numeric_cells(df_corrupted, 0.01, 5.0, 10.0, rng)
    df_corrupted = _introduce_incorrect_datatypes(df_corrupted)

    # Detect the date column on the pristine data (before NaNs were injected),
    # then corrupt it in the working copy.
    date_column = _find_date_column(df)
    if date_column is not None:
        df_corrupted = _introduce_date_format_errors(df_corrupted, date_column, rng)

    # Save the corrupted dataset
    df_corrupted.to_csv(output_path, index=False)
    print(f"Corrupted dataset saved to {output_path}")

# Run the corruption pipeline on the Drive-hosted dataset; the result is
# written to output_path.
corrupt_dataset(file_path, output_path)

# Trigger a browser download of the corrupted file from the Colab runtime.
# NOTE(review): corrupt_dataset() returns without writing anything when the
# input cannot be parsed, in which case this download would fail or pick up a
# file left over from an earlier run - confirm whether a guard is wanted.
from google.colab import files
files.download(output_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Corrupted dataset saved to /content/drive/MyDrive/Colab Notebooks/cuurrupted_feedback_dataset.csv


  pd.to_datetime(df[col])


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>