In [2]:
import pandas as pd
import numpy as np

In [4]:
# Read the original CSV
data_path = "bc_data.csv"
df = pd.read_csv(data_path)


# 1. Introduce incorrect data types (e.g., converting a numerical column to a string)
numeric_columns = df.select_dtypes(include=np.number).columns
if len(numeric_columns) > 0:
    # Convert a numeric column to string type (introducing type inconsistency)
    column_to_modify = numeric_columns[2]
    df[column_to_modify] = df[column_to_modify].astype(str)
    print(f"Column '{column_to_modify}' converted to str data type")

# 2. Introduce missing values for entire rows
nan_percentage = 0.05  # 10% of the data will be set to NaN
nan_indices = np.random.choice(df.index, size=int(len(df) * nan_percentage), replace=False)
df.loc[nan_indices, :] = np.nan

# 3. Introduce missing values (randomly select rows and columns)
num_nans = 10  # Set how many NaN values you want to introduce
random_indices = np.random.choice(df.index, size=num_nans, replace=False)  # Random row indices
random_columns = np.random.choice(df.columns, size=num_nans, replace=False)  # Random columns
for idx, col in zip(random_indices, random_columns):
    df.loc[idx, col] = np.nan

# 4. Add duplicate rows (randomly choose some rows and append them to the DataFrame)
duplicates = df.sample(frac=0.1, random_state=42)  # 10% of the data will be duplicated
df = pd.concat([df, duplicates], ignore_index=True)

# 5. Introduce noisy data (typos or irrelevant values in a categorical column)
categorical_columns = df.select_dtypes(include=['object']).columns
if len(categorical_columns) > 0:
    cat_column = categorical_columns[0]
    noisy_indices = np.random.choice(df.index, size=5, replace=False)
    df.loc[noisy_indices, cat_column] = ['WrongValue' for _ in noisy_indices]

# Save the dirtified dataset to a new CSV for further use
df.to_csv('bc_data_dirtified.csv', index=False)

# Display the first few rows of the dirtified dataset
df.head()


Column 'texture_mean' converted to str data type


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,,,,,,,,,,,...,,,,,,,,,,
1,842517.0,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903.0,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301.0,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402.0,M,,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [5]:
# 1. Check for incorrect data types (numeric columns converted to string)
print("1.")
string_columns = df.select_dtypes(include=['object']).columns
for col in numeric_columns:
    if col in string_columns:
        print(f"Column '{col}' has been incorrectly converted to string.")

# 2.  Check if any row has all NaN values
nan_rows = df[df.isna().all(axis=1)]
print(f"\n2. Number of rows with all NaN values: {len(nan_rows)}")

# 3. Count total NaN values in the DataFrame
total_nans = df.isna().sum().sum()
print(f"\n3. Total NaN values in the DataFrame: {total_nans}")

# 4. Check for duplicated rows
duplicates = df[df.duplicated()]
print(f"\n4. Number of duplicated rows (excluding the first occurrence): {len(duplicates)}")

# 5. Check for noisy data ('WrongValue' in categorical columns)
noisy_data_check = df.isin(['WrongValue']).sum()
print("\n5. Occurrences of 'WrongValue' in categorical columns:")
print(noisy_data_check)



1.
Column 'texture_mean' has been incorrectly converted to string.

2. Number of rows with all NaN values: 30

3. Total NaN values in the DataFrame: 1596

4. Number of duplicated rows (excluding the first occurrence): 82

5. Occurrences of 'WrongValue' in categorical columns:
id                         0
diagnosis                  5
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            

In [None]:
import zipfile

# Zip the CSV file
zip_file_path = '/content/data/thyroid_cancer_risk_data_dirtified.zip'
csv_file_path = '/content/data/thyroid_cancer_risk_data_dirtified.csv'
with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(csv_file_path, arcname='thyroid_cancer_risk_data_dirtified.csv')

print(f"CSV file zipped to: {zip_file_path}")

CSV file zipped to: /content/data/thyroid_cancer_risk_data_dirtified.zip
