In [None]:
import pandas as pd
import numpy as np

def _text_columns(frame):
    """Return the labels of columns that hold text (object or pandas string dtype)."""
    return [
        name
        for name, dtype in frame.dtypes.items()
        if dtype == object or isinstance(dtype, pd.StringDtype)
    ]


def clean_dataframe(raw_df, min_non_null_frac=0.5):
    """Return a cleaned copy of ``raw_df``; the input frame is not modified.

    The steps run in an order where each one sees the result of the previous:

    1. Drop columns whose share of non-null values is below ``min_non_null_frac``.
    2. Parse columns whose name contains "date" (unparseable values become NaT).
    3. Convert text columns that hold only numbers to a numeric dtype.
    4. Strip whitespace from, and lower-case, the remaining text columns.
    5. Fill missing values: median for numeric columns, most frequent value
       for everything else (text, dates).
    6. Drop duplicate rows.
    7. Store the remaining text columns as ``category``.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame to clean.
    min_non_null_frac : float, default 0.5
        Minimum fraction of non-null values a column needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Row labels of the surviving rows are preserved.
    """
    # 1. Handling Missing Values: drop sparse columns.
    # `thresh` is documented as an int, so round the row count up; this keeps
    # exactly the columns that a float comparison `count >= n * frac` would keep.
    min_non_null = int(np.ceil(len(raw_df) * min_non_null_frac))
    cleaned = raw_df.dropna(thresh=min_non_null, axis=1).copy()

    # 2. Parse dates *before* filling, so values coerced to NaT are filled too.
    for col in cleaned.columns:
        if 'date' in str(col).lower():  # Assuming column names contain 'date'
            cleaned[col] = pd.to_datetime(cleaned[col], errors='coerce')

    # 3. Numbers stored as text. This has to happen while the columns are still
    # text: once they are cast to `category` they are no longer selected.
    for col in _text_columns(cleaned):
        try:
            cleaned[col] = pd.to_numeric(cleaned[col])
        except (ValueError, TypeError):
            continue  # Genuinely textual column: leave it unchanged.

    # 4. Standardize text (trim whitespace, lower-case). Done before filling and
    # de-duplicating so that "RETAIL" and " retail" count as the same value.
    # Non-string cells (e.g. NaN) pass through untouched.
    for col in _text_columns(cleaned):
        cleaned[col] = cleaned[col].map(
            lambda value: value.strip().lower() if isinstance(value, str) else value
        )

    # 5. Fill the remaining missing values.
    for col in cleaned.columns:
        column = cleaned[col]
        if not column.isna().any():
            continue  # Nothing to fill; skip computing median/mode.
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.median()
        else:
            modes = column.mode()
            if modes.empty:
                continue  # Column is entirely missing: no value to fill with.
            fill_value = modes.iloc[0]
        cleaned[col] = column.fillna(fill_value)

    # 6. Removing Duplicates (after normalisation, so near-duplicates collapse).
    cleaned = cleaned.drop_duplicates()

    # 7. Cast text columns to `category` last: `.str` operations on a
    # categorical column return an object column and would undo the cast.
    return cleaned.astype({col: 'category' for col in _text_columns(cleaned)})


# `__name__` is "__main__" inside a notebook kernel, so this section runs as
# usual in the cell while `clean_dataframe` stays importable and testable
# without touching the disk.
if __name__ == "__main__":
    # Load the dataset (replace with actual file path)
    file_path = "C:/Users/A/Desktop/bankdataset/bankdataset.csv"  # Update with your actual file path
    df = pd.read_csv(file_path)

    # Display basic info before cleaning
    print("Initial Data Info:")
    df.info()  # info() prints by itself; wrapping it in print() also printed "None"
    print(df.head())

    print("Missing values before cleaning:")
    print(df.isnull().sum())

    df = clean_dataframe(df)
    categorical_cols = df.select_dtypes(include=['category']).columns

    print("Data Cleaning Completed.")

    # Save the cleaned DataFrame to a new CSV file
    cleaned_file_path = "C:/Users/A/Desktop/bankdataset/cleaned_bankdataset.csv"  # Update with your desired file path
    df.to_csv(cleaned_file_path, index=False)
    print(f"Cleaned data saved to {cleaned_file_path}")


Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1004480 entries, 0 to 1004479
Data columns (total 5 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   Date               1004480 non-null  object
 1   Domain             1004480 non-null  object
 2   Location           1004480 non-null  object
 3   Value              1004480 non-null  int64 
 4   Transaction_count  1004480 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 38.3+ MB
None
       Date         Domain  Location   Value  Transaction_count
0  1/1/2022      RESTRAUNT      Bhuj  365554               1932
1  1/1/2022    INVESTMENTS  Ludhiana  847444               1721
2  1/1/2022         RETAIL       Goa  786941               1573
3  1/1/2022  INTERNATIONAL   Mathura  368610               2049
4  1/1/2022      RESTRAUNT   Madurai  615681               1519
Missing values before cleaning:
Date                 0
Domain               0
Location