In [None]:
import pandas as pd
import os
import sys
import re

# Compiled once at import time so repeated validation (e.g. through
# Series.apply) reuses the same pattern object.
_EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')


# Function to validate email addresses using regex
def is_valid_email(email):
    """
    Validate the given email address using a regular expression.

    The whole string must match the pattern: a local part of letters,
    digits and ``_ . + -``, an ``@``, a domain label, a dot, and a suffix
    of letters, digits, ``-`` and ``.``.

    Parameters:
    email (str): The email address to validate.

    Returns:
    bool: True if the email is valid, False otherwise (including when
    ``email`` is not a string).
    """
    # Non-string values (None, NaN, numbers) can never be valid addresses;
    # answering False avoids a TypeError from the regex engine.
    if not isinstance(email, str):
        return False
    # fullmatch is used instead of match with a trailing '$' because '$'
    # also matches just before a final newline, which let "a@b.com\n" pass.
    return _EMAIL_PATTERN.fullmatch(email) is not None

# Load the semicolon-separated source CSV into `df`; any read failure is
# reported on stdout and ends the script with exit status 1.
try:
    df = pd.read_csv("JapanLifeBear.csv", sep=';', low_memory=True)
    # A file with a header but no data rows loads fine yet is worth flagging.
    load_message = (
        "Warning: The input CSV file is empty."
        if df.empty
        else "CSV file loaded successfully."
    )
    print(load_message)
except pd.errors.EmptyDataError:
    # Raised by pandas when the file has no content to parse at all.
    print("Error: The file is empty.")
    sys.exit(1)
except FileNotFoundError:
    print("Error: The file 'JapanLifeBear.csv' was not found.")
    sys.exit(1)
except Exception as e:
    # Last-resort handler at the script boundary (parse errors, permissions, ...).
    print(f"An error occurred while reading the file: {e}")
    sys.exit(1)

# Drop rows with missing values in 'birthday_on' and 'gender' columns
df = df.dropna(subset=['birthday_on', 'gender'])

# Rename columns
df = df.rename(columns={'mail_address': 'email', 'birthday_on': 'date_of_birth'})

# Filter valid emails
# Ensure email column is string type (missing values become the text 'nan',
# which fails validation and therefore lands in the invalid set).
df['email'] = df['email'].astype(str)
# Validate every address once and reuse the mask for both splits instead of
# running the regex over the whole column twice. The explicit bool cast keeps
# the mask a boolean indexer even when no rows are left after the dropna.
valid_email_mask = df['email'].apply(is_valid_email).astype(bool)
df_valid_emails = df[valid_email_mask]  # Retain only valid emails
df_invalid_emails = df[~valid_email_mask]  # Retain only invalid emails

# Function to save a DataFrame to a CSV file
def save_dataframe_to_csv(df, file_path):
    """
    Saves a pandas DataFrame to a CSV file at the specified file path.

    An empty DataFrame is not written; a warning is printed instead.
    Missing parent directories of ``file_path`` are created before writing.
    Failures are reported with ``print`` rather than raised, so callers are
    never interrupted by a failed save.

    Parameters:
    df (pd.DataFrame): The DataFrame to save.
    file_path (str): The file path where the CSV file will be saved.

    Returns:
    None
    """
    # NOTE: the interpreter-wide recursion limit is deliberately left alone.
    # Writing a CSV does not recurse, and forcing the limit to a fixed value
    # on every call could even lower a higher limit set elsewhere.
    try:
        if df.empty:
            print("Warning: The DataFrame is empty. No file will be saved.")
            return

        # to_csv does not create directories; make sure the target folder
        # exists so a first run into a fresh location does not fail.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        df.to_csv(file_path, index=False)
        print(f"File saved successfully to {file_path}")
    except RecursionError as rec_err:
        print(f"Recursion error encountered: {rec_err}")
    except Exception as e:
        # Best-effort save: report the problem and let the caller continue.
        print(f"An error occurred: {e}")

# Ensure the garbage folder exists
garbage_folder = r"C:\Users\PROTEXXA\Desktop\LifeBear\LifeBearGarbage"
# exist_ok=True creates the folder (and any missing parents) in one step and
# avoids the check-then-create race of testing os.path.exists first.
os.makedirs(garbage_folder, exist_ok=True)

# Save invalid emails to the garbage folder
garbage_file_path = os.path.join(garbage_folder, "Invalid_emails.csv")
save_dataframe_to_csv(df_invalid_emails, garbage_file_path)

# Split the valid rows on the ('email', 'login_id') pair: every row whose pair
# occurs more than once is collected for reporting, and only the first
# occurrence of each pair stays in the cleaned set.
duplicate_keys = ['email', 'login_id']
duplicate_mask = df_valid_emails.duplicated(subset=duplicate_keys, keep=False)
df_duplicates = df_valid_emails.loc[duplicate_mask]
df_valid_emails = df_valid_emails.drop_duplicates(subset=duplicate_keys, keep='first')

# Write all duplicated rows next to the other rejected data
duplicates_file_path = os.path.join(garbage_folder, "Duplicate_entries.csv")
save_dataframe_to_csv(df_duplicates, duplicates_file_path)

# Save the DataFrame with valid and unique entries to the file
cleaned_file_path = r'C:\Users\PROTEXXA\Desktop\LifeBear\CleanedLifeBear\Clean_Data4.csv'  # Use raw string for file paths
# Make sure the output folder exists first: to_csv does not create
# directories, and a failed save is only printed, so the main result could
# otherwise be silently missing while the script still reports completion.
cleaned_folder = os.path.dirname(cleaned_file_path)
if cleaned_folder:
    os.makedirs(cleaned_folder, exist_ok=True)
save_dataframe_to_csv(df_valid_emails, cleaned_file_path)

print("Duplicate removal and saving process completed.")
