In [1]:
import pandas as pd
import os

# Folder holding the raw exported CSVs and folder receiving the cleaned copies.
input_folder = r"C:\Users\ANSHU YADAV\Desktop\Assignment2\Renameaslinkedin"
output_folder = r"C:\Users\ANSHU YADAV\Desktop\Assignment2\CleanedCsvfile"
os.makedirs(output_folder, exist_ok=True)

# Match the extension case-insensitively so "X.CSV" is not skipped.
csv_files = [f for f in os.listdir(input_folder) if f.lower().endswith(".csv")]

# Encodings are tried in order until one succeeds.
# NOTE(review): 'ISO-8859-1' and 'latin1' name the same codec, and that codec
# can decode any byte sequence, so 'latin1' and 'cp1252' are never reached for
# decoding failures. The order is kept as-is so the chosen encoding does not
# change; consider moving 'cp1252' ahead of 'ISO-8859-1' if Windows-exported
# files show wrong punctuation characters.
fallback_encodings = ['utf-8', 'ISO-8859-1', 'latin1', 'cp1252']

# Accepted (already lower-cased) header spellings for each logical column.
possible_first_names = ['first name']
possible_last_names = ['last name']
possible_companies = ['company']


def _find_header_index(lines) -> int:
    """Return the index of the first line naming both name columns, or -1."""
    for index, line in enumerate(lines):
        lowered = line.lower()
        if "first name" in lowered and "last name" in lowered:
            return index
    return -1


def _clean_contacts(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce a raw frame to unique, non-empty 'Full Name' / 'Company Name' rows.

    Raises:
        ValueError: if no first-name or no last-name column is present.
    """
    # Trim and normalize column names so lookups are case/space insensitive.
    df.columns = [str(col).strip().lower() for col in df.columns]

    first_name_col = next((c for c in df.columns if c in possible_first_names), None)
    last_name_col = next((c for c in df.columns if c in possible_last_names), None)
    company_col = next((c for c in df.columns if c in possible_companies), None)
    if first_name_col is None or last_name_col is None:
        raise ValueError("Required columns not found.")

    # astype(str) keeps the concatenation / .str accessor working even when
    # pandas inferred a non-string dtype for a column.
    first = df[first_name_col].fillna('').astype(str)
    last = df[last_name_col].fillna('').astype(str)
    df['Full Name'] = (first + ' ' + last).str.strip()
    if company_col:
        df['Company Name'] = df[company_col].fillna('').astype(str).str.strip()
    else:
        df['Company Name'] = ""

    # Select, filter and de-duplicate without in-place mutation of a slice
    # (the in-place form triggers pandas' SettingWithCopyWarning).
    kept = df.loc[df['Full Name'] != '', ['Full Name', 'Company Name']]
    return kept.drop_duplicates()


def _load_and_clean(input_path: str, encoding: str) -> pd.DataFrame:
    """Read one CSV with *encoding*, skipping any preamble above the header.

    Raises:
        UnicodeDecodeError: if the file cannot be decoded with *encoding*.
        ValueError: if no header line or required column is found.
    """
    # Read raw lines first to locate the real header row.
    with open(input_path, 'r', encoding=encoding) as handle:
        lines = handle.readlines()

    header_index = _find_header_index(lines)
    if header_index == -1:
        raise ValueError("No proper header found in file.")

    # Re-read from the detected header line.
    df = pd.read_csv(input_path, skiprows=header_index, encoding=encoding)
    return _clean_contacts(df)


for file in csv_files:
    input_path = os.path.join(input_folder, file)
    last_error = None

    for encoding in fallback_encodings:
        try:
            final_df = _load_and_clean(input_path, encoding)

            # Save cleaned CSV under the same file name.
            output_path = os.path.join(output_folder, file)
            final_df.to_csv(output_path, index=False)
        except Exception as exc:
            # Deliberate best-effort: remember why this attempt failed and
            # fall through to the next encoding instead of aborting the run.
            last_error = exc
            continue

        print(f"Cleaned and saved: {file} (encoding: {encoding})")
        break  # Stop trying other encodings once successful
    else:
        # Every encoding failed; report it instead of silently dropping the file.
        print(f"Skipped {file}: {type(last_error).__name__}: {last_error}")


Cleaned and saved: Aaditya Raj.csv (encoding: utf-8)
Cleaned and saved: Abhishek Singh.csv (encoding: utf-8)
Cleaned and saved: Aditya Singh.csv (encoding: utf-8)
Cleaned and saved: Afzl Raza.csv (encoding: utf-8)
Cleaned and saved: Ajay Jatav.csv (encoding: utf-8)
Cleaned and saved: Ajit Yadav.csv (encoding: utf-8)
Cleaned and saved: Akanksha Kushwaha.csv (encoding: utf-8)
Cleaned and saved: Alok Raj.csv (encoding: utf-8)
Cleaned and saved: Aman Adarsh.csv (encoding: utf-8)
Cleaned and saved: Aman Singh.csv (encoding: utf-8)
Cleaned and saved: Aman Verma.csv (encoding: utf-8)
Cleaned and saved: Amit Kumar.csv (encoding: utf-8)
Cleaned and saved: Anamika Kumari.csv (encoding: ISO-8859-1)
Cleaned and saved: Anand Pandey.csv (encoding: utf-8)
Cleaned and saved: Anand Singh.csv (encoding: utf-8)
Cleaned and saved: ANOOP KUMAR.csv (encoding: utf-8)
Cleaned and saved: Anshu Kumar.csv (encoding: utf-8)
Cleaned and saved: Anuradha Tiwari.csv (encoding: ISO-8859-1)
Cleaned and saved: Anushri M