# Data Cleaning Notebook

This notebook handles the cleaning of the `career_recommender.csv` dataset.
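It works around a CSV parsing problem (spaces before quoted fields, which would otherwise show up as extra delimiters), standardizes the column names, strips leading/trailing whitespace from string values, and saves the result as `cleaned_career_recommender.csv`.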

In [None]:
import pandas as pd
import os

# Dataset locations.
# The raw CSV is referenced through an absolute path; the cleaned copy is
# written into the same directory under a different file name.
input_path = r'D:\Data Science advance projects\Career Path Recommender\Dataset\career recommendation dataset\career_recommender.csv'
output_path = os.path.join(
    os.path.split(input_path)[0],  # directory part of the input path
    'cleaned_career_recommender.csv',
)

# Echo both locations so the cell output records which files are used.
print(
    f"Input file: {input_path}",
    f"Output file: {output_path}",
    sep="\n",
)

In [None]:
# Load the raw CSV, give its columns standardized names, strip surrounding
# whitespace from every string cell, and save the cleaned table to
# `output_path`.

# Standardized column names, in the order the columns appear in the raw file.
# Renaming is positional, so it is only applied when the column counts match.
new_columns = [
    'Name',
    'Gender',
    'UG_Course',
    'UG_Specialization',
    'Interests',
    'Skills',
    'CGPA',
    'Certifications',
    'Certification_Title',
    'Working',
    'Job_Title',
    'Masters',
]

# Only the read itself is guarded: the message in the handler is specific to
# reading, so failures in the later steps (rename, strip, save, display) must
# not be reported through it. Those now raise normally.
try:
    print('Attempting to read with skipinitialspace=True...')
    # skipinitialspace=True makes the parser skip spaces after a delimiter,
    # which handles fields whose opening quote is preceded by a space.
    df = pd.read_csv(input_path, skipinitialspace=True)
except Exception as e:
    # Top-level notebook boundary: report the problem instead of raising.
    print(f'Error reading with skipinitialspace=True: {e}')
else:
    print('Successfully read CSV with skipinitialspace=True')
    print(f'Shape: {df.shape}')
    print(f'Columns ({len(df.columns)}):')
    for i, col in enumerate(df.columns):
        print(f'{i}: {col}')

    if len(df.columns) == len(new_columns):
        df.columns = new_columns
        print('\nRenamed columns successfully.')

        # Remove leading/trailing whitespace from string cells; non-string
        # values are passed through unchanged.
        # DataFrame.map was added in pandas 2.1 (where applymap became
        # deprecated); fall back to applymap on older releases.
        if hasattr(pd.DataFrame, 'map'):
            apply_cellwise = df.map
        else:
            apply_cellwise = df.applymap
        df = apply_cellwise(lambda x: x.strip() if isinstance(x, str) else x)

        # Save the cleaned dataset (without the index column).
        df.to_csv(output_path, index=False)
        print(f'\nSaved cleaned dataset to: {output_path}')

        # Show a small sample of the cleaned data.
        print('\nSample Data:')
        display(df[['Name', 'Skills', 'Certification_Title']].head())
    else:
        print(f'\nColumn count mismatch. Expected {len(new_columns)}, got {len(df.columns)}')