In [8]:
import pandas as pd

# Load the Activities dataset
def load_activities(file_path):
    """Read the dataset stored at ``file_path`` into a DataFrame.

    Args:
        file_path: Path to the input file. Paths ending in ``.xlsx`` or
            ``.xls`` (case-insensitive) are read with ``pd.read_excel``;
            anything else is read with ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    # str() lets callers pass either a plain string or a path-like object.
    if str(file_path).lower().endswith(('.xlsx', '.xls')):
        return pd.read_excel(file_path)
    return pd.read_csv(file_path)

# Clean the dataset

# Columns dropped before any other processing.
_COLUMNS_TO_REMOVE = ['EmployeeCount', 'Over18']

# CamelCase -> Snake_Case renames. Columns whose name would not change
# (Age, Attrition, Department, Education, Gender) are intentionally omitted.
_COLUMN_RENAMES = {
    'BusinessTravel': 'Business_Travel',
    'DailyRate': 'Daily_Rate',
    'DistanceFromHome': 'Distance_From_Home',
    'EducationField': 'Education_Field',
    'EnvironmentSatisfaction': 'Environment_Satisfaction',
    'HourlyRate': 'Hourly_Rate',
    'JobInvolvement': 'Job_Involvement',
    'JobLevel': 'Job_Level',
    'JobRole': 'Job_Role',
    'JobSatisfaction': 'Job_Satisfaction',
    'MaritalStatus': 'Marital_Status',
    'MonthlyIncome': 'Monthly_Income',
    'MonthlyRate': 'Monthly_Rate',
    'NumCompaniesWorked': 'Num_Companies_Worked',
    'OverTime': 'Over_Time',
    'PercentSalaryHike': 'Percent_Salary_Hike',
    'PerformanceRating': 'Performance_Rating',
    'RelationshipSatisfaction': 'Relationship_Satisfaction',
    'StockOptionLevel': 'Stock_Option_Level',
    'TotalWorkingYears': 'Total_Working_Years',
    'TrainingTimesLastYear': 'Training_Times_LastYear',
    'WorkLifeBalance': 'WorkLife_Balance',
    'YearsAtCompany': 'Years_At_Company',
    'YearsInCurrentRole': 'Years_In_Current_Role',
    'YearsSinceLastPromotion': 'Years_Since_Last_Promotion',
    'YearsWithCurrManager': 'Years_With_Current_Manager',
}

# Text columns (post-rename names) whose values get whitespace-trimmed.
_COLUMNS_TO_SANITIZE = [
    'Business_Travel',
    'Department',
    'Education_Field',
    'Gender',
    'Job_Role',
    'Marital_Status',
    'Over_Time',
]


def clean_activities(df):
    """Return a cleaned copy of the raw dataset.

    Steps, in order:
      1. Drop the columns in ``_COLUMNS_TO_REMOVE`` (missing ones are ignored).
      2. Rename columns according to ``_COLUMN_RENAMES``.
      3. Strip leading/trailing whitespace from the text columns in
         ``_COLUMNS_TO_SANITIZE`` (missing or non-text columns are skipped).
      4. Drop every row that contains a NaN.
      5. Drop duplicate rows.

    A preview (``head()``) and a summary (``info()``) of the result are
    printed for inspection.

    Args:
        df: The raw ``pandas.DataFrame``. It is not modified.

    Returns:
        A new, cleaned ``pandas.DataFrame``.
    """
    # 1. Remove unnecessary columns
    df = df.drop(columns=_COLUMNS_TO_REMOVE, errors='ignore')

    # 2. Rename columns
    df = df.rename(columns=_COLUMN_RENAMES)

    # 3. Sanitize text columns (remove leading/trailing spaces).
    # Absent columns are skipped, mirroring errors='ignore' above, so a
    # partial dataset does not raise KeyError.
    for column in _COLUMNS_TO_SANITIZE:
        if column not in df.columns:
            continue
        values = df[column]
        # Accept both legacy object columns and pandas' dedicated string dtype.
        if (pd.api.types.is_object_dtype(values)
                or pd.api.types.is_string_dtype(values)):
            df[column] = values.str.strip()

    # 4. Eliminate NaN values
    df = df.dropna()  # Remove all rows with NaN values
    # Alternatively, you might fill NaNs with specific values if needed:
    # df = df.fillna({'ColumnName': 'DefaultValue'})  # Example for filling NaNs

    # 5. Eliminate redundant entries. Done after stripping so rows that
    # differ only by stray whitespace are recognised as duplicates.
    df = df.drop_duplicates()

    # Display the first few rows of the cleaned dataset for inspection
    print(df.head())
    # info() prints its report itself and returns None, so it is not
    # wrapped in print().
    df.info()

    return df

# Main function to load and clean the data
def main(file_path, output_path='Cleaned_HR_DATA.csv'):
    """Load the dataset, clean it, and optionally save the result as CSV.

    Args:
        file_path: Path of the raw dataset, passed to ``load_activities``.
        output_path: Where the cleaned data is written as CSV (without the
            index). Pass ``None`` to skip saving. Defaults to
            ``'Cleaned_HR_DATA.csv'``.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    # Load the data
    df = load_activities(file_path)

    # Clean the data
    df_cleaned = clean_activities(df)

    # Save the cleaned data to a new CSV file (optional)
    if output_path is not None:
        df_cleaned.to_csv(output_path, index=False)

    return df_cleaned

if __name__ == "__main__":
    # Path of the raw HR dataset to clean; update this for your environment.
    main('/content/HR Data.csv')


   Age Attrition    Business_Travel  Daily_Rate              Department  \
0   41       Yes      Travel_Rarely        1102                   Sales   
1   49        No  Travel_Frequently         279  Research & Development   
2   37       Yes      Travel_Rarely        1373  Research & Development   
3   33        No  Travel_Frequently        1392  Research & Development   
4   27        No      Travel_Rarely         591  Research & Development   

   Distance_From_Home  Education Education_Field  EmployeeNumber  \
0                   1          2   Life Sciences               1   
1                   8          1   Life Sciences               2   
2                   2          2           Other               4   
3                   3          4   Life Sciences               5   
4                   2          1         Medical               7   

   Environment_Satisfaction  ... Relationship_Satisfaction  StandardHours  \
0                         2  ...                         1     