In [55]:
import pandas as pd

# Load the dataset
fiche_a ='/content/drive/MyDrive/Colab Notebooks/company.csv'
df = pd.read_csv(fiche_a)



In [56]:
import numpy as np

# Handle missing values
df.fillna({
    'department': 'Unknown',
    'salary': '0',
    'years_at_company': '0',
    'age': df['age'].median(),
    'job_satisfaction': df['job_satisfaction'].median(),
    'performance_score': df['performance_score'].median(),
    'last_promotion_year': df['last_promotion_year'].mode()[0],
    'education_level': 'Unknown',
    'gender': 'Unknown',
    'hired_date': '01-01-1970'
}, inplace=True)

# Standardize the salary column
df['salary'] = df['salary'].replace('[\$,K]', '', regex=True).astype(float)*1000

# Transform gender to 'M' if 'Male' and 'F' if 'Female'
df['gender'] = df['gender'].replace({'Male': 'M', 'Female': 'F'})

# Standardize the years_at_company column
df['years_at_company'] = df['years_at_company'].replace({'years': '', ' ': ''}, regex=True).astype(float)

# Convert hired_date to datetime
df['hired_date'] = pd.to_datetime(df['hired_date'], errors='coerce', dayfirst=True)

# Convert last_promotion_year to datetime and extract the year
df['last_promotion_year'] = pd.to_datetime(df['last_promotion_year'], format='%Y').dt.year

# Convert age to integer
df['age'] = df['age'].astype(int)

# Verify the transformations
df.info()

# Display the cleaned data
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   employee_id          47 non-null     int64         
 1   department           47 non-null     object        
 2   salary               47 non-null     float64       
 3   years_at_company     47 non-null     float64       
 4   age                  47 non-null     int64         
 5   job_satisfaction     47 non-null     int64         
 6   performance_score    47 non-null     int64         
 7   last_promotion_year  47 non-null     int32         
 8   education_level      47 non-null     object        
 9   gender               47 non-null     object        
 10  hired_date           4 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int32(1), int64(4), object(3)
memory usage: 4.0+ KB


Unnamed: 0,employee_id,department,salary,years_at_company,age,job_satisfaction,performance_score,last_promotion_year,education_level,gender,hired_date
0,1,Sales,60000000.0,5.0,30,7,85,2018,Bachelor,M,2015-06-15
1,2,Engineering,95000.0,8.0,35,8,90,2017,Master,F,NaT
2,3,HR,45000.0,2.0,28,6,70,2020,Bachelor,F,2019-04-23
3,4,Marketing,75000.0,10.0,40,9,95,2015,Master,M,NaT
4,5,IT,50000000.0,3.0,25,5,60,2011,Bachelor,M,NaT


In [65]:

# Repatisyon chak anplwaye nan chak depatman
department_distribution = df['department'].value_counts().reset_index()
department_distribution.columns = ['department', 'count']

# Repatisyon pa sèks nan chak depatman
gender_distribution_per_department = df.groupby('department')['gender'].value_counts().unstack().fillna(0).reset_index()

# Mwayèn laj anplwaye yo pou chak depatman
average_age_per_department = df.groupby('department')['age'].mean().reset_index()
average_age_per_department.columns = ['department', 'average_age']

# Display results
department_distribution



Unnamed: 0,department,count
0,IT,11
1,Sales,9
2,Engineering,9
3,HR,9
4,Marketing,9


In [66]:
average_age_per_department

Unnamed: 0,department,average_age
0,Engineering,32.777778
1,HR,25.444444
2,IT,26.090909
3,Marketing,38.0
4,Sales,29.444444


In [67]:
gender_distribution_per_department

gender,department,F,M
0,Engineering,3,6
1,HR,8,1
2,IT,4,7
3,Marketing,4,5
4,Sales,2,7


In [58]:

import seaborn as sns
import pandas as pd

# Repatisyon pa sèks nan chak depatman
gender_distribution_melted = gender_distribution_per_department.melt(id_vars='department', value_vars=['F', 'M'], var_name='gender', value_name='count')


gender_distribution_melted


Unnamed: 0,department,gender,count
0,Engineering,F,3
1,HR,F,8
2,IT,F,4
3,Marketing,F,4
4,Sales,F,2
5,Engineering,M,6
6,HR,M,1
7,IT,M,7
8,Marketing,M,5
9,Sales,M,7


In [61]:
# Calculate the average salary and job satisfaction per department

# Mwayèn salè nan chak depatman
average_salary_per_department = df.groupby('department')['salary'].mean().reset_index()
average_salary_per_department.columns = ['department', 'average_salary']

# Satisfaksyon travay sou chak depatman
job_satisfaction_per_department = df.groupby('department')['job_satisfaction'].mean().reset_index()
job_satisfaction_per_department.columns = ['department', 'average_job_satisfaction']

job_satisfaction_per_department


Unnamed: 0,department,average_job_satisfaction
0,Engineering,7.222222
1,HR,4.777778
2,IT,5.090909
3,Marketing,8.222222
4,Sales,6.222222


In [62]:
average_salary_per_department


Unnamed: 0,department,average_salary
0,Engineering,29806670.0
1,HR,47009440.0
2,IT,45463640.0
3,Marketing,42025890.0
4,Sales,43237000.0


In [71]:
# Calculate the average time since last promotion per department
current_year = pd.to_datetime('now').year
df['years_since_last_promotion'] = current_year - df['last_promotion_year']
average_years_since_promotion_per_department = df.groupby('department')['years_since_last_promotion'].mean().reset_index()
average_years_since_promotion_per_department.columns = ['department', 'average_years_since_last_promotion']

# Calculate the average salary based on education level
average_salary_per_education_level = df.groupby('education_level')['salary'].mean().reset_index()
average_salary_per_education_level.columns = ['education_level', 'average_salary']




average_years_since_promotion_per_department


Unnamed: 0,department,average_years_since_last_promotion
0,Engineering,5.888889
1,HR,2.333333
2,IT,5.090909
3,Marketing,7.777778
4,Sales,4.333333


Unnamed: 0,department,average_years_since_last_promotion
0,Engineering,5.888889
1,HR,2.333333
2,IT,5.090909
3,Marketing,7.777778
4,Sales,4.333333


In [70]:
average_salary_per_education_level

Unnamed: 0,education_level,average_salary
0,Bachelor,51487.08
1,Master,47311.15
2,Unknown,68000.0


Unnamed: 0,education_level,average_salary
0,Bachelor,44574000.0
1,Master,35423050.0
2,Unknown,68000000.0
