<a href="https://www.kaggle.com/code/avtnshm/inx-future-inc-kpis-random-forest?scriptVersionId=146540974" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

##### Mr Brain, CEO of INX Future Inc, a data analytics and automation solutions provider, wants us to create an interactive dashboard to analyse the employee performance indexes and track the core underlying causes of performance issues (clear indicators of non-performing employees).

### Questions (KPIs)
- Is educational level responsible for the attrition of employees?
- Which age group of employees has the maximum attrition?
- Are the employees unsatisfied with the specific job role?
- Which top 5 job roles show the maximum attrition of employees?
- Are highly experienced employees more likely to leave the company?
- How does the percentage salary hike affect the attrition of employees?

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category is shown.
# NOTE(review): this also hides deprecation and convergence warnings — consider narrowing it.
warnings.simplefilter("ignore")

In [None]:
# Read the employee performance workbook from the Kaggle input directory
# and preview the first rows.
DATA_PATH = "/kaggle/input/employee-performance-analysis-inx-future-inc/INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls"
df = pd.read_excel(DATA_PATH)
df.head()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

### Education level vs Attrition

In [None]:
# Count employees per (education level, attrition) pair, pivot the attrition
# values into columns, and draw one stacked bar per education level.
education_attrition_counts = (
    df.groupby('EmpEducationLevel')['Attrition']
    .value_counts()
    .unstack()
)
education_attrition_counts.plot.bar(stacked=True)

### Age Group vs attrition

In [None]:
# Bucket ages into bands and report the band with the most leavers.
# right=False makes every bin closed on the left and open on the right
# ([18, 25), [25, 35), ...), so the intervals match their labels and an age of
# exactly 18 lands in '18-24' instead of falling outside the first bin (NaN).
age_bin_edges = [18, 25, 35, 45, 55, 65]
age_bin_labels = ['18-24', '25-34', '35-44', '45-54', '55-64']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bin_edges, labels=age_bin_labels, right=False)
# Among employees with Attrition == 'Yes', pick the most frequent age band.
max_attrition_age_group = df.loc[df['Attrition'] == 'Yes', 'AgeGroup'].value_counts().idxmax()
max_attrition_age_group

### Job role  vs Satisfaction

In [None]:
# Median job-satisfaction score of each job role, drawn as a bar chart.
median_satisfaction_by_role = df.groupby('EmpJobRole')['EmpJobSatisfaction'].median()
median_satisfaction_by_role.plot.bar()

In [None]:
# Map the numeric satisfaction score onto three named categories.
# include_lowest=True closes the first bin on the left, so a score of 1 is
# labelled 'Not Satisfied' instead of falling outside (1, 2] and becoming NaN
# (which silently removed the least satisfied employees from the charts).
df['SatisfactionCategory'] = pd.cut(
    df['EmpJobSatisfaction'],
    bins=[1, 2, 3, 4],
    labels=['Not Satisfied', 'OK', 'Highly Satisfied'],
    include_lowest=True,
)
# Rows: job roles; columns: satisfaction categories; values: employee counts.
job_role_satisfaction = df.groupby('EmpJobRole')['SatisfactionCategory'].value_counts().unstack(fill_value=0)
# Create a pie chart for each job role
for job_role in job_role_satisfaction.index:
    plt.figure()
    plt.pie(job_role_satisfaction.loc[job_role], labels=job_role_satisfaction.columns, autopct='%1.1f%%')
    plt.title(job_role)
    plt.show()

In [None]:
# Employee counts per (job role, satisfaction category), one column per category.
role_category_counts = df.groupby(['EmpJobRole', 'SatisfactionCategory'])['SatisfactionCategory'].count()
satisfaction = role_category_counts.unstack(fill_value=0)

# Convert each role's counts to percentages of that role's total and plot
# them as stacked bars.
role_totals = satisfaction.sum(axis=1)
satisfaction_percent = satisfaction.div(role_totals, axis=0).mul(100)
satisfaction_percent.plot(kind='bar', stacked=True, figsize=(10, 6))

### Job role vs attrition

In [None]:
# Top 5 job roles by number of employees who left.
# Only rows with Attrition == 'Yes' are counted; counting the Attrition column
# over all rows would rank roles by headcount, not by attrition.
leavers = df[df['Attrition'] == 'Yes']
attrition_count_by_role = leavers.groupby('EmpJobRole')['Attrition'].count()
top_5_attrition_jobs = attrition_count_by_role.sort_values(ascending=False).head(5)
print(top_5_attrition_jobs)

### Experience vs attrition

In [None]:
# Calculate the statistics
# Mean, median and maximum of the three experience measures.
experience_columns = [
    'TotalWorkExperienceInYears',
    'ExperienceYearsAtThisCompany',
    'ExperienceYearsInCurrentRole',
]
exp_statistics = df[experience_columns].agg(['mean', 'median', 'max'])
exp_statistics

In [None]:
# Calculate the average experience for employees who left
experience_columns = [
    'TotalWorkExperienceInYears',
    'ExperienceYearsAtThisCompany',
    'ExperienceYearsInCurrentRole',
]

# Calculate the average experience for employees who left
avg_experience_left = df.loc[df['Attrition'] == 'Yes', experience_columns].mean()

# Calculate the average experience for employees who didn't leave
avg_experience_not_left = df.loc[df['Attrition'] == 'No', experience_columns].mean()

# Check if highly experienced people are responsible for leaving.
# The threshold is the overall mean of ExperienceYearsAtThisCompany, so it is
# compared with the leavers' mean of that same column. Averaging the three
# experience measures together would mix total career years, company tenure
# and role tenure into one number that the threshold does not describe.
threshold = df['ExperienceYearsAtThisCompany'].mean()
highly_experienced_leaving = avg_experience_left['ExperienceYearsAtThisCompany'] > threshold

highly_experienced_leaving


#### So we can conclude that experienced and highly experienced people are NOT responsible for attrition

 ### Salary Hike vs Attrition

In [None]:
# Group the last salary hike percentage into 5-point intervals
# (edges 0, 5, ..., 25) and count attrition values inside each interval.
bins = range(0, 26, 5)
hike_percent = df['EmpLastSalaryHikePercent']
df['SalaryHikeInterval'] = pd.cut(hike_percent, bins)
hike_attrition_counts = df.groupby('SalaryHikeInterval')['Attrition'].value_counts()
attrition_by_hike_interval = hike_attrition_counts.unstack(fill_value=0)
attrition_by_hike_interval

In [None]:
# Stacked bar chart of the attrition counts per salary hike interval,
# labelled through the Axes object returned by the plot call.
hike_ax = attrition_by_hike_interval.plot(kind='bar', stacked=True)
hike_ax.set_xlabel('Salary Hike Interval')
hike_ax.set_ylabel('Count')
hike_ax.set_title('Attrition by Salary Hike Interval')
hike_ax.legend(title='Attrition')
plt.show()

In [None]:
# Keep only the object-dtype columns of df and display them.
categorical_columns = df.select_dtypes(include='object')
categorical_columns

In [None]:
# Same object-dtype selection as the previous cell, recomputed and displayed again.
categorical_columns = df.select_dtypes('object')
categorical_columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of df in place.
# Each column gets its own fitted encoder, stored in label_encoders, so the
# code -> original label mapping (classes_ / inverse_transform) of every column
# stays available; refitting one shared encoder keeps only the last column's.
label_encoders = {}
for col in categorical_columns.columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder


In [None]:
# Display the frame after the object-dtype columns were label-encoded.
df

In [None]:
# Remove EmpNumber (presumably an identifier — confirm) together with the
# helper columns created during the analysis above, then show the result.
columns_to_remove = ['EmpNumber', 'AgeGroup', 'SatisfactionCategory', 'SalaryHikeInterval']
df.drop(columns=columns_to_remove, inplace=True)
df

In [None]:
# Target: the Attrition column. Features: every other column of df.
y = df['Attrition']
X = df.drop(columns='Attrition')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split

# Search for a train/test split seed whose logistic-regression train, test and
# cross-validation accuracies agree with each other, then keep that split.
Train = []
Test = []

# cross_val_score clones the estimator and, for an integer cv, builds its folds
# without shuffling, so the score does not depend on the split seed below.
# Compute it once instead of once per seed.
# cv=5, since train test split is 80:20 or five parts
cv_score = cross_val_score(LogisticRegression(), X, y, cv=5).mean()

for i in range(0, 10):  # widen this range if no seed passes the filter below
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    m = LogisticRegression()
    m.fit(X_train, y_train)
    Train.append(m.score(X_train, y_train))
    Test.append(m.score(X_test, y_test))

CV = [cv_score] * len(Train)  # Cross Validation

em = pd.DataFrame({"Train": Train, "Test": Test, "CV": CV})
# Keep the seeds whose train/test and test/CV gaps are both at most 0.05.
gm = em[(abs(em['Train'] - em['Test']) <= 0.05) & (abs(em['Test'] - em['CV']) <= 0.05)]
if gm.empty:
    raise ValueError(
        "no random_state in range(0, 10) keeps the Train, Test and CV scores "
        "within 0.05 of each other; widen the search range"
    )
# em's index equals the seed because the loop starts at 0; idxmax returns the
# first seed with the highest test score.
best_random_state = int(gm["Test"].idxmax())
print("best random_state_number:", best_random_state)

# Rebuild the split (and the logistic model) with the selected seed; otherwise
# later cells would silently use the split left over from the last iteration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=best_random_state)
m = LogisticRegression()
m.fit(X_train, y_train)

In [None]:
# Random Forest Classifier with default parameters

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

# Prediction
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

# Evaluation
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the true
# labels first. (Accuracy is symmetric, so the printed values are unchanged.)
print("Train Accuracy:", accuracy_score(y_train, ypred_train))
print("Test Accuracy:", accuracy_score(y_test, ypred_test))
print("Cross Validation Score:", cross_val_score(model, X, y, cv=5).mean())
