# Setup

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

# Kaggle input location of the IBM HR attrition CSV.
DATA_PATH = '/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'

# Load the whole file into the frame every later cell reads from.
df = pd.read_csv(DATA_PATH)
                    

# Basic data info

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics (count, mean, std, min/max, quartiles) from pandas' default describe().
df.describe()

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Names of all columns in the frame.
df.columns

# Plotting

### Monthly income

In [None]:
# Wide, short canvas for a single histogram.
fig, ax = plt.subplots(figsize=(12, 4))

# Distribution of the MonthlyIncome column.
sns.histplot(data=df, x="MonthlyIncome", ax=ax)

### Age

In [None]:
# Wide, short canvas for a single histogram.
fig, ax = plt.subplots(figsize=(12, 4))

# Distribution of the Age column.
sns.histplot(data=df, x="Age", ax=ax)

### Age and monthly income

In [None]:
# Monthly income as a function of age. When several rows share the same Age,
# seaborn's lineplot aggregates them (mean with a confidence band by default).
# `data` is passed by keyword, matching the later plotting cells; a positional
# DataFrame is only accepted by seaborn >= 0.12.
sns.lineplot(data=df, x="Age", y="MonthlyIncome")

plt.title("Monthly Income by Age")

In [None]:
# Number of rows (employees) in each Department category.
# `data` is passed by keyword, matching the later plotting cells; a positional
# DataFrame is only accepted by seaborn >= 0.12.
sns.countplot(data=df, x="Department")

In [None]:
# Column names again (same output as the earlier df.columns cell).
df.columns

In [None]:
plt.figure(figsize=(18, 9))

# Pairwise correlation between the numeric columns only:
# numeric_only=True leaves every non-numeric column out of the matrix.
df_corr_matrix = df.corr(numeric_only=True)

# Remove EmployeeCount and StandardHours from both the rows and the columns
# of the matrix so they do not appear in the heatmap.
# NOTE(review): presumably these columns are constant, which would make their
# correlations NaN -- confirm against the data.
excluded_columns = ["EmployeeCount", "StandardHours"]
df_corr_matrix = df_corr_matrix.drop(index=excluded_columns, columns=excluded_columns)

# annot=True writes each coefficient inside its cell.
sns.heatmap(df_corr_matrix, annot=True)

In [None]:
# Preview the first rows again (same output as the earlier df.head() cell).
df.head()

In [None]:
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=df, x="EducationField", ax=ax)

# Write each bar's count just above its top edge, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{bar_height:.0f}', (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=12)

ax.set_xlabel("Education Field")
ax.set_ylabel("Count")
ax.set_title("Count of Education Field")
plt.show()

In [None]:
# Show every column when a DataFrame is displayed. This is a global pandas
# option, so it also affects every cell executed after this one.
pd.set_option('display.max_columns', None)

# Monthly income against age, one point per row.
# `data` is passed by keyword, matching the other plotting cells; a positional
# DataFrame is only accepted by seaborn >= 0.12.
sns.scatterplot(data=df, y="MonthlyIncome", x="Age")

### Showing two plots

In [None]:
# One figure holding two side-by-side panels (1 row, 2 columns).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
left_ax, right_ax = axes

# Left panel: MonthlyIncome histogram with a KDE curve overlaid.
sns.histplot(data=df, x="MonthlyIncome", kde=True, ax=left_ax)
left_ax.set_title("Histogram")

# Right panel: MonthlyIncome plotted against DistanceFromHome.
sns.lineplot(data=df, x="DistanceFromHome", y="MonthlyIncome", ax=right_ax)
right_ax.set_title("Line Plot")

plt.show()

# Machine learning

In [None]:
from sklearn.model_selection import train_test_split
# NOTE(review): OneHotEncoder is imported but not used below; the encoding is
# done with pd.get_dummies instead.
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Prepare the data
# Every column except the target becomes a feature; 'Attrition' is the label.
X = df.drop(columns=['Attrition'])
y = df['Attrition']

# One-hot encode the non-numeric feature columns; numeric columns are kept as they are.
X_encoded = pd.get_dummies(X)

# Step 2: Split the data into training (80%) and testing (20%) sets.
# stratify=y keeps the class proportions of 'Attrition' the same in both
# splits, so the hold-out metrics are not skewed by an unlucky draw
# (this matters most if the classes are imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Choose the model (Random Forest); the fixed seed makes the fit reproducible.
model = RandomForestClassifier(random_state=42)

# Step 4: Train the model
model.fit(X_train, y_train)

# Step 5: Evaluate the model on the held-out rows.
# Accuracy alone can look good on imbalanced classes, so the confusion matrix
# and the per-class precision/recall report are printed alongside it.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(classification_rep)