# Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score , classification_report , plot_confusion_matrix , plot_precision_recall_curve , plot_roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# EDA

In [None]:
df = pd.read_csv("../input/employee-future-prediction/Employee.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

## Numerical analysis and visualization
Analyzing the "Age"

In [None]:
df['Age'].describe()

In [None]:
df['Age'].mean()

In [None]:
df['Age'].median()

In [None]:
df['Age'].plot(kind='box', vert=False, figsize=(14,6))

In [None]:
df['Age'].plot(kind='density', figsize=(14,6)) # kde

In [None]:
ax = df['Age'].plot(kind='density', figsize=(14,6)) # kde
ax.axvline(df['Age'].mean(), color='red')
ax.axvline(df['Age'].median(), color='green')

In [None]:
# Histogram of employee ages; the axes are labelled after what is actually
# plotted (Age on x, number of rows per bin on y).
ax = df['Age'].plot(kind='hist', figsize=(14,6))
ax.set_ylabel('Number of employees')
ax.set_xlabel('Age')

## Categorical analysis and visualization
ExperienceInCurrentDomain

In [None]:
df.head()

In [None]:
df['ExperienceInCurrentDomain'].value_counts()

In [None]:
df['ExperienceInCurrentDomain'].value_counts().plot(kind='pie', figsize=(6,6))

In [None]:
ax = df['ExperienceInCurrentDomain'].value_counts().plot(kind='bar', figsize=(14,6))

## Relationship between the columns?

PaymentTier vs. LeaveOrNot

In [None]:
# Pairwise correlation of the numeric columns only. Education, City, Gender and
# EverBenched are still text at this point (they are encoded later), and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()
corr

In [None]:
fig = plt.figure(figsize=(8,8))
plt.matshow(corr, cmap='RdBu', fignum=fig.number)
plt.xticks(range(len(corr.columns)), corr.columns, rotation='vertical');
plt.yticks(range(len(corr.columns)), corr.columns);

In [None]:
# Per-class averages of the numeric features (employees who left vs. stayed).
# numeric_only=True skips the text columns, which cannot be averaged and make
# GroupBy.mean() raise on pandas >= 2.0.
LeaveOrNot = df.groupby('LeaveOrNot')
LeaveOrNot.mean(numeric_only=True)

In [None]:
# Correlation matrix of the numeric columns (text columns are excluded because
# DataFrame.corr() cannot process them on pandas >= 2.0).
corr = df.select_dtypes(include='number').corr()
corr

In [None]:
# Two-sided 95% critical values of the t-distribution for the group of
# employees who left. For a sample of size n the degrees of freedom are n - 1.
n_leavers = len(df[df['LeaveOrNot']==1])
degree_freedom = n_leavers - 1

LQ = stats.t.ppf(0.025,degree_freedom)  # lower critical value (2.5% quantile)

RQ = stats.t.ppf(0.975,degree_freedom)  # upper critical value (97.5% quantile)

# These are the 2.5% / 97.5% quantiles, not quartiles, so the messages say so.
print ('The t-distribution lower critical value (2.5% quantile) is: ' + str(LQ))
print ('The t-distribution upper critical value (97.5% quantile) is: ' + str(RQ))

# Data Preprocessing

## Dealing with data types:

In [None]:
df.info()              ## there are 4 features that can be converted from object to int

In [None]:
df['Education'].value_counts()

In [None]:
# Replace the education labels with integer codes (Bachelors=0, Masters=1, PHD=2)
# and show how many rows fall under each code.
education_codes = {'Bachelors': 0, 'Masters': 1, 'PHD': 2}
df['Education'] = df['Education'].map(education_codes)
df['Education'].value_counts()

In [None]:
df['City'].value_counts()

In [None]:
# Replace the city names with integer codes (New Delhi=0, Pune=1, Bangalore=2)
# and show how many rows fall under each code.
city_codes = {'New Delhi': 0, 'Pune': 1, 'Bangalore': 2}
df['City'] = df['City'].map(city_codes)
df['City'].value_counts()

In [None]:
df['Gender'].value_counts()

In [None]:
# Replace the gender labels with integer codes (Female=0, Male=1)
# and show how many rows fall under each code.
gender_codes = {'Female': 0, 'Male': 1}
df['Gender'] = df['Gender'].map(gender_codes)
df['Gender'].value_counts()

In [None]:
df['EverBenched'].value_counts()

In [None]:
# Replace the Yes/No flag with integer codes (No=0, Yes=1)
# and show how many rows fall under each code.
benched_codes = {'No': 0, 'Yes': 1}
df['EverBenched'] = df['EverBenched'].map(benched_codes)
df['EverBenched'].value_counts()

In [None]:
df.info()

## Dealing with missing data :

In [None]:
df.isnull().sum()             ## there is no missing data

## Dealing with duplicates :

In [None]:
len(df)-len(df.drop_duplicates())

In [None]:
(len(df)-len(df.drop_duplicates()))/len(df)

After inspecting the ratio of duplicated rows to the whole dataset, we found that we would have to drop about 40% of our data, which would later affect our model accuracy. So we will not drop the duplicates.

## Dealing with OutLiers :


In [None]:
df.plot(kind = "box" , subplots = True , figsize = (18,15) ,  layout = (3,3))

We will not remove or otherwise treat the outliers here: we are dealing with a classification dataset, so it makes sense for outliers to be present in our data.

# Business Questions

In [None]:
df.head()

### - What is the ratio between males and females in our company?

In [None]:
# White grid background so the bar heights are easier to read.
sns.set_theme(style="whitegrid")
# Share of each gender code as a percentage of all rows, rounded to 2 decimals.
gender_counts = df['Gender'].value_counts()
x = round(gender_counts / df.shape[0] * 100, 2)
x.plot.bar(color='purple')

The number of males in the company is slightly higher than females 

### What is the ratio between people staying/leaving our company?

In [None]:
Leave_stay = df['LeaveOrNot'].value_counts()
Leave_stay.plot.pie(autopct = '%1.1f%%',shadow=True ,explode = (0, 0.1))

In [None]:
#'PHD': 2, 'Masters': 1, 'Bachelors': 0.
Education_lvl = df['Education'].value_counts()
Education_lvl.plot.pie(autopct = '%1.1f%%',shadow=True ,explode = (0, 0.1, 0))

A low percentage of PhD holders is expected, as a PhD takes 3-7 years to complete.

In [None]:
plt.figure(figsize=(12,9))
sns.heatmap(df.corr(), annot = True,cmap='PuOr')

No strong relations between columns

### How many payment tiers do we have in our company and what is the ratio between them?

In [None]:
Payment = df['PaymentTier'].value_counts()
Payment.plot.pie(autopct = '%1.1f%%',shadow=True ,explode = (0, 0.1, 0))

### What is the relationship between payment tiers and experience in current domain with respect to leaving or staying in the company?

In [None]:
sns.catplot(y="Age", x="PaymentTier",hue="LeaveOrNot",data=df,palette="flare")

More people at age from 22.5 to 26 leave the company in payment tier 2 than any other tier

### What is the relation between joining year and leaving/staying in the company?

In [None]:
sns.countplot(data=df,x='JoiningYear',hue='LeaveOrNot',palette=['#432371',"#FAAE7B"])

Most of the employees who joined in 2018 left, which is a bad sign, so we need to review what went wrong for that year in particular.

Among the employees who joined in 2012, not many left the company.

### What is the relation between an employee being benched and him leaving the company?

In [None]:
sns.countplot(data=df,x='EverBenched',hue='LeaveOrNot',palette=["#7fcdbb","#edf8b1"])

A large share of the employees who get benched leave (approximately 45%).

### What is the relation between the city and an employee leaving our company?

In [None]:
sns.countplot(data = df ,x='City',hue='LeaveOrNot',palette=['#432371',"#FAAE7B"])
# {'Bangalore': 2, 'Pune': 1, 'New Delhi': 0}

Most employees who live in Pune leave (more than 50% of the employees living there). Pune is also the city from which the most employees leave our company, followed by Bangalore and then New Delhi.

In [None]:
sns.countplot(data = df ,x='Gender',hue='LeaveOrNot',palette=["#7fcdbb","#edf8b1"])

A lot of females tend to leave our company, so we might have to make the work environment easier for them to adapt to.

In [None]:
sns.countplot(data = df ,x='ExperienceInCurrentDomain',hue='LeaveOrNot',palette=['#432371',"#FAAE7B"])

Employees with 2 years of experience in their domain are the most to leave.

## Conclusion
#### Pros:
- Not many employees get benched.
- 23% of our employees have a masters degree.
- Not many employees from New Delhi leave our company.

#### Cons:
- A lot of females tend to leave our company.
- No strong relations between our dataset columns (not a business related problem)
- Most employees that live in Pune leave (more than 50% of employees living there)
- More people at age from 22.5 to 26 leave the company in payment tier 2 than any other tier.
- Most of the employees who joined in 2018 left.
- A large share of the employees who get benched leave (approximately 45%).
- Employees with 2 years of experience in their domain are the most to leave.

#### Business Solution:
- Make the work environment more suitable for females to work in.
- Supply employees from Pune more means of transportation.
- Supply employees from Bangalore more means of transportation.
- Try to encourage employees aged from 22 to 26 to stay by rewarding them with bonuses or moving them up in payment tiers.
- Review what went wrong in 2018, because nearly all employees who joined that year left the company. (CRITICAL PROBLEM)
- Try to lower the employee benching rate by rotating more employees through projects.
- Encourage employees with 2 years of experience to stay by involving them in more projects, so they gain more experience and also get paid more.

# Modeling

### Split the data

In [None]:
# Target is the LeaveOrNot column; every other column is used as a feature.
y = df['LeaveOrNot']
X = df.drop(columns='LeaveOrNot')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=101)

### Standardization

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
ss_x_train = scaler.transform(X_train)
ss_x_test = scaler.transform(X_test)

In [None]:
def Needed_Metrics (model):
    """Print a classification report and draw diagnostic plots for ``model``.

    The model is evaluated on the module-level ``ss_x_test`` / ``y_test``
    split: its predictions are summarised with ``classification_report`` and
    then a confusion matrix, a ROC curve and a precision-recall curve are
    plotted.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``; it is also handed to the
        scikit-learn plotting helpers.

    Returns
    -------
    None
    """
    y_pred = model.predict(ss_x_test)
    # Blank lines around the report (the original printed the literal text "/n").
    print('\n')
    print(classification_report(y_test,y_pred))
    print('\n')
    # NOTE(review): these plot_* helpers were deprecated in scikit-learn 1.0 and
    # removed in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator,
    # RocCurveDisplay.from_estimator and PrecisionRecallDisplay.from_estimator.
    plot_confusion_matrix(model,ss_x_test,y_test)
    plot_roc_curve(model,ss_x_test,y_test)
    plot_precision_recall_curve(model,ss_x_test,y_test)

### KNN Algorithm 

In [None]:
## Sweep K from 1 to 29, record the error on the held-out split for each value,
## and plot the curve so the best K can be picked visually.
## NOTE(review): K is selected against the test split here, so the scores later
## reported on that same split are optimistic - consider cross-validating on
## the training data instead.

k_values = range(1, 30)
test_error_rate = []

for k in k_values:
    knn_model = KNeighborsClassifier(n_neighbors=k).fit(ss_x_train, y_train)
    y_pred = knn_model.predict(ss_x_test)
    # Error rate is the complement of accuracy.
    error_rate = 1 - accuracy_score(y_test, y_pred)
    test_error_rate.append(error_rate)

plt.figure(figsize=(6, 3), dpi=150)
plt.plot(k_values, test_error_rate)
plt.xlabel('K Value')
plt.ylabel('Test Error Rate')
plt.title('Choose the best K Value')

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=15)
knn_model.fit(ss_x_train,y_train)

In [None]:
Needed_Metrics(knn_model)

In [None]:
knn_model.score(ss_x_train,y_train)

In [None]:
knn_model.score(ss_x_test,y_test)

### SVM Algorithm

In [None]:
# Cross-validated grid search over the SVC regularisation parameter C,
# fitted on the standardized training split.
svc = SVC()
param_grid = {"C": [0.001, 0.01, 1]}
grid_model = GridSearchCV(estimator=svc, param_grid=param_grid)
grid_model.fit(ss_x_train, y_train)

In [None]:
grid_model.best_params_


In [None]:
Needed_Metrics(grid_model)