# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [None]:
# Load the employee-attrition records from a CSV file in the working directory.
EA_dataset=pd.read_csv("Employee-attrition.csv")

In [None]:
# Display the loaded DataFrame as the cell output.
EA_dataset

In [None]:
#EA_dataset.info

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
EA_dataset.shape

# Data pre-processing

In [None]:
# Remove the row whose index label is 0, modifying EA_dataset in place.
# NOTE(review): the reason for discarding this record is not visible here -
# confirm it is intentional. Re-running this cell raises KeyError once
# label 0 is gone.
EA_dataset.drop(0,inplace=True)
# Count the missing values in each column.
EA_dataset.isnull().sum()

In [None]:
# Missing values could be imputed (mean / median); here every record that
# contains at least one null value is discarded instead.
EA_dataset.dropna(axis="index", how="any", inplace=True)
EA_dataset

In [None]:
# List the records that repeat an earlier record; the first occurrence of
# each record is not flagged.
is_repeat = EA_dataset.duplicated(keep='first')
EA_dataset.loc[is_repeat]

In [None]:
## Remove duplicate records, keeping the first occurrence of each.
# drop_duplicates() returns a new DataFrame rather than modifying the
# original, so the result has to be assigned back; without the assignment
# the duplicates stay in EA_dataset.
EA_dataset = EA_dataset.drop_duplicates(keep='first')
EA_dataset

In [None]:
# Number of records per Attrition class, wrapped in a one-column DataFrame.
# NOTE(review): the name of that column depends on the pandas version
# ('Attrition' before 2.0, 'count' from 2.0 on) - confirm before indexing
# it by name.
attrition_count=pd.DataFrame(EA_dataset['Attrition'].value_counts())
attrition_count

In [None]:
# Pie chart of the Attrition class balance.
# Counts and labels are taken from the same value_counts() result, so every
# wedge is labelled with its own category and nothing depends on the name
# pandas gives the counts column or on which class is more frequent.
class_counts = EA_dataset['Attrition'].value_counts()
# Pull out only the first (most frequent) wedge; one offset per wedge.
wedge_offsets = [0.2 if position == 0 else 0 for position in range(len(class_counts))]
plt.pie(class_counts, labels=class_counts.index, explode=wedge_offsets)

In [None]:
# Bar chart of the number of records in each Attrition class.
# The column is passed by keyword: recent seaborn releases accept only
# `data` positionally, so a bare Series is no longer read as the x variable.
sns.countplot(x='Attrition', data=EA_dataset)

In [None]:
# Preview the frame without the EmployeeCount and EmployeeNumber columns.
# The result is not assigned, so EA_dataset itself is unchanged by this line.
EA_dataset.drop(['EmployeeCount', 'EmployeeNumber'], axis=1)

In [None]:
# One-hot encode Attrition: one indicator column per distinct value.
attrition_dummies = pd.get_dummies(data=EA_dataset.loc[:, 'Attrition'])
attrition_dummies.head(n=5)

In [None]:
# Append the indicator columns to the right of the existing columns.
EA_dataset = pd.concat(objs=[EA_dataset, attrition_dummies], axis="columns")
EA_dataset.head(n=5)

In [None]:
# Drop the text label column and the 'No' indicator column.
EA_dataset = EA_dataset.drop(columns=['Attrition', 'No'])
EA_dataset.head(n=5)

In [None]:
## Correlation matrix of the numeric and boolean columns.
# Text columns are filtered out explicitly: recent pandas versions no longer
# skip them silently and raise when asked to correlate strings.
EA_dataset.select_dtypes(include=['number', 'bool']).corr()

# Correlation ranges from -1 to +1.

# Positive correlation: the two variables move in the same direction.

# Negative correlation: the two variables move in opposite directions; as one increases, the other decreases, and vice versa.

# No correlation: there is no relationship between the two variables.

# Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Record counts per age, split by the 'Yes' column, drawn on a wide figure.
_, age_axes = plt.subplots(figsize=(12, 4))
sns.countplot(data=EA_dataset, x='Age', hue='Yes', ax=age_axes)

In [None]:
## Data visualization: the feature columns to analyse, one name per entry.
ftr = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'EnvironmentSatisfaction',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
    'PerformanceRating',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
]
ftr

In [None]:
# Grid of count plots, one per feature in `ftr`, each split by the 'Yes' column.
# plt.figure() is used rather than plt.subplots(): the latter also creates a
# full-size Axes that the grid would be drawn on top of, and it returns a
# (figure, axes) tuple rather than the figure itself.
fig = plt.figure(figsize=(10, 25))
# Vertical spacing belongs to the whole figure, so it is set once up front
# instead of on every loop iteration.
plt.subplots_adjust(hspace=1.0)

n_cols = 2
# Keep at least 6 rows; add rows (ceiling division) if `ftr` grows past 12 entries.
n_rows = max(6, -(-len(ftr) // n_cols))

for position, feature in enumerate(ftr, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.countplot(x=feature, data=EA_dataset, hue='Yes')
    # Category names can be long; rotate them so they do not overlap.
    plt.xticks(rotation=90)

In [None]:
## Understanding - likely reasons for employees to churn out of the company

## Age --> most current employees are aged 34 or 35, while people aged between 29 and 31 are leaving the company.

## Performance Rating --> a low rating means a higher chance of churning out of the company.

## Years at Company --> employees with 2-6 years at the company.

## Marital Status --> Single

## Gender --> Male

## Environment Satisfaction --> a low rating (1, 2) means a higher chance of churning out of the company.

## Department --> Research & Development

In [None]:
## For every object-dtype column, print its distinct values and how often each occurs.
for column in EA_dataset.columns:
    values = EA_dataset[column]
    if values.dtype != object:
        # Only object-dtype columns are reported.
        continue
    print(f"{column}:{values.unique()}")
    print(values.value_counts())
    print('__________________________________________________________')

In [None]:
# Remove the columns that are not used in the rest of the analysis.
unused_columns = ['Over18', 'EmployeeNumber', 'StandardHours', 'EmployeeCount']
EA_dataset = EA_dataset.drop(columns=unused_columns)
EA_dataset.head()

In [None]:
# Annotated heat map of the pairwise correlations.
# Only numeric / boolean columns are correlated: text columns are excluded
# explicitly because recent pandas versions raise on them instead of
# skipping them.
fig, ax = plt.subplots(figsize=(15, 10))
numeric_corr = EA_dataset.select_dtypes(include=['number', 'bool']).corr()
# Draw on the Axes created above rather than relying on the "current" Axes.
sns.heatmap(numeric_corr, annot=True, linewidths=0.8, fmt=".2f", ax=ax)

In [None]:
# Transform the data: encode every non-numeric column as integer codes.
from sklearn.preprocessing import LabelEncoder

for column in EA_dataset.columns:
    values = EA_dataset[column]
    # Genuinely numeric columns already hold usable values and are left
    # untouched. pandas' dtype helpers are used because comparing a dtype
    # with the abstract np.number class does not match integer columns,
    # which would then be replaced by rank codes as well.
    # Boolean indicator columns count as numeric for pandas, so they are
    # excluded from the skip and encoded to 0/1, keeping the frame all-integer.
    if pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
        continue
    EA_dataset[column] = LabelEncoder().fit_transform(values)

In [None]:
## Split the data into the feature matrix X and the target vector Y.
# The target is the 'Yes' attrition indicator. It is selected by name because
# it was appended as the last column of the frame: taking column 0 as Y would
# use an unrelated feature as the target and leave the real target inside X.
TARGET_COLUMN = 'Yes'
X = EA_dataset.drop(columns=TARGET_COLUMN).values
# Cast to int so the labels are plain 0/1 whatever dtype the indicator has.
Y = EA_dataset[TARGET_COLUMN].astype(int).values

## Splitting Data- Train Test Split
##### Split the data into 75% training and 25% test data-

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test= train_test_split(X, Y, test_size=0.25, random_state=0)

## Model Development-
### Use the Random Forest

In [None]:
## Random forest: fit on the training split, then report accuracy on both splits.
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
forest.fit(X_train, Y_train)

# Predictions for the data the model was fitted on and for the held-out data.
prediction, Y_pred = forest.predict(X_train), forest.predict(X_test)
score1 = accuracy_score(y_true=Y_train, y_pred=prediction)
score = accuracy_score(y_true=Y_test, y_pred=Y_pred)
# Training accuracy first, test accuracy second, one value per line.
print(score1, score, sep='\n')

In [None]:
## Accuracy of the fitted forest on the training split (its predictions for X_train scored against Y_train).
forest.score(X_train, Y_train)

### Implementing Linear Regression

In [None]:
##Importing linear regression and metric mean absolute error -
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_absolute_error as mae

In [None]:
# Create an instance of the linear-regression model (imported as LR).
lr= LR()

# Fit the model on the training split.
lr.fit(X_train, Y_train)

In [None]:
# Predict over the training set and compute the mean absolute error.
# The metric is called as (y_true, y_pred), the documented order and the one
# the accuracy_score calls in this notebook already use.
train_predict = lr.predict(X_train)
k = mae(Y_train, train_predict)
print('Training mean absolute error', k)

In [None]:
# Predict over the test set and compute the mean absolute error.
# The metric is called as (y_true, y_pred), the documented order and the one
# the accuracy_score calls in this notebook already use.
test_predict = lr.predict(X_test)
k = mae(Y_test, test_predict)
print('Testing mean absolute error', k)

In [None]:
# Coefficients learned by the fitted linear model, shown as the cell output.
lr.coef_

# KNN Classifier

In [None]:
###Importing KNN Classifier and metric F1 Score-

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import f1_score

In [None]:
## KNN classifier with 2 neighbours, evaluated with the F1 score on both splits.
clf = KNN(n_neighbors=2)

# Fit the model on the training split.
clf.fit(X_train, Y_train)

# f1_score expects (y_true, y_pred). Passing the true labels first keeps
# precision and recall attached to the right roles and matches the
# accuracy_score calls elsewhere in this notebook.

## Predict over the training set and compute the F1 score.
train_predict = clf.predict(X_train)
k = f1_score(Y_train, train_predict)
print('Training F1 Score', k)

## Predict over the test set and compute the F1 score.
test_predict = clf.predict(X_test)
k1 = f1_score(Y_test, test_predict)
print('Testing F1 Score', k1)

# Comparing scores of different Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# Classifiers to compare. Every estimator with a random component is seeded
# so that the reported accuracies are the same on every run; the decision
# tree and the random forest were previously unseeded.
lr = LogisticRegression(C=0.1, random_state=42, solver='liblinear')
dt = DecisionTreeClassifier(random_state=42)
rm = RandomForestClassifier(random_state=42)
# KNN has no random component, so it takes no seed.
knn = KNeighborsClassifier(n_neighbors=3)


In [None]:
# Fit each classifier and print its accuracy on the training and test splits.
labelled_models = (
    (lr, "Logistic Regression"),
    (dt, "Decision Tree"),
    (knn, "KNN"),
    (rm, "Random Forest"),
)
for model, label in labelled_models:
    model.fit(X_train, Y_train)
    prediction = model.predict(X_train)
    Y_pred = model.predict(X_test)
    score1 = accuracy_score(Y_train, prediction)
    score = accuracy_score(Y_test, Y_pred)
    print(f"[{label}] training data accuracy is : {score1:f}")
    print(f"[{label}] test data accuracy is : {score:f}")

In [None]:
# Test-split accuracy of each fitted classifier, keyed by display name.
named_models = [
    ('Logistic Regression', lr),
    ('KNN classifier', knn),
    ('Random forest', rm),
    ('Decision tree', dt),
]
model_scores = {name: model.score(X_test, Y_test) for name, model in named_models}
model_scores