# Problem Statement and Business Case

The HR team collected extensive data on their employees and approached us to develop a model that could predict which employees are more likely to quit. 
The team provided you with an extensive dataset; here are the columns it contains: 
<strong>
1. Age
2. BusinessTravel
3. DailyRate
4. Department
5. DistanceFromHome
6. Education
7. EducationField
8. EmployeeCount
9. EmployeeNumber
10. EnvironmentSatisfaction
11. Gender
12.  HourlyRate
13. JobInvolvement
14. JobLevel
15. JobRole
16. JobSatisfaction
17. MaritalStatus
18. MonthlyRate
19. NumCompaniesWorked 
20. Over18
21. OverTime
22.  PercentSalaryHike
23. PerformanceRating
24. Attrition
25. RelationshipSatisfaction 
26. StandardHours
27. StockOptionLevel
28. TotalWorkingYears
29. TrainingTimesLastYear   
30. WorkLifeBalance
31. YearsAtCompany 
32.  YearsInCurrentRole
33. YearsSinceLastPromotion 
34. YearsWithCurrManager



</strong>

Based on the EDA, we decided to use only the following columns:
    1. Daily Rate
    2. Distance From Home
    3. Environment Satisfaction
    4. Job Level
    5. Job Satisfaction
    6. Monthly Rate
    7. Num Companies Worked
    8. Years at Company
    9. Years with manager
    10. BusinessTravel
    11. Department
    12. EducationField
    13. Gender
    14. JobRole
    15. MaritalStatus
    16. Over Time
    17. Age
    
Working on the Scaling and the Categorical Variables part.

# Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,recall_score
import keras
import tensorflow as tf

In [None]:
# Load the HR dataset from the CSV file in the working directory.
csv_path = 'Human_Resources.csv'
df = pd.read_csv(csv_path)

In [None]:
# Display the column labels of the loaded DataFrame.
df.columns

In [None]:
# Drop columns that are not used as features (presumably constant or
# identifier-only fields -- confirm against the EDA).
# NOTE: drop() with inplace=True returns None, so its result must not be
# assigned back to df; use the returning form instead.
df = df.drop(columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'])

In [None]:
# Display the column labels that remain in the DataFrame.
df.columns

In [None]:
# Build a separate list of the categorical (object-dtype) columns and print
# the distinct values each of them takes.
categorical_col = []
for column in df.columns:
    if df[column].dtype == object:
        categorical_col.append(column)
        print(f"{column} : {df[column].unique()}")
        print("====================================")

In [None]:
# Replace the Attrition labels with their integer category codes so the
# target column is numeric.
attrition_as_category = df['Attrition'].astype("category")
df['Attrition'] = attrition_as_category.cat.codes

In [None]:
# Show how many rows fall into each encoded Attrition class.
df.Attrition.value_counts()

### Data PreProcessing

#### Handling the Categorical Variables using Label Encoder

In [None]:
# The target must not be encoded together with the feature columns.
# Guard the removal so re-running this cell does not raise ValueError.
if 'Attrition' in categorical_col:
    categorical_col.remove('Attrition')

In [None]:
# Integer-encode every categorical feature column in place.
# (One-hot encoding through pd.get_dummies was an earlier alternative and is
# not used here.)
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
for column in categorical_col:
    # The encoder is re-fitted per column, so codes are independent
    # between columns.
    encoded_values = label.fit_transform(df[column])
    df[column] = encoded_values

In [None]:
# Preview the first rows of the DataFrame after encoding.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors.
y = df['Attrition']
x = df.drop(columns='Attrition')

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Single scaler instance shared by the scaling cells below.
sc = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split, then apply them to it.
sc.fit(x_train)
xtrain_scaled = pd.DataFrame(sc.transform(x_train))

In [None]:
# Scale the test split with the statistics already learned from the training
# split. Calling fit_transform here would re-fit the scaler on test data,
# leaking test information and scaling the two splits inconsistently.
xtest_scaled = pd.DataFrame(sc.transform(x_test))

In [None]:
# Scaling returns plain arrays, so restore the feature names on both scaled
# frames from the corresponding unscaled split.
xtrain_scaled.columns = x_train.columns
xtest_scaled.columns = x_test.columns

# Restore the original row labels as well, so each scaled frame lines up with
# its y_train / y_test counterpart.
xtrain_scaled.index = x_train.index
xtest_scaled.index = x_test.index


# Applying machine learning algorithms

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, xtrain_scaled, y_train, xtest_scaled, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for ``clf``.

    Parameters
    ----------
    clf
        Fitted estimator exposing ``predict``.
    xtrain_scaled, y_train
        Training features and their labels.
    xtest_scaled, y_test
        Test features and their labels.
    train : bool, default True
        When truthy the report covers the training split; otherwise it
        covers the test split.

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Pick the split from the arguments that were passed in, never from
    # module-level globals.
    if train:
        features, labels, title = xtrain_scaled, y_train, "Train Result"
    else:
        features, labels, title = xtest_scaled, y_test, "Test Result"

    pred = clf.predict(features)
    clf_report = pd.DataFrame(classification_report(labels, pred, output_dict=True))
    print(f"{title}:\n================================================")
    print(f"Accuracy Score: {accuracy_score(labels, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(labels, pred)}\n")

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit on the standardised features: the model must be trained on the same
# representation that print_score evaluates it on.
clf_LR = LogisticRegression(random_state=0)
clf_LR.fit(xtrain_scaled, y_train)

print_score(clf_LR, xtrain_scaled, y_train, xtest_scaled, y_test, train=True)
print_score(clf_LR, xtrain_scaled, y_train, xtest_scaled, y_test, train=False)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit on the standardised features so training and evaluation use the same
# feature representation.
clf_tree = DecisionTreeClassifier(random_state=0)
clf_tree.fit(xtrain_scaled, y_train)

print_score(clf_tree, xtrain_scaled, y_train, xtest_scaled, y_test, train=True)
print_score(clf_tree, xtrain_scaled, y_train, xtest_scaled, y_test, train=False)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit on the standardised features so training and evaluation use the same
# feature representation.
clf_rf = RandomForestClassifier(random_state=0)
clf_rf.fit(xtrain_scaled, y_train)

print_score(clf_rf, xtrain_scaled, y_train, xtest_scaled, y_test, train=True)
print_score(clf_rf, xtrain_scaled, y_train, xtest_scaled, y_test, train=False)

# Hyperparameter Tuning on the Random Forest

In [None]:
# Hyper-parameter grid explored by the grid search; every combination of the
# listed values is evaluated.
parameters = {
    'max_depth': [3, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'bootstrap': [True, False],
    'criterion': ['entropy'],
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Search the `parameters` grid with 10-fold cross-validation, scored by
# accuracy, on all available cores. The base estimator is built here so the
# cell does not depend on a model that is only defined later.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)
# Search on the standardised features, matching what the models are scored on.
grid_search.fit(xtrain_scaled, y_train)
best_params = grid_search.best_params_
print(f"Best paramters: {best_params}")

In [None]:
# Refit a random forest with the best hyper-parameters from the grid search.
# It is trained on the standardised features it is evaluated on, and seeded
# like the other models so results are reproducible.
clf1 = RandomForestClassifier(random_state=0, **best_params)
clf1.fit(xtrain_scaled, y_train)

print_score(clf1, xtrain_scaled, y_train, xtest_scaled, y_test, train=True)
print_score(clf1, xtrain_scaled, y_train, xtest_scaled, y_test, train=False)

# Feature Selection for LR algo

# Working on Balancing the Dataset