In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
import pickle

# Read the raw HR records from the CSV file; the relative path is resolved
# against the notebook's current working directory.
DATASET_CSV = 'HR_comma_sep.csv'
data = pd.read_csv(DATASET_CSV)

In [2]:
# Preprocessing
# Build the feature matrix from every column except 'left' (the target,
# kept separately as y) and 'Department', which is not used as a feature.
X = data.drop(columns=['left', 'Department'])
y = data['left']

# Convert salary column to numerical (ordered codes: low < medium < high).
salary_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Series.map() with a dict silently yields NaN for any label that is not a
# key, so an unexpected spelling (e.g. 'Medium') would only surface later as
# a NaN error during model fitting. Fail here with a precise message instead.
# Missing values already present in the column are left alone, as before.
unmapped_salary = set(X['salary'].dropna().unique()) - set(salary_mapping)
if unmapped_salary:
    raise ValueError(
        "Unexpected value(s) in 'salary' column: "
        f"{sorted(unmapped_salary, key=str)}; "
        f"expected one of {list(salary_mapping)}"
    )

X['salary'] = X['salary'].map(salary_mapping)

In [3]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the
# training split only, so nothing from the test split influences the scaling;
# the same fitted transform is then applied to both splits.
# Note: X_train / X_test are rebound to the scaler's transformed output.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [4]:
# Sanity check: show the standardized training features (the printed output
# is abbreviated with '...' for rows and columns that do not fit).
print(X_train)

[[ 1.55539086 -0.61934644 -0.65341156 ... -0.41100307 -0.14998935
  -0.93520344]
 [-1.81839394  0.37200735  0.15695385 ... -0.41100307 -0.14998935
   2.2076354 ]
 [ 0.75210876  0.48863721 -0.65341156 ... -0.41100307 -0.14998935
   0.63621598]
 ...
 [ 0.18981129  0.08043271  0.96731926 ... -0.41100307 -0.14998935
   0.63621598]
 [ 0.71194466  1.65493579  0.15695385 ... -0.41100307 -0.14998935
  -0.93520344]
 [ 1.47506265  0.83852678 -1.46377697 ... -0.41100307 -0.14998935
  -0.93520344]]


In [5]:
# Train a logistic-regression classifier with scikit-learn's default
# hyperparameters. It is fitted on the standardized training features, so
# any later input must go through the same scaler (`sc`) before prediction.
classifier = LogisticRegression().fit(X_train, y_train)

# Persist the fitted model to disk.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [6]:
# Persist the fitted StandardScaler so the identical feature scaling can be
# re-applied wherever the pickled model is used.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)


In [7]:
# Summarize the raw dataset: column names, non-null counts, dtypes and
# memory usage.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
