In [1]:
import numpy as np
import pandas as pd

In [3]:
# Read the insurance dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [27]:
# Target: the 'charges' column; features: every other column of df.
y = df['charges']
x = df.drop('charges', axis=1)
from sklearn.model_selection import train_test_split

In [29]:
# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Columns that will be standardised.
numeric = [
    "age",
    "bmi",
]

# Columns that will be one-hot encoded.
# NOTE: 'children' holds integer counts in the preview but is treated as a category here.
categoric = [
    "sex",
    "children",
    "smoker",
    "region",
]

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric branch: a single standardisation step named 'Scaler'.
numeric_transformer = Pipeline(steps=[("Scaler", StandardScaler())])

# Categorical branch: one-hot encoding that drops the first level of each feature.
# NOTE(review): combined with handle_unknown='ignore', a category unseen during fit
# is presumably encoded as all zeros, i.e. indistinguishable from the dropped
# level — confirm against the installed scikit-learn version.
categoric_transformer = Pipeline(
    steps=[
        ("Encoding", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ],
)

In [35]:
# Send each column group through its own transformer and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("Numeric", numeric_transformer, numeric),
        ("Categoric", categoric_transformer, categoric),
    ],
)

In [37]:
# Bare expression: shows the ColumnTransformer's representation as the cell output.
preprocessor

In [39]:
# Fit the preprocessor on the training split only, then reuse the fitted
# preprocessor on the test split so no test-set statistics are learned.
x_train_trf = preprocessor.fit_transform(x_train)
x_test_trf = preprocessor.transform(x_test)

# Regression

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Linear-regression baseline on the preprocessed features (fit returns the estimator).
lr = LinearRegression().fit(x_train_trf, y_train)
y_pred = lr.predict(x_test_trf)

# NOTE: `accuracy` holds the R^2 score on the held-out split; it is printed
# scaled by 100 under the label "Accuracy".
accuracy = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 78.26%


In [53]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
# Depth-limited regression tree with a fixed seed, fitted on the training split.
model = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=4,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
).fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# NOTE: `accuracy` holds the R^2 score on the held-out split, printed as a percentage.
accuracy = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 86.59%


In [61]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
# Random forest of depth-4 trees with a fixed seed, fitted on the training split.
model = RandomForestRegressor(max_depth=4, random_state=42).fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# NOTE: `accuracy` holds the R^2 score on the held-out split, printed as a percentage.
accuracy = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 86.92%


In [55]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# 5-nearest-neighbour regressor with uniformly weighted neighbours.
model = KNeighborsRegressor(
    n_neighbors=5,
    weights='uniform',
).fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# NOTE: `accuracy` holds the R^2 score on the held-out split, printed as a percentage.
accuracy = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 71.15%


In [69]:
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# RBF-kernel support vector regressor; all other hyperparameters are left at
# their library defaults.
model = SVR(kernel='rbf').fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# NOTE: `accuracy` holds the R^2 score, which can be negative (as in the recorded
# output of this cell).
# NOTE(review): the target is not scaled before fitting; presumably the default
# SVR settings underfit at this target magnitude — confirm before comparing models.
accuracy = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: -7.11%


In [67]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

# Gradient-boosted regression trees with default hyperparameters.
# random_state is pinned to 42, the seed already used by the decision-tree and
# random-forest cells, so that re-running this cell reproduces the same fitted
# model and score instead of depending on the global random state.
model = GradientBoostingRegressor(random_state=42)

model.fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)
# NOTE: `accuracy` holds the R^2 score on the held-out split, printed as a percentage.
accuracy = r2_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 87.74%


# Classification

In [111]:
# Read the attrition dataset and preview its first rows.
# NOTE: this rebinds `df`, replacing the insurance frame loaded earlier.
df = pd.read_csv(filepath_or_buffer="Attrition (1).csv")
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [113]:
# Print the (rows, columns) shape, then show the missing-value count per column.
print(df.shape)
df.isna().sum()


(1470, 35)


Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [115]:
# Columns removed before modelling.
# NOTE(review): presumably constant or identifier-like columns (EmployeeCount and
# StandardHours show a single value in the preview) — verify on the full data.
cols_to_drop = [
    "EmployeeCount",
    "EmployeeNumber",
    "Over18",
    "StandardHours",
]

In [117]:
# Remove the columns listed in cols_to_drop and show the resulting shape.
# errors='ignore' keeps this cell idempotent: `df` is rebound here, so running the
# cell a second time would otherwise raise KeyError because the columns are
# already gone.
# Trade-off: a misspelled name in cols_to_drop is skipped silently, so check the
# column count in the shape shown below.
df = df.drop(columns=cols_to_drop, errors='ignore')
df.shape

(1470, 31)

In [119]:
# Print a per-column summary of the remaining frame (non-null counts and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EnvironmentSatisfaction   1470 non-null   int64 
 9   Gender                    1470 non-null   object
 10  HourlyRate                1470 non-null   int64 
 11  JobInvolvement            1470 non-null   int64 
 12  JobLevel                  1470 non-null   int64 
 13  JobRole                   1470 non-null   object
 14  JobSatisfaction         

In [129]:
# Target: the 'Attrition' column; features: every other column of df.
y = df['Attrition']
x = df.drop('Attrition', axis=1)

In [131]:
# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle.
# NOTE(review): the split is not stratified on the target — consider stratify=y
# if the Attrition classes turn out to be imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [133]:
# Columns routed to the scaler (23 names).
numeric_features = [
    "Age", "DailyRate", "DistanceFromHome", "Education",
    "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel",
    "JobSatisfaction", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
    "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction",
    "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear",
    "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
    "YearsSinceLastPromotion", "YearsWithCurrManager",
]

# Columns routed to the one-hot encoder (7 names).
categorical_features = [
    "BusinessTravel", "Department", "EducationField", "Gender",
    "JobRole", "MaritalStatus", "OverTime",
]


In [135]:
# Numeric branch: a single standardisation step named 'scaler'.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: one-hot encoding that drops the first level of each feature.
# NOTE(review): combined with handle_unknown='ignore', a category unseen during fit
# is presumably encoded as all zeros, i.e. indistinguishable from the dropped
# level — confirm against the installed scikit-learn version.
categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", drop="first")),
    ],
)

# Apply each branch to its column list and concatenate the outputs.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [137]:
# Fit the preprocessor on the training split only, then reuse the fitted
# preprocessor on the test split so no test-set statistics are learned.
x_train_trf = preprocessor.fit_transform(x_train)
x_test_trf = preprocessor.transform(x_test)

In [146]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [154]:
# Logistic-regression classifier, allowed up to 1000 solver iterations.
model = LogisticRegression(max_iter=1000)
model.fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# Report held-out accuracy as a percentage. The message ends right after the
# percent sign (no trailing parenthesis), matching the other classifier cells.
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")


Logistic Regression Accuracy: 88.78%)


In [156]:
# Entropy-criterion decision tree with a fixed seed (fit returns the estimator).
model = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
).fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# Report held-out accuracy as a percentage.
print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

Decision Tree Accuracy: 77.89%


In [158]:
# Random forest of 100 trees with a fixed seed (fit returns the estimator).
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# Report held-out accuracy as a percentage.
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

Random Forest Accuracy: 87.76%


In [160]:
# 5-nearest-neighbour classifier on the preprocessed features.
model = KNeighborsClassifier(n_neighbors=5).fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# Report held-out accuracy as a percentage.
print(f"KNN Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

KNN Accuracy: 87.07%


In [162]:
# RBF-kernel support vector classifier.
# NOTE(review): probability=True is requested, but this cell only calls
# predict(); confirm whether probability estimates are needed elsewhere.
model = SVC(
    kernel='rbf',
    probability=True,
).fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# Report held-out accuracy as a percentage.
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

SVM Accuracy: 88.78%


In [164]:
# Gaussian naive Bayes with default settings (fit returns the estimator).
model = GaussianNB().fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# Report held-out accuracy as a percentage.
print(f"Naive Bayes Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")


Naive Bayes Accuracy: 69.05%


In [170]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost classifier with default hyperparameters.
# random_state is pinned to 42, the seed used by the other seeded models in this
# notebook, so that re-running the cell reproduces the same fitted ensemble and
# accuracy instead of depending on the global random state.
model = AdaBoostClassifier(random_state=42)
model.fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# Report held-out accuracy as a percentage.
print(f"AdaBoost Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")


AdaBoost Accuracy: 87.76%




In [172]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted classifier with a fixed seed (fit returns the estimator).
model = GradientBoostingClassifier(random_state=42).fit(x_train_trf, y_train)
y_pred = model.predict(x_test_trf)

# Report held-out accuracy as a percentage.
print(f"Gradient Boosting Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")


Gradient Boosting Accuracy: 87.07%
