In [1]:
#!pip install scikit-learn
#!pip install pandas
#!pip install imblearn

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler


In [3]:
#Load dataset
url = 'https://raw.githubusercontent.com/joneikholmkea/machine-learning/main/csv/customer_staying_or_not.csv'
df = pd.read_csv(url)

In [4]:
#Drop irrelevant columns
df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], inplace=True)
df.head(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
#One hot encoding of categorical feautures with drop first and cast them to binaries
df = pd.get_dummies(df, columns=['Geography', 'Gender'], drop_first=True)
df

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.00,1,1,1,101348.88,1,False,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,True,False
2,502,42,8,159660.80,3,1,0,113931.57,1,False,False,False
3,699,39,1,0.00,2,0,0,93826.63,0,False,False,False
4,850,43,2,125510.82,1,1,1,79084.10,0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,0,False,False,True
9996,516,35,10,57369.61,1,1,1,101699.77,0,False,False,True
9997,709,36,7,0.00,1,0,1,42085.58,1,False,False,False
9998,772,42,3,75075.31,2,1,0,92888.52,1,True,False,True


In [6]:
# Normalize numerical features
num_col = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

scaler = MinMaxScaler()
df[num_col] = scaler.fit_transform(df[num_col])

In [7]:
df

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,0.538,0.324324,0.2,0.000000,0.000000,1,1,0.506735,1,False,False,False
1,0.516,0.310811,0.1,0.334031,0.000000,0,1,0.562709,0,False,True,False
2,0.304,0.324324,0.8,0.636357,0.666667,1,0,0.569654,1,False,False,False
3,0.698,0.283784,0.1,0.000000,0.333333,0,0,0.469120,0,False,False,False
4,1.000,0.337838,0.2,0.500246,0.000000,1,1,0.395400,0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.842,0.283784,0.5,0.000000,0.333333,1,0,0.481341,0,False,False,True
9996,0.332,0.229730,1.0,0.228657,0.000000,1,1,0.508490,0,False,False,True
9997,0.718,0.243243,0.7,0.000000,0.000000,0,1,0.210390,1,False,False,False
9998,0.844,0.324324,0.3,0.299226,0.333333,1,0,0.464429,1,True,False,True


In [8]:
#Split the data into featues X and  target y
X = df.drop(columns='Exited')
y = df.Exited

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# Initialize and train the logistic regression model with class weights
model = LogisticRegression(class_weight='balanced', max_iter=10000)
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.72      0.80      1607
           1       0.38      0.72      0.50       393

    accuracy                           0.72      2000
   macro avg       0.65      0.72      0.65      2000
weighted avg       0.81      0.72      0.74      2000

