# Libraries

In [252]:
import pandas as pd
import numpy as np
import math

# Preprocessing

In [253]:
data = pd.read_csv("/content/Churn_Modelling.csv")
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [254]:
Y = data['Exited'].astype(float).values
df = data.iloc[:,3:13] #droping the columns that dont help in training 0: RowNumber ; 1:CustomerId ; 2:Surname
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.00,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.80,3,1,0,113931.57
3,699,France,Female,39,1,0.00,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77
9997,709,France,Female,36,7,0.00,1,0,1,42085.58
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52


In [255]:
string = ''
for country in df['Geography'].values:
  if country in string:
    pass
  else:
    string += f"{country},"
string

'France,Spain,Germany,'

In [256]:
df2 = pd.DataFrame.copy(df)
df2['Gender'].replace({'Male':0, 'Female': 1}, inplace=True)
df2['Geography'].replace({'France':0,'Spain':1,'Germany':2},inplace=True)
df2 = df2.astype({'CreditScore':float,'Geography':float,'Gender':float,'Age':float,'Tenure':float,'NumOfProducts':float,'HasCrCard':float,'IsActiveMember':float})
df2.dtypes

CreditScore        float64
Geography          float64
Gender             float64
Age                float64
Tenure             float64
Balance            float64
NumOfProducts      float64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
dtype: object

In [257]:
features = df.values #Features with categorical values
numerical_features = df2.values # Features withougt categorical values

# Logistic Regression

## Model

In [258]:
class LogisticRegression:
  def __init__(self):
      self.coef = 0
      self.intercept = 0
      self.cost_history = 0
      self.coef_history = 0


  def __len__(self,object_):
    return len(object_)

  def sigmoid(self,z):
      return 1/(1+(np.e**(-z)))

  def compute_cost(self,X, y, w, b):
    """
    Computes the cost over all examples
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (ndarray Shape (m,))  target value
      w : (ndarray Shape (n,))  values of parameters of the model
      b : (scalar)              value of bias parameter of the model
    Returns:
      total_cost : (scalar) cost
    """

    m, n = X.shape

    cost = 0
    for i in range(m):
        z_wb = np.dot(X[i],w) + b
        f_z_wb = self.sigmoid(z_wb)
        loss = (-y[i]*np.log(f_z_wb) - (1-y[i])*np.log(1-f_z_wb))
        cost += loss
    total_cost = cost/m

    return total_cost

  def compute_gradient(self,X, y, w, b):
    """
    Computes the gradient for logistic regression

    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (ndarray Shape (m,))  target value
      w : (ndarray Shape (n,))  values of parameters of the model
      b : (scalar)              value of bias parameter of the model
    Returns
      dj_dw : (ndarray Shape (n,)) The gradient of the cost w.r.t. the parameters w.
      dj_db : (scalar)             The gradient of the cost w.r.t. the parameter b.
    """

    m, n = X.shape
    dj_dw = np.zeros(w.shape)
    dj_db = 0.

    for i in range(m):
        z_wb = np.dot(X[i],w) + b
        f_wb = self.sigmoid(z_wb)
        err_i = f_wb - y[i]

        dj_db += err_i

        for j in range(n):
            dj_dw[j] += err_i*X[i][j]

    dj_dw = dj_dw/m
    dj_db = dj_db/m


    return dj_db, dj_dw

  def fit(self,X, y,random_state = 0, cost_function=None, gradient_function=None,max_iter=10000,learn_rate=0.001): #gradient descendent
    """
    Performs batch gradient descent to learn theta. Updates theta by taking
    num_iters gradient steps with learning rate alpha

    Args:
      X :    (ndarray Shape (m, n) data, m examples by n features
      y :    (ndarray Shape (m,))  target value
      w_in : (ndarray Shape (n,))  Initial values of parameters of the model
      b_in : (scalar)              Initial value of parameter of the model
      cost_function :              function to compute cost
      gradient_function :          function to compute gradient
      alpha : (float)              Learning rate
      num_iters : (int)            number of iterations to run gradient descent

    Returns:
      w : (ndarray Shape (n,)) Updated values of parameters of the model after
          running gradient descent
      b : (scalar)                Updated value of parameter of the model after
          running gradient descent
    """

    # number of training examples and features
    m,n = X.shape

    np.random.seed(random_state)

    w_in =(np.random.rand(n))
    b_in = 0

    # An array to store cost J and w's at each iteration primarily for graphing later
    self.cost_history = []
    self.coef_history = []

    for i in range(max_iter):

        # Calculate the gradient and update the parameters
        dj_db, dj_dw = self.compute_gradient(X, y, w_in, b_in)
        # Update Parameters using w, b, alpha(learn_rate) and gradient
        w_in = w_in - learn_rate * dj_dw
        b_in = b_in - learn_rate * dj_db
        # Save cost J at each iteration
        if i<max_iter:      # prevent resource exhaustion
            cost =  self.compute_cost(X, y, w_in, b_in)
            self.cost_history.append(cost)

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i % (max_iter // 10) == 0 or i == (max_iter-1):
            self.coef_history.append(w_in)
            print(f"Iteration {i:4}: Cost {float(self.cost_history[-1]):8.2f}   ")
    self.coef = w_in
    self.intercept = b_in

  def predict(self,X,threshold=0.5):
    """
    Predict whether the label is 0 or 1 using learned logistic
    regression parameters w

    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      w : (ndarray Shape (n,))  values of parameters of the model
      b : (scalar)              value of bias parameter of the model

    Returns:
      p : (ndarray (m,)) The predictions for X using a threshold at 0.5
    """
    w = self.coef
    b = self.intercept

    # number of training examples
    m, n = X.shape
    predict_ = np.zeros(m)

    # Loop over each example
    for i in range(m):
        # Calculate f_wb (exactly how you did it in the compute_cost function above)
        z_wb = 0
        # Loop over each feature
        for j in range(n):
        # Add the corresponding term to z_wb
            z_wb_ij = X[i, j] * w[j]
            z_wb += z_wb_ij

        # Add bias term
        z_wb += b
        # Calculate the prediction from the model
        f_wb = self.sigmoid(z_wb)

        # Apply the threshold.
        predict_[i] =1 if f_wb >= threshold else 0

    return predict_


    def coef(self):
      return self.coef

    def intercept(self):
      return self.intercept

    def cost_hist(self):
      return self.cost_history

    def coef_hist(self):
      return self.coef.history

## Auxiliary Functions

In [259]:
def accuracy(prediction, y_test) -> float:
    yes_sum = sum(1 for i, result in enumerate(prediction) if y_test[i] == result)
    no_sum = len(prediction) - yes_sum
    accuracy_percentage = yes_sum / len(prediction) * 100
    return accuracy_percentage
def normalize_data(data):
    #calcula esperança e desvio padrao
    data_normalized = data.copy()
    for columns in data_normalized.T:
      mean = columns.mean()
      std_deviation = columns.std()
      for i,value in enumerate(columns):
        columns[i] = (value - mean)/std_deviation
    return data_normalized

def split_train_test(x,y,test_size=0.3):
  N_train = len(x)*(1-test_size) ; N_test = len(x)*(1-test_size)
  x_train,y_train,x_test,y_test = [],[],[],[]

  for i,row in enumerate(x):
    if i<N_train:
       x_train.append(row)
       y_train.append(y[i])
    else:
       x_test.append(row)
       y_test.append(y[i])

  return np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)

## Train

In [261]:
numerical_features_esc = normalize_data(numerical_features)

In [262]:
x_train, x_test, y_train, y_test = split_train_test(numerical_features_esc, Y, test_size = 0.30)

In [263]:
model = LogisticRegression()
model.fit(x_train,y_train,max_iter=300,learn_rate=0.1)

Iteration    0: Cost     0.99   
Iteration   30: Cost     0.63   
Iteration   60: Cost     0.50   
Iteration   90: Cost     0.46   
Iteration  120: Cost     0.44   
Iteration  150: Cost     0.44   
Iteration  180: Cost     0.43   
Iteration  210: Cost     0.43   
Iteration  240: Cost     0.43   
Iteration  270: Cost     0.43   
Iteration  299: Cost     0.43   


## Test

In [264]:
prevision = model.predict(x_test,threshold=0.5)
prevision

array([0., 0., 0., ..., 0., 0., 0.])

### Analysis in the train set

In [265]:
print(f"Train Accuracy: {accuracy(prevision,y_train):.2f}%")

Train Accuracy: 74.60%


### Analysis in the test set

In [266]:
print(f"Test Accuracy: {accuracy(prevision,y_test):.2f}%")

Test Accuracy: 81.67%
