# Import libraries

In [1]:
import pandas as pd
from utils.functions_python import *

# Import data

In [2]:
# Load the training and test sets from the CSV files in the working directory.
train, test = (
    pd.read_csv(csv_name, sep = ",", encoding = "utf-8")
    for csv_name in ("train.csv", "test.csv")
)

# Data format 

In [3]:
# Preview the first five rows of the training data as loaded.
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
# Column dtypes as inferred by pandas when reading the CSV.
train.dtypes

id                   int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited               int64
dtype: object

In [5]:
# Remove features that don't serve any purpose (row id and surname).
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising a KeyError.
# NOTE(review): CustomerId is also an identifier and is still kept here --
# confirm whether it should be dropped before modeling.
train = train.drop(columns = ["id", "Surname"], errors = "ignore")

In [6]:
# Run the project's basic-treatment helper (pulled in by the star import from
# utils.functions_python) and rebind `train` to its result.
# NOTE(review): by the time of the later head() output, column names and string
# values are lowercased -- presumably done here or in data_types; confirm in the helper.
train = data_basic_treatment(train)

In [7]:
# Run the project's dtype-conversion helper (pulled in by the star import from
# utils.functions_python) and rebind `train` to its result.
# NOTE(review): the dtypes shown in the next cell (customerid as object, integer
# columns as int32) are presumably set here -- confirm in the helper.
train = data_types(train)

In [8]:
# Re-check the column dtypes after the data_basic_treatment / data_types steps.
train.dtypes

customerid          object
creditscore          int32
geography           object
gender              object
age                  int32
tenure               int32
balance            float64
numofproducts        int32
hascrcard            int32
isactivemember       int32
estimatedsalary    float64
exited               int32
dtype: object

In [9]:
# Preview the first five rows after the data_basic_treatment / data_types steps.
train.head()

Unnamed: 0,customerid,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,15674932,668,france,male,33,3,0.0,2,1,0,181449.97,0
1,15749177,627,france,male,33,1,0.0,2,1,1,49503.5,0
2,15694510,678,france,male,40,10,0.0,2,1,0,184866.69,0
3,15741417,581,france,male,34,2,148882.54,1,1,1,84560.88,0
4,15766172,716,spain,male,33,5,0.0,2,1,1,15068.83,0


# Exploratory Data Analysis

In [10]:
# Per-feature summary: number of unique values, missing percent and total number of records.
# NOTE(review): the displayed result lists only 10 of the 12 columns
# (estimatedsalary and exited are absent) -- confirm whether basic_eda
# truncates its output or the display was cut.
basic_eda(train)

Unnamed: 0,feature,nunique_values,missing_percent,ntotal_values
0,customerid,23221,0.0,165034
1,creditscore,457,0.0,165034
2,geography,3,0.0,165034
3,gender,2,0.0,165034
4,age,69,0.0,165034
5,tenure,11,0.0,165034
6,balance,30075,0.0,165034
7,numofproducts,4,0.0,165034
8,hascrcard,2,0.0,165034
9,isactivemember,2,0.0,165034


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for selected numeric features.
train.loc[:, ['age', 'balance', 'numofproducts', 'estimatedsalary']].describe()

Unnamed: 0,age,balance,numofproducts,estimatedsalary
count,165034.0,165034.0,165034.0,165034.0
mean,38.125883,55478.086689,1.554455,112574.822734
std,8.867207,62817.663278,0.547154,50292.865585
min,18.0,0.0,1.0,11.58
25%,32.0,0.0,1.0,74637.57
50%,37.0,0.0,2.0,117948.0
75%,42.0,119939.5175,2.0,155152.4675
max,92.0,250898.09,4.0,199992.48


## TODO: create visualizations to explore the data in more depth

# Data modeling

In [12]:
# one-hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to expand into indicator (one-hot) columns; every other column of
# `train` is passed through unchanged.
onehot_columns = ['geography', 'gender', 'hascrcard', 'isactivemember']

# ColumnTransformer fits its own clone of the encoder, so a separate
# encoder.fit(...) call is not needed.
#   * sparse_threshold = 0 always returns a dense array.
#   * verbose_feature_names_out = False keeps plain column names
#     (no "onehot__" / "remainder__" prefixes).
transform = ColumnTransformer(transformers = [('onehot', OneHotEncoder(), onehot_columns)],
                              remainder = 'passthrough',
                              sparse_threshold = 0,
                              verbose_feature_names_out = False)
encoded_values = transform.fit_transform(train)

# Expose the encoder that actually produced the matrix (the fitted clone), so
# the column names below are guaranteed to match the transformed data.
encoder = transform.named_transformers_['onehot']
encoded_columns = encoder.get_feature_names_out()

# Build the DataFrame with names taken from the fitted transformer:
# one-hot columns first, then the passthrough columns in their original order.
train_encoded = pd.DataFrame(encoded_values, columns = transform.get_feature_names_out())

# Stacking the float indicators with the string customerid column yields an
# object array, so every column arrives as dtype `object`. Restore real dtypes:
# float64 for the indicators, the original dtype for each passthrough column.
restored_dtypes = dict.fromkeys(encoded_columns, 'float64')
restored_dtypes.update(train.dtypes.drop(onehot_columns).to_dict())
train_encoded = train_encoded.astype(restored_dtypes)

# NOTE(review): customerid is an identifier and is still part of train_encoded;
# confirm it is excluded from the feature matrix before fitting a model.

In [13]:
# Inspect the encoded frame (pandas truncates the display of long frames).
train_encoded

Unnamed: 0,geography_france,geography_germany,geography_spain,gender_female,gender_male,hascrcard_0,hascrcard_1,isactivemember_0,isactivemember_1,customerid,creditscore,age,tenure,balance,numofproducts,estimatedsalary,exited
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,15674932,668,33,3,0.0,2,181449.97,0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,15749177,627,33,1,0.0,2,49503.5,0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,15694510,678,40,10,0.0,2,184866.69,0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,15741417,581,34,2,148882.54,1,84560.88,0
4,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,15766172,716,33,5,0.0,2,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,15667085,667,33,2,0.0,1,131834.75,0
165030,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,15665521,792,35,3,0.0,1,131834.45,0
165031,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,15664752,565,31,5,0.0,1,127429.56,0
165032,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,15689614,554,30,7,161533.0,1,71173.03,0


In [14]:
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'xgboost'