In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv


In [2]:
from pathlib import Path
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import roc_auc_score



In [3]:
data=pd.read_csv("/kaggle/input/playground-series-s4e1/train.csv",index_col="id")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 165034 entries, 0 to 165033
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   CustomerId       165034 non-null  int64  
 1   Surname          165034 non-null  object 
 2   CreditScore      165034 non-null  int64  
 3   Geography        165034 non-null  object 
 4   Gender           165034 non-null  object 
 5   Age              165034 non-null  float64
 6   Tenure           165034 non-null  int64  
 7   Balance          165034 non-null  float64
 8   NumOfProducts    165034 non-null  int64  
 9   HasCrCard        165034 non-null  float64
 10  IsActiveMember   165034 non-null  float64
 11  EstimatedSalary  165034 non-null  float64
 12  Exited           165034 non-null  int64  
dtypes: float64(5), int64(5), object(3)
memory usage: 17.6+ MB


In [5]:
data['Exited'].value_counts()

Exited
0    130113
1     34921
Name: count, dtype: int64

In [6]:
def load_data():
    # Read data
    df_train = pd.read_csv("/kaggle/input/playground-series-s4e1/train.csv",index_col="id")
    df_test = pd.read_csv("/kaggle/input/playground-series-s4e1/test.csv",index_col="id")
    # Merge the splits so we can process them together
    df = pd.concat([df_train, df_test])
    # Preprocessing
    df = encode(df)
    # Reform splits
    df_train = df.loc[df_train.index, :]
    df_test = df.loc[df_test.index, :]
    return df_train, df_test

In [7]:
def encode(df):
    df = pd.get_dummies(df, columns=['Geography'],dtype=int)
    df['Is_male'] = df['Gender'].map( {'Female': 0, 'Male': 1} ).astype(int)
    df.drop(columns=['Surname','Gender'],inplace=True)
    return df

In [8]:
def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


In [9]:
X = data.copy()
y = X.pop("Exited")

mi_scores = make_mi_scores(X, y)
mi_scores


NumOfProducts      0.092622
Age                0.081648
Surname            0.022402
IsActiveMember     0.021661
CustomerId         0.021527
Geography          0.020957
Balance            0.013803
Gender             0.013358
EstimatedSalary    0.007398
CreditScore        0.001659
Tenure             0.000000
HasCrCard          0.000000
Name: MI Scores, dtype: float64

In [10]:
data[['HasCrCard','EstimatedSalary']].groupby(data['Exited']).agg("count")

Unnamed: 0_level_0,HasCrCard,EstimatedSalary
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1
0,130113,130113
1,34921,34921


In [11]:
df_train, df_test = load_data()


In [12]:
df_train.index

Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
            9,
       ...
       165024, 165025, 165026, 165027, 165028, 165029, 165030, 165031, 165032,
       165033],
      dtype='int64', name='id', length=165034)

In [13]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score

def score_dataset(X, y, model):
    # Label encoding for categoricals
    #
    # Label encoding is good for XGBoost and RandomForest, but one-hot
    # would be better for models like Lasso or Ridge. The `cat.codes`
    # attribute holds the category levels.
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    #log_y = np.log(y)
    score = cross_val_score(
        model, X, y, cv=10)
    score = score.mean()
    #score = np.sqrt(score)
    return score


In [14]:
X = df_train.copy()
y = X.pop("Exited")


In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
baseline_score = score_dataset(X_train, y_train,XGBClassifier())
print(f"Baseline score: {baseline_score:.5f} RMSLE")


Baseline score: 0.86350 RMSLE


In [17]:
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
predictions2=xgb.predict_proba(X_val)[:,1]
#score_dataset(X_val, predictions2, xgb)


In [18]:
predictions2

array([0.1124521 , 0.01143064, 0.75511426, ..., 0.41879767, 0.00991993,
       0.04550116], dtype=float32)

In [19]:
roc_auc = roc_auc_score(y_val, predictions2)

In [20]:
roc_auc

0.887954820878621

In [21]:
df_test.pop("Exited")

id
165034   NaN
165035   NaN
165036   NaN
165037   NaN
165038   NaN
          ..
275052   NaN
275053   NaN
275054   NaN
275055   NaN
275056   NaN
Name: Exited, Length: 110023, dtype: float64

In [22]:
#test_predictions = xgb.predict_proba(df_test)[:, 1]


In [23]:
#submission_df = pd.DataFrame({'id': df_test.index, 'Exited': test_predictions})
#submission_df.to_csv('submission.csv', index=False)


In [24]:
xgb_params = dict(
    max_depth=6,           # maximum depth of each tree - try 2 to 10
    learning_rate=0.01,    # effect of each tree - try 0.0001 to 0.1
    n_estimators=1000,     # number of trees (that is, boosting rounds) - try 1000 to 8000
    min_child_weight=1,    # minimum number of houses in a leaf - try 1 to 10
    colsample_bytree=0.7,  # fraction of features (columns) per tree - try 0.2 to 1.0
    subsample=0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
    reg_alpha=0.5,         # L1 regularization (like LASSO) - try 0.0 to 10.0
    reg_lambda=1.0,        # L2 regularization (like Ridge) - try 0.0 to 10.0
    num_parallel_tree=1,   # set > 1 for boosted random forests
)

xgb2 = XGBClassifier(**xgb_params)
xgb2.fit(X_train,y_train)
predictions3=xgb2.predict_proba(X_val)[:,1]


In [25]:
roc_auc = roc_auc_score(y_val, predictions3)

In [26]:
roc_auc

0.8915894749239561

In [27]:
test_predictions = xgb2.predict_proba(df_test)[:, 1]

In [28]:
submission_df = pd.DataFrame({'id': df_test.index, 'Exited': test_predictions})
submission_df.to_csv('submission.csv', index=False)
