In [232]:
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import pandas as pd

In [233]:
df = pd.read_csv('dataset/telcom_customer_churn.csv')
df.head()

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,change_mou,...,forgntvl,ethnic,kid0_2,kid3_5,kid6_10,kid11_15,kid16_17,creditcd,eqpdays,Customer_ID
0,23.9975,219.25,22.5,0.2475,0.0,0.0,0.0,0.0,0.0,-157.25,...,0.0,N,U,U,U,U,U,Y,361.0,1000001
1,57.4925,482.75,37.425,0.2475,22.75,9.1,9.1,0.0,0.0,532.25,...,0.0,Z,U,U,U,U,U,Y,240.0,1000002
2,16.99,10.25,16.99,0.0,0.0,0.0,0.0,0.0,0.0,-4.25,...,0.0,N,U,Y,U,U,U,Y,1504.0,1000003
3,38.0,7.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,...,0.0,U,Y,U,U,U,U,Y,1812.0,1000004
4,55.23,570.5,71.98,0.0,0.0,0.0,0.0,0.0,0.0,38.5,...,0.0,I,U,U,U,U,U,Y,434.0,1000005


In [234]:
y = df['churn']
df.drop(['churn'], axis=1, inplace=True)

In [235]:
nans = df.isnull().sum()

In [236]:
len(nans)

99

In [237]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 99 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   rev_Mean          99643 non-null   float64
 1   mou_Mean          99643 non-null   float64
 2   totmrc_Mean       99643 non-null   float64
 3   da_Mean           99643 non-null   float64
 4   ovrmou_Mean       99643 non-null   float64
 5   ovrrev_Mean       99643 non-null   float64
 6   vceovr_Mean       99643 non-null   float64
 7   datovr_Mean       99643 non-null   float64
 8   roam_Mean         99643 non-null   float64
 9   change_mou        99109 non-null   float64
 10  change_rev        99109 non-null   float64
 11  drop_vce_Mean     100000 non-null  float64
 12  drop_dat_Mean     100000 non-null  float64
 13  blck_vce_Mean     100000 non-null  float64
 14  blck_dat_Mean     100000 non-null  float64
 15  unan_vce_Mean     100000 non-null  float64
 16  unan_dat_Mean     100

In [238]:
nans = nans[nans != 0].index

In [239]:
nans

Index(['rev_Mean', 'mou_Mean', 'totmrc_Mean', 'da_Mean', 'ovrmou_Mean',
       'ovrrev_Mean', 'vceovr_Mean', 'datovr_Mean', 'roam_Mean', 'change_mou',
       'change_rev', 'avg6mou', 'avg6qty', 'avg6rev', 'prizm_social_one',
       'area', 'dualband', 'refurb_new', 'hnd_price', 'phones', 'models',
       'hnd_webcap', 'truck', 'rv', 'ownrent', 'lor', 'dwlltype', 'marital',
       'adults', 'infobase', 'income', 'numbcars', 'HHstatin', 'dwllsize',
       'forgntvl', 'ethnic', 'kid0_2', 'kid3_5', 'kid6_10', 'kid11_15',
       'kid16_17', 'creditcd', 'eqpdays'],
      dtype='object')

In [240]:
df_nans = df[nans]
df_nans.head(5)

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,change_mou,...,dwllsize,forgntvl,ethnic,kid0_2,kid3_5,kid6_10,kid11_15,kid16_17,creditcd,eqpdays
0,23.9975,219.25,22.5,0.2475,0.0,0.0,0.0,0.0,0.0,-157.25,...,A,0.0,N,U,U,U,U,U,Y,361.0
1,57.4925,482.75,37.425,0.2475,22.75,9.1,9.1,0.0,0.0,532.25,...,A,0.0,Z,U,U,U,U,U,Y,240.0
2,16.99,10.25,16.99,0.0,0.0,0.0,0.0,0.0,0.0,-4.25,...,A,0.0,N,U,Y,U,U,U,Y,1504.0
3,38.0,7.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,...,D,0.0,U,Y,U,U,U,U,Y,1812.0
4,55.23,570.5,71.98,0.0,0.0,0.0,0.0,0.0,0.0,38.5,...,O,0.0,I,U,U,U,U,U,Y,434.0


In [241]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

num_cols = df.select_dtypes(include=numerics).columns
cat_cols = df.select_dtypes('object').columns

In [242]:
num_feature = []
cat_feature = []
for feature in nans:
    if feature not in cat_cols:
        num_feature.append(feature)
    else:
        cat_feature.append(feature)

In [243]:
def impute_with_median(data, NA_col):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i] = data_copy[i].fillna(data[i].median())

    return data_copy

In [244]:
def impute_with_mode(data, NA_col):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i] = data_copy[i].fillna(data[i].mode()[0])

    return data_copy

In [245]:
new_df = impute_with_median(data = df, NA_col = num_feature)

In [246]:
new_df2 = impute_with_mode(data = new_df, NA_col = cat_feature)

In [247]:
# creating instance of one-hot-encoder
encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')

new_df3 = pd.DataFrame(encoder.fit_transform(new_df2[cat_cols]).toarray())

In [248]:
new_df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,143,144,145,146,147,148,149,150,151,152
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [249]:
without_cats = []
for col in df.columns:
    if col not in cat_cols:
        without_cats.append(col)

In [250]:
len(without_cats)

78

In [251]:
new_df4 = pd.concat([new_df2[without_cats], new_df3], axis=1)

In [252]:
new_df4.head()

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,change_mou,...,143,144,145,146,147,148,149,150,151,152
0,23.9975,219.25,22.5,0.2475,0.0,0.0,0.0,0.0,0.0,-157.25,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,57.4925,482.75,37.425,0.2475,22.75,9.1,9.1,0.0,0.0,532.25,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,16.99,10.25,16.99,0.0,0.0,0.0,0.0,0.0,0.0,-4.25,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,38.0,7.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,55.23,570.5,71.98,0.0,0.0,0.0,0.0,0.0,0.0,38.5,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [253]:
X_train, X_test, y_train, y_test = train_test_split(new_df4, y, test_size=0.25, random_state=42)

In [254]:
scaler = preprocessing.StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [256]:
xgb = XGBClassifier(n_estimators=200)

In [257]:
xgb.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [258]:
y_pred = xgb.predict_proba(X_test)

In [259]:
roc_auc_score(y_test, y_pred[:, 1])

0.6782783471080895

Вывод: по метрике (0.67) результат выше 0.5, следовательно, модель чему-то научилась, однако значение скорее неудовлетворительное.