In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

In [2]:
X_train = pd.read_csv("orange_small_churn_data.train")
y_train = pd.read_csv("orange_small_churn_labels.train", header=None, names='y')
y_train.y = y_train.y.map({-1: 0, 1: 1}).values

In [3]:
X, _, y, _ = train_test_split(X_train, y_train.values, test_size=0.3, 
                              stratify=y_train.values, random_state=17)

In [4]:
X_real = X.dropna(axis='columns', thresh=.8*len(X) ).select_dtypes(['int', 'float'])
impute = SimpleImputer(missing_values=np.nan, strategy='median', copy=False)
impute.fit(X_real)
real_imputer = pd.DataFrame(impute.transform(X_real),columns=X_real.columns)

In [5]:
scaler = MinMaxScaler()
X_real_scaled = scaler.fit_transform(real_imputer)

In [6]:
X_cats = X.dropna(axis='columns', how='any' ).select_dtypes('object')
cat_cols = [i for i in X_cats.columns if X[i].nunique() < 30]
X_train = np.hstack( [X_real_scaled, pd.get_dummies(X_cats[cat_cols]).values] )

In [7]:
y.shape, X_train.shape

((28000L, 1L), (28000L, 150L))

In [8]:
def runModel(model, X):
    np.random.seed(17)
    kf = StratifiedKFold(n_splits=5, shuffle=True)

    cv_score =[]
    cv_f1 = []
    i=1
    for train_index,test_index in kf.split(X, y.ravel()):
        print('{} of KFold {}'.format(i, kf.n_splits))
        xtr,xvl = X_train[train_index], X_train[test_index]
        ytr,yvl = y.ravel()[train_index], y.ravel()[test_index]

        #model
        md = model
        md.fit(xtr,ytr)
        score = roc_auc_score( yvl, md.predict(xvl) )
        f1 = f1_score( yvl, md.predict(xvl) )
        print('ROC AUC score: %.3f, f1-score: %.3f' % (score, f1) )
        cv_score.append(score)
        cv_f1.append(f1)
        i+=1
    print('\nAverage:\n ROC AUC - %.2f,\n f1 - %.2f' % (np.mean(cv_score), np.mean(cv_f1)))

# Линейная модель

In [9]:
runModel( model=LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=500), X=X_train )

1 of KFold 5
ROC AUC score: 0.598, f1-score: 0.179
2 of KFold 5
ROC AUC score: 0.581, f1-score: 0.170
3 of KFold 5
ROC AUC score: 0.603, f1-score: 0.181
4 of KFold 5
ROC AUC score: 0.593, f1-score: 0.176
5 of KFold 5
ROC AUC score: 0.614, f1-score: 0.186

Average:
 ROC AUC - 0.60,
 f1 - 0.18


# RandomForest

In [10]:
df = X.dropna(axis='columns', thresh=.8*len(X) )

In [11]:
to_impute = df.select_dtypes(['int', 'float'])
impute = SimpleImputer(missing_values=np.nan, strategy='median', copy=False)
impute.fit(to_impute)
imputered = pd.DataFrame(impute.transform(to_impute),columns=to_impute.columns)

In [12]:
cat_wood = [i for i in df.select_dtypes('object') if df[i].nunique() < 500]
df_cat = df[cat_wood].fillna('unknown')

le = LabelEncoder()
df_cat = df_cat.apply(le.fit_transform)

In [13]:
df_all = np.hstack([imputered, df_cat] )
df_all.shape, y.shape

((28000L, 58L), (28000L, 1L))

In [14]:
runModel(model=RandomForestClassifier(), X=df_all)

1 of KFold 5
ROC AUC score: 0.501, f1-score: 0.005
2 of KFold 5
ROC AUC score: 0.503, f1-score: 0.014
3 of KFold 5
ROC AUC score: 0.499, f1-score: 0.000
4 of KFold 5
ROC AUC score: 0.501, f1-score: 0.005
5 of KFold 5
ROC AUC score: 0.500, f1-score: 0.000

Average:
 ROC AUC - 0.50,
 f1 - 0.00


# xgboost

In [15]:
import xgboost as xgb

In [16]:
runModel(model=xgb.XGBClassifier(), X=df_all)

1 of KFold 5
ROC AUC score: 0.500, f1-score: 0.000
2 of KFold 5
ROC AUC score: 0.500, f1-score: 0.000
3 of KFold 5
ROC AUC score: 0.500, f1-score: 0.000
4 of KFold 5
ROC AUC score: 0.500, f1-score: 0.000
5 of KFold 5
ROC AUC score: 0.500, f1-score: 0.000

Average:
 ROC AUC - 0.50,
 f1 - 0.00
