In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold

# Загружаю данные

In [3]:
data = pd.read_csv("orange_small_churn_data.train", sep=",")
data["label"] = pd.read_csv("orange_small_churn_labels.train", header=None, names=["label"])

# Отделяю тестовые данные

In [4]:
data_train, data_test = train_test_split(data, test_size=0.1, random_state=42, shuffle=True, stratify=data["label"])

In [5]:
data_test.to_csv("data_test.csv")

In [23]:
data_train.to_csv("data_train.csv")

In [6]:
features_raw = data_train[data_train.columns[:-1]]
label = data_train["label"]

In [22]:
features_raw.shape()

TypeError: 'tuple' object is not callable

# Готовлю признаки

In [7]:
numerical_features = data.columns[:-41]
categorical_features = data.columns[-41:-1]

In [21]:
data.shape()

TypeError: 'tuple' object is not callable

## Отбрасываю пустые признаки (без значений)

In [8]:
empty_features = features_raw.columns[features_raw.nunique() == 0]
print(empty_features)

Index([u'Var8', u'Var15', u'Var20', u'Var31', u'Var32', u'Var39', u'Var42',
       u'Var48', u'Var52', u'Var55', u'Var79', u'Var141', u'Var167', u'Var169',
       u'Var175', u'Var185', u'Var209', u'Var230'],
      dtype='object')


In [9]:
features_raw.drop(columns=empty_features, inplace=True)

numerical_features = numerical_features.drop(labels=empty_features, errors="ignore")
categorical_features = categorical_features.drop(labels=empty_features, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


## Готовлю pipeline

In [10]:
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
    ("scaler", RobustScaler())
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("one_hot", OneHotEncoder(handle_unknown="ignore"))
])

full_pipeline = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, numerical_features),
        ("categorical", categorical_pipeline, categorical_features)
    ],
    n_jobs=-1
)

## Преобразовываю признаки

In [11]:
features_raw.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36000 entries, 20579 to 13246
Columns: 212 entries, Var1 to Var229
dtypes: float64(173), int64(1), object(38)
memory usage: 105.1 MB


In [12]:
features_transformed = full_pipeline.fit_transform(features_raw)

In [13]:
features_transformed

<36000x59951 sparse matrix of type '<type 'numpy.float64'>'
	with 2539925 stored elements in Compressed Sparse Row format>

# Оцениваю baseline-модели

In [14]:
ridge_clf = RidgeClassifier(random_state=42)
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

In [15]:
classifiers = {
    "Ridge (Default)": ridge_clf,
    "Random Forest (Default)": rf_clf,
    "Gradient Boosting (Default)": gb_clf
}

In [16]:
cv_strategy = StratifiedKFold(n_splits=10, random_state=42)

In [17]:
scoring_results = pd.DataFrame(
    columns=["ROC-AUC", "Accuracy", "F1 score", "Precision", "Recall"],
    index=classifiers.keys()
)

In [18]:
for classifier_name in classifiers.keys():    
    scores = cross_validate(
        estimator=classifiers[classifier_name],
        X=features_transformed,
        y=label,
        scoring=["roc_auc", "accuracy", "f1", "precision", "recall"],
        cv=cv_strategy,
        n_jobs=-1
    )

    scoring_results.loc[classifier_name] = [
        round(np.mean(scores["test_roc_auc"]), 4),
        round(np.mean(scores["test_accuracy"]), 4),
        round(np.mean(scores["test_f1"]), 4),
        round(np.mean(scores["test_precision"]), 4),
        round(np.mean(scores["test_recall"]), 4)
    ]

In [19]:
scoring_results

Unnamed: 0,ROC-AUC,Accuracy,F1 score,Precision,Recall
Ridge (Default),0.6043,0.9256,0.0,0.0,0.0
Random Forest (Default),0.5739,0.9255,0.0037,0.2167,0.0019
Gradient Boosting (Default),0.7339,0.9248,0.0152,0.2929,0.0078
