In [60]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")


### Этап 1. Загрузка данных

In [2]:
df = pd.read_csv("C:/Users/Даниил/Desktop/ОМО/users_behavior.csv")

df.head()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [4]:
X = df[['calls', 'minutes', 'messages', 'mb_used']]
y = df['is_ultra']

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [6]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [7]:
print(lr.score(X_valid, y_valid))

0.7316062176165803


### Stacking (стекинг)

In [8]:
estimators = [('lr', LogisticRegression()), ('dt', DecisionTreeClassifier())]
modelClf = StackingClassifier(estimators=estimators, final_estimator=SVC())

In [9]:
modelClf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
print(modelClf.score(X_valid, y_valid))

0.7378238341968912


### Bagging (бэггинг)

In [64]:
modelClf = BaggingClassifier(base_estimator=LogisticRegression(), n_estimators=50, random_state=12)

In [65]:
modelClf.fit(X_train, y_train)

In [66]:
print(modelClf.score(X_valid, y_valid))

0.7139896373056995


#### Попробуем провести подбор параметров для улучшения качества модели:

In [61]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
modelClf.fit(X_scaled, y_train)

params = {'lr__solver': ['liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga', 'lbfgs'],
          'lr__penalty':['l2'],
          'lr__max_iter': [10000],
          'lr__C': [0.1, 1, 10, 100]}

grid = GridSearchCV(estimator=modelClf, param_grid=params, cv=5, scoring='accuracy')
grid_result = grid.fit(X_train, y_train)

print("parameter ", grid_result.best_params_)

parameter  {'lr__C': 0.1, 'lr__max_iter': 10000, 'lr__penalty': 'l2', 'lr__solver': 'newton-cg'}


In [59]:
print("parameter ", grid_result.best_params_)

parameter  {'lr__C': 1, 'lr__max_iter': 10000, 'lr__penalty': 'l2', 'lr__solver': 'newton-cg'}


In [63]:
modelClf = BaggingClassifier(base_estimator=LogisticRegression(C=1, penalty='l2', solver='newton-cg', max_iter=10000), n_estimators=50, random_state=12)
modelClf.fit(X_train, y_train)
print(modelClf.score(X_valid, y_valid))

0.7305699481865285


После подбора параметров точность увеличилась с 0.713 до 0.730

### Random Forest (случайный лес)

In [14]:
clf = RandomForestClassifier(n_estimators=20, max_depth=6,
                             min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X_train, y_train, cv=3)
scores.mean()

0.8096899569796766

In [15]:
clf.fit(X_train, y_train)

In [16]:
print(clf.score(X_valid, y_valid))

0.783419689119171


In [18]:
best_score = 0
best_params = None

for max_depth in range(2, 10):
    for n_estimators in range(10, 31, 10):
        model_rfr = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=12345)
        score = cross_val_score(model_rfr, X_train, y_train, cv=3, n_jobs=-1).mean()
        if score > best_score:
            best_score = score
            best_params = {'max_depth': max_depth, 'n_estimators': n_estimators}

print('Лучшее значение accuracy для случайного леса: {} при значениях гиперпараметров: {}'.format(best_score, best_params))

Лучшее значение accuracy для случайного леса: 0.8110262572318647 при значениях гиперпараметров: {'max_depth': 6, 'n_estimators': 10}


### Boosting (бустинг)

#### Adaboost (адаптивный бустинг)

In [20]:
modelClf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=100, random_state=12)

In [22]:
modelClf.fit(X_train, y_train)

In [23]:
print(modelClf.score(X_valid, y_valid))

0.7626943005181347


#### Gradient Boosting (Градиентный бустинг)

In [25]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1,
                                 max_depth=1, random_state=0)

In [27]:
clf.fit(X_train, y_train)

In [28]:
print(clf.score(X_valid, y_valid))

0.7813471502590673


### Voting (Простое усреднение)

In [29]:
decisiontree = DecisionTreeClassifier(max_depth=2)

In [30]:
forest = RandomForestClassifier(n_estimators=20, max_depth=6,
                             min_samples_split=2, random_state=0)

In [31]:
ensemble=VotingClassifier(estimators=[('Decision Tree', decisiontree), ('Random Forest', forest)], 
                       voting='soft', weights=[1,1]).fit(X_train, y_train)

In [33]:
print(ensemble.score(X_valid, y_valid))

0.7689119170984456


### Итоги:

После применения ансамблевых методов получили следующие параметры точности:

**Stacking** - 0.7378238341968912

**Bagging** (с подбором гиперпараметров) - 0.7305699481865285

**Bagging** (без подбора гиперпараметров) - 0.7139896373056995

**Random Forest** - 0.8096899569796766

**Adaboost** - 0.7626943005181347

**Gradient Boosting** - 0.7813471502590673

**Voting** - 0.7689119170984456

Лучшим показателем точности обладает **Random Forest** - 0.8096899569796766
