In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import accuracy_score as a, f1_score as f, roc_auc_score as ra
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


In [None]:


tc = pd.read_csv('https://raw.githubusercontent.com/Murcha1990/ML_math_2022/main/%D0%94%D0%BE%D0%BC%D0%B0%D1%88%D0%BA%D0%B8/HW_3/telecom_churn.csv')

sl = {"Churn": {False: 0, True: 1}, "International plan": {"No": 0, "Yes": 1}, "Voice mail plan": {"No": 0, "Yes": 1}}

tc = tc.replace(sl)

In [None]:
X = tc.drop("Churn", axis = 1)

y = tc.replace(sl)["Churn"]
print(1, X.shape)

X1 = pd.get_dummies(X["State"].values)
X = X1.join(X).drop("State", axis = 1)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)

X = pd.DataFrame(scaler.transform(X), columns=X.columns)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size = 0.7, random_state = 42)
print(2, X.shape)

1 (3333, 19)
2 (3333, 69)


In [None]:
pd.DataFrame(tc["Churn"]).value_counts()

Churn
0        2850
1         483
dtype: int64

Так как у нас 1 сильно меньше, чем 0, давайте сразу договоримся, что будем считать относительно метрики f1-score.

In [None]:
X.shape

(3333, 69)

# **Задание A** #

In [None]:
lr = LogisticRegression()
params={"C":[0.01, 0.1, 1, 10, 100]}

grid1 = GridSearchCV(lr, params, cv = 5, scoring = 'f1')

grid1.fit(Xtrain, ytrain)

In [None]:
grid1.best_params_, grid1.best_estimator_

({'C': 1}, LogisticRegression(C=1))

In [None]:
print("train:", classification_report(ytrain, grid1.predict(Xtrain)), "\ntest:", classification_report(ytest, grid1.predict(Xtest)))

train:               precision    recall  f1-score   support

           0       0.89      0.97      0.93      1993
           1       0.65      0.28      0.39       340

    accuracy                           0.87      2333
   macro avg       0.77      0.63      0.66      2333
weighted avg       0.85      0.87      0.85      2333
 
test:               precision    recall  f1-score   support

           0       0.88      0.97      0.92       857
           1       0.55      0.23      0.33       143

    accuracy                           0.86      1000
   macro avg       0.72      0.60      0.62      1000
weighted avg       0.84      0.86      0.84      1000



In [None]:
print("F1-score:", "\ntrain:", f(ytrain, grid1.predict(Xtrain)), "\ntest:", f(ytest, grid1.predict(Xtest)))
print("\n\nROC-AUC:", "\ntrain:", ra(ytrain, grid1.predict(Xtrain)), "\ntest:", ra(ytest, grid1.predict(Xtest)))

F1-score: 
train: 0.39014373716632444 
test: 0.3251231527093596


ROC-AUC: 
train: 0.6266602225436085 
test: 0.5996319899470425


Как мы видим, модель переобучилась разве что минимально.

# **Задание B** #

In [None]:
clf = SVC(kernel = 'linear', class_weight = "balanced")

In [None]:
pipeline = make_pipeline(StandardScaler(),
    GridSearchCV(SVC(kernel = 'linear', class_weight = "balanced"),
                 param_grid={"C": [0.01, 0.1, 1, 10, 100, 0.001], 'gamma': [0.1, 0.01, 1, 10, 100, 0.001]}, scoring='f1', cv=5))

pipeline.fit(Xtrain, ytrain)

In [None]:
print("train:", classification_report(ytrain, pipeline.predict(Xtrain)), "\ntest:", classification_report(ytest, pipeline.predict(Xtest)))

train:               precision    recall  f1-score   support

           0       0.95      0.78      0.86      1993
           1       0.37      0.77      0.50       340

    accuracy                           0.78      2333
   macro avg       0.66      0.77      0.68      2333
weighted avg       0.87      0.78      0.81      2333
 
test:               precision    recall  f1-score   support

           0       0.95      0.75      0.84       857
           1       0.34      0.76      0.47       143

    accuracy                           0.76      1000
   macro avg       0.64      0.76      0.65      1000
weighted avg       0.86      0.76      0.79      1000



Относительно метрики f1, модель практически не переобучилась - качество на трейне и тесте - 0,5 и 0,47.

In [None]:
print("F1-score:", "\ntrain:", f(ytrain, pipeline.predict(Xtrain)), "\ntest:", f(ytest, pipeline.predict(Xtest)))
print("\n\nROC-AUC:", "\ntrain:", ra(ytrain, pipeline.predict(Xtrain)), "\ntest:", ra(ytest, pipeline.predict(Xtest)))

F1-score: 
train: 0.5028790786948176 
test: 0.4685466377440347


ROC-AUC: 
train: 0.774907765414244 
test: 0.755101957552366


# **Задание D** #


Сначала выполним D, чтобы отсеять ненужные признаки

In [None]:
corr = tc.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
Account length,1.0,-0.012463,0.024735,0.002918,-0.004628,0.006216,0.03847,0.006214,-0.006757,0.01926,-0.006745,-0.008955,-0.013176,-0.00896,0.009514,0.020661,0.009546,-0.003796,0.016541
Area code,-0.012463,1.0,0.048551,-0.000747,-0.001994,-0.008264,-0.009646,-0.008264,0.00358,-0.011886,0.003607,-0.005825,0.016522,-0.005845,-0.018288,-0.024179,-0.018395,0.027572,0.006174
International plan,0.024735,0.048551,1.0,0.006006,0.008745,0.049396,0.003755,0.049398,0.0191,0.006114,0.019106,-0.028905,0.012451,-0.028913,0.045871,0.017366,0.04578,-0.024522,0.259852
Voice mail plan,0.002918,-0.000747,0.006006,1.0,0.956927,-0.001684,-0.011086,-0.001686,0.021545,-0.006444,0.021559,0.006079,0.015553,0.006064,-0.001318,0.007618,-0.001276,-0.017824,-0.102148
Number vmail messages,-0.004628,-0.001994,0.008745,0.956927,1.0,0.000778,-0.009548,0.000776,0.017562,-0.005864,0.017578,0.007681,0.007123,0.007663,0.002856,0.013957,0.002884,-0.013263,-0.089728
Total day minutes,0.006216,-0.008264,0.049396,-0.001684,0.000778,1.0,0.00675,1.0,0.007043,0.015769,0.007029,0.004323,0.022972,0.0043,-0.010155,0.008033,-0.010092,-0.013423,0.205151
Total day calls,0.03847,-0.009646,0.003755,-0.011086,-0.009548,0.00675,1.0,0.006753,-0.021451,0.006462,-0.021449,0.022938,-0.019557,0.022927,0.021565,0.004574,0.021666,-0.018942,0.018459
Total day charge,0.006214,-0.008264,0.049398,-0.001686,0.000776,1.0,0.006753,1.0,0.00705,0.015769,0.007036,0.004324,0.022972,0.004301,-0.010157,0.008032,-0.010094,-0.013427,0.205151
Total eve minutes,-0.006757,0.00358,0.0191,0.021545,0.017562,0.007043,-0.021451,0.00705,1.0,-0.01143,1.0,-0.012584,0.007586,-0.012593,-0.011035,0.002541,-0.011067,-0.012985,0.092796
Total eve calls,0.01926,-0.011886,0.006114,-0.006444,-0.005864,0.015769,0.006462,0.015769,-0.01143,1.0,-0.011423,-0.002093,0.00771,-0.002056,0.008703,0.017434,0.008674,0.002423,0.009233


Из пар Total minutes и charge оставим только минуты (все по два столбца фактически повторяют друг друга), также удалим Number vmail messages, он сильно коррелирует с Voice mail plan.

In [None]:
newtc = tc.drop("Total day charge", axis = 1).drop("Total night charge", axis = 1).drop("Total intl charge", axis = 1).drop("Total eve charge", axis = 1).drop("Voice mail plan", axis = 1)

In [None]:
corr = newtc.corr()
corr.style.background_gradient(cmap='coolwarm')

NameError: ignored

Также удалим Area code и Account length, а также Total day calls, Total eve calls и Total night calls которые очень слабо коррелируют с целевой переменной.

In [None]:
newtc = newtc.drop("Area code", axis = 1).drop("Account length", axis = 1).drop("Total day calls", axis = 1).drop("Total eve calls", axis = 1).drop("Total night calls", axis = 1)

# **Задание C** #

In [None]:
X = newtc.drop("Churn", axis = 1)

X1 = pd.get_dummies(X["State"].values)
X = X1.join(X).drop("State", axis = 1)

In [None]:
from sklearn.pipeline import make_pipeline
pipeline2 = make_pipeline(StandardScaler(), PolynomialFeatures(2),
    GridSearchCV(LogisticRegression(max_iter=10000), param_grid={"C": [0.01, 0.1, 1, 10, 100]}, scoring='f1', cv=5))

pipeline2.fit(Xtrain, ytrain)

In [None]:
print("train:", classification_report(ytrain, pipeline.predict(Xtrain)), "\ntest:", classification_report(ytest, pipeline.predict(Xtest)))

train:               precision    recall  f1-score   support

           0       0.95      0.78      0.86      1993
           1       0.37      0.77      0.50       340

    accuracy                           0.78      2333
   macro avg       0.66      0.77      0.68      2333
weighted avg       0.87      0.78      0.81      2333
 
test:               precision    recall  f1-score   support

           0       0.95      0.75      0.84       857
           1       0.34      0.76      0.47       143

    accuracy                           0.76      1000
   macro avg       0.64      0.76      0.65      1000
weighted avg       0.86      0.76      0.79      1000



In [None]:
print("F1-score:", "\ntrain:", f(ytrain, pipeline2.predict(Xtrain)), "\ntest:", f(ytest, pipeline2.predict(Xtest)))
print("\n\nROC-AUC:", "\ntrain:", ra(ytrain, pipeline2.predict(Xtrain)), "\ntest:", ra(ytest, pipeline2.predict(Xtest)))

F1-score: 
train: 0.8813008130081301 
test: 0.5043478260869565


ROC-AUC: 
train: 0.8975258994716803 
test: 0.6858777162161059


Модель оооооочень сильно переобучилась, но! По сравнению с остальными моделями качество на тесте лучше)))

In [None]:
from sklearn.pipeline import make_pipeline
pipeline3 = make_pipeline(StandardScaler(), PolynomialFeatures(3),
    GridSearchCV(LogisticRegression(), param_grid={"C": [0.01, 0.1, 1, 10, 100]}, scoring='f1', cv=5))

pipeline3.fit(Xtrain, ytrain)

print("train:", classification_report(ytrain, pipeline3.predict(Xtrain)), "\ntest:", classification_report(ytest, pipeline3.predict(Xtest)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

train:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1993
           1       1.00      1.00      1.00       340

    accuracy                           1.00      2333
   macro avg       1.00      1.00      1.00      2333
weighted avg       1.00      1.00      1.00      2333
 
test:               precision    recall  f1-score   support

           0       0.90      0.93      0.91       857
           1       0.46      0.36      0.41       143

    accuracy                           0.85      1000
   macro avg       0.68      0.65      0.66      1000
weighted avg       0.83      0.85      0.84      1000



In [None]:
print("F1-score:", "\ntrain:", f(ytrain, pipeline3.predict(Xtrain)), "\ntest:", f(ytest, pipeline3.predict(Xtest)))
print("\n\nROC-AUC:", "\ntrain:", ra(ytrain, pipeline3.predict(Xtrain)), "\ntest:", ra(ytest, pipeline3.predict(Xtest)))

F1-score: 
train: 1.0 
test: 0.40625000000000006


ROC-AUC: 
train: 1.0 
test: 0.6462289169407023


Добавление признаков степени 3 только усилило переобучение и ухудшило качество на тесте.

# **Задание E** #

**a)** Вывод: обычная логистическая регрессия плохо решает задачу, SVC с линейным ядром спраляется куда лучше (f1 0.5 и 0.47 (на трейне и на тесте), а ROC-AUC так и вообще 0.77 и 0,76!). Интересная ситуация происходит при добавлении полиномиальных признаков степени 2: модель переобучается, качество на трейне взлетает, но на тесте оно примерно примерно такое же (чуть лучше относительно f1-score и средне хуже относительно ROC-AUC), как и при применении SVC.

**Логистическая регрессия:**

F1-score:
train: 0.39014373716632444
test: 0.3251231527093596


ROC-AUC:
train: 0.6266602225436085
test: 0.5996319899470425

**SVC:**

F1-score:
train: 0.5028790786948176
test: 0.4685466377440347


ROC-AUC:
train: 0.774907765414244
test: 0.755101957552366

**polynomial(2):**

F1-score:
train: 0.8813008130081301
test: 0.5043478260869565


ROC-AUC:
train: 0.8975258994716803
test: 0.6858777162161059




**b)** были добавлены полиномиальные признаки, удалены
  1) слабокоррелирующие с целевой переменной признаки Area code и Account length, а также Total day calls, Total eve calls и Total night calls
  2) сильнокррелирующие с другим признаком Total day charge, Total night charge, Total intl charge, Total eve charge, Voice mail plan.

**c)** Да. Переобучение практически отсутствует при прирменении метода SCV: качество f1 на трейне и тесте 0.5 и 0,47 соответственно, а ROC-AUC так и вообще 0.77 и 0.76.