In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.linear_model import LogisticRegression



In [26]:
df = pd.read_csv("Telco-Customer-Churn.csv")

# бинаризация 
binary_cols = ['customerID','gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
               'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
               'TotalCharges','Churn']

encoder = LabelEncoder()
for i in binary_cols:
    df[i] = encoder.fit_transform(df[i])

    
# del ne nuzhnoe 
df.drop(['customerID'], axis=1, inplace=True)

# замена пропущенных значений
df['TotalCharges'] = df['TotalCharges'].replace(' ', pd.NaT)
df['TotalCharges'] = df['TotalCharges'].replace('', pd.NaT)

# из объекта в числовую
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(inplace=True)


In [27]:
# стандартизация 
scaler = StandardScaler()
df.iloc[:, 1:-1] = scaler.fit_transform(df.iloc[:, 1:-1])

# препроцессинг 


selector = SelectKBest(f_classif, k=5)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support(indices=True)]
# print(selected_features)

#обучающая и тестовая выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# print(classification_report(y, y_pred))
# оценка метрик качества до препроцессинга
lr_before = LogisticRegression()
lr_before.fit(X_train, y_train)
y_pred_before = lr_before.predict(X_test)
accuracy_before = accuracy_score(y_test, y_pred_before)
recall_before = recall_score(y_test, y_pred_before)
f1_before = f1_score(y_test, y_pred_before)
print('До препроцессинга: accuracy = {:.2f}, recall = {:.2f}, F1-score = {:.2f}'.format(accuracy_before, recall_before, f1_before))


До препроцессинга: accuracy = 0.81, recall = 0.57, F1-score = 0.62


In [32]:





# обучение м
models = [
    LogisticRegression(),
    RandomForestClassifier()
]
for i in models:
    i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{type(i).__name__}: {accuracy}')


# выбор K 
selector = SelectKBest(f_classif, k=5)
selector

# обучение модели 
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)


y_pred_after = rfc.predict(X_test)
accuracy_after = accuracy_score(y_test, y_pred_after)
recall_after = recall_score(y_test, y_pred_after)
f1_after = f1_score(y_test, y_pred_after)
print('Posle препроцессинга: accuracy = {:.2f}, recall = {:.2f}, F1-score = {:.2f}'.format(accuracy_after, recall_after, f1_after))


# определение важности признаков
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for i in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (i + 1, indices[i], importances[indices[i]]))


LogisticRegression: 0.8140525195173882
RandomForestClassifier: 0.7991483321504613
Posle препроцессинга: accuracy = 0.80, recall = 0.49, F1-score = 0.56
Feature ranking:
1. feature 17 (0.176539)
2. feature 4 (0.174955)
3. feature 18 (0.167721)
4. feature 14 (0.081976)
5. feature 16 (0.051153)
6. feature 8 (0.048006)
7. feature 11 (0.043074)
8. feature 9 (0.027847)
9. feature 0 (0.027486)
10. feature 15 (0.026085)
11. feature 7 (0.024514)
12. feature 2 (0.024310)
13. feature 6 (0.022742)
14. feature 10 (0.021655)
15. feature 3 (0.020808)
16. feature 1 (0.020452)
17. feature 13 (0.018083)
18. feature 12 (0.017067)
19. feature 5 (0.005527)


In [30]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,-0.439916,1.03453,-0.654012,-1.277445,-3.05401,0.062723,-1.183234,-0.918838,1.24255,-1.02791,-0.925262,-1.113495,-1.121405,-0.828207,0.829798,0.398558,-1.131766,-0.398608,0
1,1,-0.439916,-0.966622,-0.654012,0.066327,0.327438,-0.991588,-1.183234,1.407321,-1.029919,1.245111,-0.925262,-1.113495,-1.121405,0.371271,-1.205113,1.334863,-0.38774,-0.948762,0
2,1,-0.439916,-0.966622,-0.654012,-1.236724,0.327438,-0.991588,-1.183234,1.407321,1.24255,-1.02791,-0.925262,-1.113495,-1.121405,-0.828207,0.829798,1.334863,-0.517317,-1.641883,1
3,1,-0.439916,-0.966622,-0.654012,0.514251,-3.05401,0.062723,-1.183234,1.407321,-1.029919,1.245111,1.396299,-1.113495,-1.121405,0.371271,-1.205113,-1.474052,-0.872611,-0.98371,0
4,0,-0.439916,-0.966622,-0.654012,-1.236724,0.327438,-0.991588,0.17225,-0.918838,-1.029919,-1.02791,-0.925262,-1.113495,-1.121405,-0.828207,0.829798,0.398558,0.095041,-1.235224,1


In [33]:
топ-4 наиболее важных признаков для прогнозирования оттока клиентов в наборе данных Telco-Customer-Churn могут быть:

tenure (количество месяцев, в течение которых клиент был подключен)
TotalCharges (общая сумма, которую клиент заплатил за услуги)
MonthlyCharges (ежемесячная плата клиента за услуги)
Contract (тип контракта, заключенного с клиентом)

SyntaxError: invalid syntax (1727176089.py, line 1)