In [1]:
# pipenv install pandas plotly scikit-learn optuna ipywidgets ipykernel nbformat gradio

In [2]:
import pandas as pd

from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import recall_score

### Carga dos Dados

In [3]:
df_churn = pd.read_csv(r'.\datasets\churn_telecom.csv')

In [4]:
# Visualizar estrutura
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   IDCliente         7032 non-null   object 
 1   Genero            7032 non-null   object 
 2   Mais65anos        7032 non-null   int64  
 3   TemParceiro       7032 non-null   object 
 4   TemDependentes    7032 non-null   object 
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  tenure            7032 non-null   int64  
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [5]:
df_churn.head(10)

Unnamed: 0,IDCliente,Genero,Mais65anos,TemParceiro,TemDependentes,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,No,No phone service,DSL,No,Yes,...,No,No,No,1,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,Yes,No,DSL,Yes,No,...,No,No,No,34,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,Yes,No,DSL,Yes,Yes,...,No,No,No,2,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,No,No phone service,DSL,Yes,No,...,Yes,No,No,45,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,Yes,No,Fiber optic,No,No,...,No,No,No,2,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,Yes,Yes,Fiber optic,No,No,...,No,Yes,Yes,8,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,Yes,Yes,Fiber optic,No,Yes,...,No,Yes,No,22,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,No,No phone service,DSL,Yes,No,...,No,No,No,10,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,Yes,Yes,Fiber optic,No,No,...,Yes,Yes,Yes,28,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,Yes,No,DSL,Yes,Yes,...,No,No,No,62,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [6]:
# Contar clientes usando a variável churn como referência
df_churn.Churn.value_counts()

Churn
No     5163
Yes    1869
Name: count, dtype: int64

In [7]:
df_churn.Churn.value_counts(normalize=True)

Churn
No     0.734215
Yes    0.265785
Name: proportion, dtype: float64

### Preparação da Base para o algoritmo LOF

In [8]:
# Selecionar as clolunas para o algoritmo
X = df_churn.drop(columns=['IDCliente', 'Churn'], axis=1)
y = df_churn.Churn

In [9]:
# Definir uma função para transformar "yes" em 1 e "no" em 0
def binary_transformer_function(X):
    return X.map(lambda x: 1 if x == 'Yes' else 0)

In [None]:
# Transformações
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = ['Genero', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection','StreamingTV', 'StreamingMovies',
                        'Contract', 'PaymentMethod']
binary_features = ['TemParceiro', 'TemDependentes', 'TechSupport', 'PhoneService', 'PaperlessBilling']
no_transformation_features = ['Mais65anos']

# Criar Transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()
binary_transformer = FunctionTransformer(binary_transformer_function)

# Criar o preprocessor

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('bin', binary_transformer, binary_features),
    ('pass', 'passthrough', no_transformation_features)
])

# Transformar os dados
X_transformed = preprocessor.fit_transform(X)

In [13]:
X_transformed.shape


(7032, 39)

### Treinar o Algoritmo LOF

In [14]:
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.26)

In [15]:
# Treianr algoritmo e gerar classificacoes de anomalias para cada ponto de dados
y_pred = lof.fit_predict(X_transformed)

In [18]:
# Mostrar valores preditos (anomalia ou não anomalia)
# No sklearn o predict gera um valor -1 para anomalia e 1 para valores normais
y_pred

array([ 1,  1,  1, ...,  1, -1,  1], shape=(7032,))

In [19]:
# Mostrar o lof calculado para cada ponto de dados
# No sklearn, o LOF calculado fica na propriedade negative_outlier_factor_
# negative_outlier_factor_ é o inversor do LOF, Quanto menos mais anormal
-lof.negative_outlier_factor_

array([1.0238333 , 1.03547225, 1.02610568, ..., 1.07053634, 1.19840027,
       1.08901757], shape=(7032,))

### Apresentar Resultados

In [20]:
import numpy as np

# Identificar anomalias

outliers = y_pred == -1
inliers = y_pred == 1

# Contar anomalias e os pontos normais
num_outliers = np.sum(outliers)
num_inliers = np.sum(inliers)

# Apresentar estatisticas
print(f'Anomalias detectadas: {num_outliers}')
print(f'Pontos Normais: {num_inliers}')

Anomalias detectadas: 1829
Pontos Normais: 5203


In [24]:
# Converter y para a mesma base do y_pred
y_true = y.map(lambda x: -1 if x == 'Yes' else 1)


In [25]:
# Calcular Score com base no valor y (Churn real da base)
# Usar Recall, pois o objetivo principal é maximizar o TPR(True Positive Base)
recall_score(y_true, y_pred)

0.7515010652721286