Финальный аналитический отчёт: отток клиентов
Цель: определить ключевые факторы, влияющие на отток клиентов, и подготовить данные для построения интерактивных дашбордов.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import psycopg2


df = pd.read_csv("../data/processed/clean_churn.csv")

df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
df['contract_lenght']=df['Contract'].map({'Month-to-month':1, 'One year':12, 'Two year':24})

df['has_security']=df['OnlineSecurity'].map({'Yes':1, 'No':0, 'No internet service':0})

df['has_techsupport']=df['TechSupport'].map({'Yes':1, 'No':0, 'No internet service':0})

df['services_count'] = df[['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']].replace({'Yes':1, 'No':0, 'No internet service':0}).sum(axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


  df['services_count'] = df[['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']].replace({'Yes':1, 'No':0, 'No internet service':0}).sum(axis=1)


In [3]:
import os
from sqlalchemy import create_engine, text
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

PGHOST = os.getenv("PGHOST", "localhost")
PGPORT = os.getenv("PGPORT", "5432")
PGDATABASE = os.getenv("PGDATABASE", "churn_database")
PGUSER = os.getenv("PGUSER", "churn_user")
PGPASSWORD = os.getenv("PGPASSWORD", "your_strong_password")

conn_str = f"postgresql+psycopg2://{PGUSER}:{PGPASSWORD}@{PGHOST}:{PGPORT}/{PGDATABASE}"

engine = create_engine(conn_str, pool_size=5, max_overflow=10)

df.to_sql('churn_data', con=engine, if_exists='replace', index=False, method='multi', chunksize=1000)

q = "SELECT * FROM churn_data LIMIT 2;"
sample = pd.read_sql_query(q, con=engine)
print(sample.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   

      MultipleLines InternetService OnlineSecurity  ...        Contract  \
0  No phone service             DSL             No  ...  Month-to-month   
1                No             DSL            Yes  ...        One year   

  PaperlessBilling     PaymentMethod MonthlyCharges TotalCharges Churn  \
0              Yes  Electronic check          29.85        29.85    No   
1               No      Mailed check          56.95      1889.50    No   

  contract_lenght has_security  has_techsupport  services_count  
0               1            0                0               1  
1              12            1                0               2  

[2 rows x 25 columns]


In [None]:
df.to_csv("../data/processed/churn_final.csv", index = False)