In [None]:
# Прогнозирование оттока клиентов - EDA

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np

# Config
# All paths hang off the parent of the current working directory, so the
# script is expected to be launched from a first-level subfolder of the
# project (presumably notebooks/ -- confirm against the repository layout).
BASE_DIR = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(BASE_DIR, 'data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv')
REPORT_DIR = os.path.join(BASE_DIR, 'reports')

# plt.savefig does not create missing directories; make sure the output
# folder exists before the first figure is written.
os.makedirs(REPORT_DIR, exist_ok=True)

# Load data
# Read the raw CSV and report its dimensions.
df = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {df.shape}")
df.head()

# Initial preprocessing
# Coerce TotalCharges to a numeric dtype (values that cannot be parsed become
# NaN), then keep only the rows where the conversion produced a number.
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors='coerce')
df = df[df['TotalCharges'].notna()]

# 1. Target distribution
# Pie chart showing the share of each Churn label, saved under REPORT_DIR.
churn_counts = df['Churn'].value_counts()
fig, ax = plt.subplots(figsize=(8, 5))
ax.pie(
    churn_counts,
    labels=churn_counts.index,
    autopct='%1.1f%%',
    colors=['#66b3ff', '#ff9999'],
    startangle=90,
)
ax.set_title('Customer Churn Distribution')
fig.savefig(f'{REPORT_DIR}/churn_distribution.png', bbox_inches='tight')
plt.show()

# 2. Numerical features analysis
# One box plot per numeric column, split by churn status, in a single row.
num_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, feature in zip(axes, num_features):
    # seaborn 0.13 deprecates `palette` without `hue` (removal announced for
    # 0.14). Mapping hue to the x variable keeps the per-category colours;
    # dodge=False keeps each box centred on its tick.
    sns.boxplot(x='Churn', y=feature, hue='Churn', data=df, ax=ax,
                palette='Set2', dodge=False)
    # The hue legend only repeats the x tick labels, so drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(f'{feature} by Churn Status')
plt.tight_layout()
plt.savefig(f'{REPORT_DIR}/numerical_features.png')
plt.show()

# 3. Categorical features analysis
# For each categorical column, draw the percentage of churned / retained
# customers within every category on a 2x2 grid of bar charts.
cat_features = ['Contract', 'InternetService', 'PaymentMethod', 'TechSupport']
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for ax, feature in zip(axes, cat_features):
    # Row-normalised crosstab: each category's Churn shares sum to 100 %.
    ct = pd.crosstab(df[feature], df['Churn'], normalize='index') * 100
    # Long format (one row per category / Churn pair) for seaborn.
    ct = ct.reset_index().melt(id_vars=feature, var_name='Churn', value_name='Percentage')

    # Plot
    sns.barplot(x=feature, y='Percentage', hue='Churn', data=ct, ax=ax, palette='viridis')
    ax.set_title(f'Churn Rate by {feature}')
    ax.set_ylabel('Percentage (%)')
    ax.tick_params(axis='x', rotation=45)
    ax.set_ylim(0, 100)

    # Add value labels above the bars. Iterate the bar containers rather than
    # ax.patches: ax.patches may also hold rectangles that are not data bars
    # (newer seaborn releases appear to add zero-size legend proxies there),
    # which would otherwise get a stray "0.0%" label.
    for container in ax.containers:
        for bar in container:
            bar_height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., bar_height + 2,
                    f'{bar_height:.1f}%', ha='center', fontsize=9)

plt.tight_layout()
plt.savefig(f'{REPORT_DIR}/categorical_features.png')
plt.show()

# 4. Correlation analysis
# Heatmap of the pairwise correlations between the numeric columns, with the
# colour scale pinned to the full [-1, 1] range.
corr_matrix = df[num_features].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Matrix')
fig.savefig(f'{REPORT_DIR}/correlation_matrix.png')
plt.show()