# **1. Perkenalan Dataset**

**Dataset**: Telco Customer Churn  
**Sumber**: IBM GitHub Repository  

**Deskripsi**: Dataset ini berisi informasi pelanggan perusahaan telekomunikasi. Tujuannya adalah memprediksi apakah pelanggan akan berhenti berlangganan (churn) atau tidak.

**Target**: Kolom 'Churn' (Yes/No)

# **2. Import Library**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
# Silence every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides deprecation/FutureWarning messages
# from pandas and scikit-learn, so API problems in later cells will not be reported.
warnings.filterwarnings('ignore')
# Confirmation message once all imports above have succeeded.
print('Library berhasil diimport!')

# **3. Memuat Dataset**

In [None]:
# Read the raw Telco churn CSV (path is relative to the current working
# directory) and report how many rows and columns were loaded.
df = pd.read_csv('../Telco-Customer-Churn.csv')
print(f'Dataset loaded: {len(df)} rows, {len(df.columns)} columns')

In [None]:
# Show the first 5 rows (head() returns 5 rows when called without arguments)
df.head()

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes
df.info()

# **4. Exploratory Data Analysis (EDA)**

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments describe() summarizes only the numeric columns.
df.describe()

In [None]:
# Count the null entries per column, then print either the affected
# columns with their counts or a note that nothing is missing.
missing = df.isna().sum()
print('Missing values:')
print(missing[missing > 0] if missing.sum() > 0 else 'No missing values')

In [None]:
# Class balance of the target column: absolute counts first, then percentages.
churn_counts = df['Churn'].value_counts()
churn_percent = df['Churn'].value_counts(normalize=True).mul(100)

print('Distribusi Churn:')
print(churn_counts)
print('\nPersentase:')
print(churn_percent)

In [None]:
# Bar chart of the Churn class counts, drawn on a fresh 8x5 figure.
plt.figure(figsize=(8, 5))
churn_ax = df['Churn'].value_counts().plot.bar(color=['green', 'red'])

# Title and axis labels are set on the axes returned by the pandas plot call.
churn_ax.set_title('Distribusi Customer Churn', fontsize=14, fontweight='bold')
churn_ax.set_xlabel('Churn')
churn_ax.set_ylabel('Count')

# Keep the category labels horizontal.
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Prepare numeric inputs for the correlation matrix:
#   - TotalCharges is parsed to numbers; entries that cannot be parsed become NaN
#   - Churn is mapped to 0/1 in a new helper column, Churn_Encoded
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
churn_to_int = {'No': 0, 'Yes': 1}
df['Churn_Encoded'] = df['Churn'].map(churn_to_int)

# Pairwise correlation between the numeric features and the encoded target
corr_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen', 'Churn_Encoded']
correlation = df.loc[:, corr_cols].corr()

# Annotated heatmap with a diverging palette centred on zero
heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, **heatmap_options)
plt.title('Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# **5. Data Preprocessing**

In [None]:
# Handle TotalCharges: convert to numeric, then fill the resulting NaN values.
# Entries that cannot be parsed as numbers are coerced to NaN by to_numeric.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Impute with the column median and assign the result back to the frame.
# Calling fillna(..., inplace=True) on df['TotalCharges'] is chained assignment:
# it may act on a temporary object and leave df unchanged (pandas Copy-on-Write),
# so the explicit assignment is the reliable form.
total_charges_median = df['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].fillna(total_charges_median)
print('TotalCharges processed')

In [None]:
# Build the modelling frame without the customer identifier and without
# the Churn_Encoded helper column; df itself is left untouched.
df_processed = df.drop(columns=['customerID', 'Churn_Encoded'])
print(f'Columns after drop: {len(df_processed.columns)}')

In [None]:
# Split the frame into the raw target column and the feature matrix.
y = df_processed['Churn']
X = df_processed.drop(columns='Churn')

# Convert the target labels to integer codes and print the label -> code mapping.
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)
target_mapping = dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))
print(f'Target encoding: {target_mapping}')

In [None]:
# Work on a copy so the unencoded feature frame X stays available.
X_encoded = X.copy()

# Every object-dtype column is treated as categorical.
categorical_cols = X_encoded.select_dtypes(include=['object']).columns
n_categorical = len(categorical_cols)

print(f'Encoding {n_categorical} categorical columns...')
for col in categorical_cols:
    # A fresh encoder per column: each column gets its own integer codes.
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col])

print(f'Encoded {n_categorical} categorical columns')

In [None]:
# Standardize the three continuous columns; all other columns are left as they are.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_encoded[numeric_features])
X_encoded[numeric_features] = scaled_values
print('Feature scaling completed')

In [None]:
# Save processed data: encoded/scaled features plus the encoded target
# in a single frame, written as CSV without the index column.
final_df = X_encoded.copy()
final_df['Churn'] = y_encoded
output_file = 'Telco-Customer-Churn-Processed.csv'
final_df.to_csv(output_file, index=False)
print(f'Data saved to: {output_file}')
print(f'Shape: {final_df.shape}')
# The heading previously contained mojibake (the UTF-8 bytes of U+1F4CB decoded
# as cp1252). The escape below keeps the source ASCII-only so the clipboard
# emoji survives any future re-encoding of this file.
print('\n\U0001F4CB Summary:')
print(final_df.head())