# Customer Segmentation Data Pipeline

## 1. Data Ingestion

#### Read CSV

#### Validate schema (column names, types)

#### Log basic stats

In [4]:
import numpy as np
import pandas as pd

In [7]:
# Location of the raw churn export; a relative path, so it is resolved
# against the notebook's current working directory.
RAW_CSV_PATH = 'data/raw/Customer_churn4.csv'

# Load the raw customer data into a DataFrame for validation and cleaning.
raw_df = pd.read_csv(RAW_CSV_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/Customer_churn4.csv'

In [None]:
# Preview the first five rows (five is head()'s default row count).
raw_df.head()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
raw_df.info()

## 2. Data Validation and Profiling

### Check for:

#### Missing values

#### Duplicates

#### Inconsistent types

#### Unique values in categorical fields

In [None]:
# Per-column count of missing entries (isna is the same check as isnull).
raw_df.isna().sum()

### No missing values

In [None]:
# Count rows that repeat an earlier row across all columns (the first
# occurrence of each row is not counted as a duplicate).
raw_df.duplicated().sum()

### No duplicate values

### Value distribution in categorical features

In [None]:
# Report how many distinct values each object-dtype column holds.
object_columns = raw_df.select_dtypes(include='object').columns
for col in object_columns:
    distinct_count = raw_df[col].nunique()
    print(f'{col}, unique values: {distinct_count}')

### The TotalCharges feature is supposed to be a float data type, but it was read as object

## 3. Data Cleaning

### Ensure clean, consistent, structured data for downstream users.

#### Handle missing values

#### Convert types

#### Normalize inconsistent string formats

#### Drop or flag invalid rows

In [None]:
# Parse TotalCharges as numbers; errors='coerce' turns any value that cannot
# be parsed into NaN instead of raising.
raw_df['TotalCharges'] = pd.to_numeric(raw_df['TotalCharges'], errors='coerce')

### There are 11 missing values in the TotalCharges feature after the numeric conversion

In [None]:
# Number of missing TotalCharges entries.
raw_df['TotalCharges'].isna().sum()

### Filling missing values with 0

In [None]:
# Replace missing TotalCharges with 0. The result is assigned back to the
# column instead of calling fillna(..., inplace=True) on
# raw_df['TotalCharges']: that chained in-place form operates on an
# intermediate Series, raises a FutureWarning in pandas 2.2+, and leaves
# raw_df unchanged when Copy-on-Write is enabled (the pandas 3.0 default).
raw_df['TotalCharges'] = raw_df['TotalCharges'].fillna(0)

In [None]:
# Re-count missing TotalCharges entries; expected to be 0 once they are filled.
raw_df['TotalCharges'].isna().sum()

In [None]:
# Keep the first occurrence of each fully identical row. The result is bound
# to a new name; raw_df itself is not modified.
clean_df = raw_df.drop_duplicates()

## 4. Feature Standardization

### Prepare clean, encoded, and standardized features for downstream teams.

#### Encode binary values

#### Map categorical values

#### Normalize column names

In [None]:
# Rename columns: trim surrounding whitespace, title-case each name, and
# replace spaces with underscores.
# NOTE: str.title() lowercases every letter after the first in each word, so
# a camelCase name such as 'TotalCharges' becomes 'Totalcharges'.
stripped_names = clean_df.columns.str.strip()
clean_df.columns = stripped_names.str.title().str.replace(' ', '_')

# Convert Yes/No fields to 1/0; any value outside the map becomes NaN.
binary_map = {'Yes': 1, 'No': 0}
for binary_col in ('Partner', 'Dependents'):
    clean_df[binary_col] = clean_df[binary_col].map(binary_map)
