In [1]:
# !pip install psycopg2-binary sqlalchemy pandas

In [57]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
from google.colab import userdata
from pathlib import Path

In [58]:
# 1.Data load process, API Kaggle, Neon conection test

  # 1.1 API de Kaggle
os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')

  # 1.2 Data dowload
DATA_PATH = "/content/drive/MyDrive/[03]_proyectos_/GitHub/telco_customers_project/data"

!kaggle datasets download -d blastchar/telco-customer-churn -p {DATA_PATH} --unzip

print(f"✅ Dowload completed {DATA_PATH}")

# 1.3. Neon conection test
DB_URL = userdata.get('DB_URL')

try:
    engine = sqlalchemy.create_engine(DB_URL)
    with engine.connect() as conn:
        print("Connection successful")
except Exception as e:
    print(f"Conection error: {e}")

Dataset URL: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
License(s): copyright-authors
Downloading telco-customer-churn.zip to /content/drive/MyDrive/[03]_proyectos_/GitHub/telco_customers_project/data
  0% 0.00/172k [00:00<?, ?B/s]
100% 172k/172k [00:00<00:00, 24.0MB/s]
✅ Dowload completed /content/drive/MyDrive/[03]_proyectos_/GitHub/telco_customers_project/data
Connection successful


In [59]:
# Data load and preprocessing

def data_load_preprocessing(base_path, columns_numeric):
  """Load the first CSV found in a directory and apply basic preprocessing.

  Column names are lower-cased and their spaces replaced with "_". The
  columns listed in ``columns_numeric`` are then converted with
  ``pd.to_numeric(errors="coerce")``, so values that cannot be parsed
  become NaN. A summary (``info``, ``describe`` and per-column null counts)
  is printed.

  Args:
    base_path: Directory (str or Path) searched for ``*.csv`` files.
    columns_numeric: Iterable of column names, written in their normalized
      (lower-case, underscore) form, to convert to numeric.

  Returns:
    The preprocessed DataFrame, or None when the directory has no CSV file.

  Raises:
    KeyError: If a name in ``columns_numeric`` is not a column of the file.
  """
  # Sorted so the selected file is deterministic when several CSVs exist
  csv_files = sorted(Path(base_path).glob('*.csv'))
  if not csv_files:
    print("No .csv files in the directory")
    return None

  target_file = csv_files[0]
  print(f"File detected: {target_file.name}\n")

  # Data load
  df = pd.read_csv(target_file)

  # Normalize column names: lower case, blank spaces replaced with "_"
  df.columns = [c.lower().replace(" ", "_") for c in df.columns]

  # Fail early with an explicit message if a requested column does not exist
  columns_numeric = list(columns_numeric)
  missing = [col for col in columns_numeric if col not in df.columns]
  if missing:
    raise KeyError(f"Columns not found after normalization: {missing}")

  # Convert the requested columns to numeric; unparseable values become NaN
  for col in columns_numeric:
    df[col] = pd.to_numeric(df[col], errors="coerce")

  # info() prints directly and returns None, so it is called on its own
  # rather than interpolated into the f-string (which would print "None")
  df.info()
  print(f"\n Data description:\n{df.describe()}\n Null data to be addressed\n{df.isnull().sum()}\n")

  return df





In [60]:
# Bind the result to `df` so later cells that read `df` work on a fresh kernel,
# and reuse DATA_PATH (set in the download cell) instead of repeating the path.
df = data_load_preprocessing(DATA_PATH, ["monthlycharges", "totalcharges"])
df

File detected: WA_Fn-UseC_-Telco-Customer-Churn.csv

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerid        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   seniorcitizen     7043 non-null   int64  
 3   partner           7043 non-null   object 
 4   dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   phoneservice      7043 non-null   object 
 7   multiplelines     7043 non-null   object 
 8   internetservice   7043 non-null   object 
 9   onlinesecurity    7043 non-null   object 
 10  onlinebackup      7043 non-null   object 
 11  deviceprotection  7043 non-null   object 
 12  techsupport       7043 non-null   object 
 13  streamingtv       7043 non-null   object 
 14  streamingmovies   7043 non-null   object 
 15  contract          7043 non-null   ob

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [None]:
# Fill null values with 0 (numeric), preserving the original columns for future reference

def null_to_zero_or(columns_names, new_columns):
  for col in columns_names:
    if df[col].isnull():
