<a href="https://colab.research.google.com/github/AyushmanRaha/BizLens-Analytics/blob/main/BizLens_Analytics_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Basic Setup and Exploration**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import requests
import zipfile
import gzip

In [None]:
# Show every column when printing a DataFrame and allow up to 1000 characters
# per output line before pandas wraps the table.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

**Stores all the target columns identified.**

In [None]:
# Module-level list reserved for target column names; it starts out empty.
target_column_list = []

# **Helper Functions**

**1) load_data() - For loading the data**

In [None]:
def load_data(file_path, sheet_name=None):
  """Load a CSV or Excel file (plain, zipped or gzipped) into a DataFrame.

  Args:
    file_path: Local path or http(s) URL as a string. The file type is
      chosen from the (case-insensitive) extension: .zip, .gz, .csv,
      .xls or .xlsx.
    sheet_name: Sheet to read for Excel inputs. ``None`` (the default)
      reads the first sheet, so a single DataFrame is always returned
      rather than pandas' dict of every sheet.

  Returns:
    The loaded DataFrame, or ``None`` if reading/parsing failed (the
    error is printed).

  Raises:
    requests.RequestException: if a URL download fails or times out; the
      download happens before the best-effort parsing section.
  """
  supported_in_zip = ('.csv', '.xls', '.xlsx')

  # pandas interprets sheet_name=None as "all sheets" and returns a dict;
  # callers of this helper expect a DataFrame, so default to the first sheet.
  excel_sheet = 0 if sheet_name is None else sheet_name

  def is_url(path):
    return path.startswith(("http://", "https://", "ftp://"))

  if is_url(file_path):
    # NOTE(review): requests only ships http/https adapters by default, so
    # ftp:// URLs will likely fail here - confirm whether ftp is needed.
    # The timeout stops an unresponsive server from hanging the notebook.
    response = requests.get(file_path, timeout=60)
    response.raise_for_status()
    content = io.BytesIO(response.content)
  else:
    content = file_path

  file_lower = file_path.lower()

  try:
    if file_lower.endswith('.zip'):
      with zipfile.ZipFile(content, 'r') as z:
        # Take the first member that is actually loadable instead of blindly
        # using entry 0, which may be a directory, a README or OS metadata.
        file_inside = next(
            (name for name in z.namelist()
             if name.lower().endswith(supported_in_zip)),
            None,
        )
        if file_inside is None:
          raise ValueError("Unsupported file inside ZIP.")
        with z.open(file_inside) as f:
          if file_inside.lower().endswith('.csv'):
            df = pd.read_csv(f)
          else:
            df = pd.read_excel(f, sheet_name=excel_sheet)

    elif file_lower.endswith('.gz'):
      # A .gz input is always treated as gzip-compressed CSV text.
      with gzip.open(content, 'rt') as f:
        df = pd.read_csv(f)

    elif file_lower.endswith('.csv'):
      df = pd.read_csv(content)

    elif file_lower.endswith(('.xls', '.xlsx')):
      df = pd.read_excel(content, sheet_name=excel_sheet)

    else:
      raise ValueError("Unsupported file type. Provide a CSV, Excel, ZIP, or GZ file.")

    print(f"Succesfully loaded data from: {file_path}")
    return df

  except Exception as e:
    # Deliberate best-effort: report the problem and let the caller check
    # for None instead of aborting the notebook run.
    print(f"Error loading data from {file_path}: {e}")
    return None

**2) initial_exploration() - For initial analysis**

In [None]:
def initial_exploration(df: pd.DataFrame):
  """Print a quick overview of a DataFrame.

  Prints, in order: the shape, the ``df.info()`` report, the transposed
  ``describe(include='all')`` summary and the first rows from ``head()``.
  Everything is written to standard output; nothing is returned.

  Args:
    df: The DataFrame to describe.
  """
  separator = "="*80
  print("\n" + separator)
  # The section banners previously contained mis-decoded bytes; these are
  # the emoji those byte sequences encode.
  print("🔍 DATAFRAME OVERVIEW")
  print(separator)
  print(f"Shape: {df.shape[0]} rows x {df.shape[1]} columns")
  print(separator)
  print("\n📋 INFO")
  print(separator)
  df.info()  # info() prints its report itself and returns None
  print("\n📊 SUMMARY STATISTICS")
  print(separator)
  # Transposed so each original column becomes one row of statistics.
  print(df.describe(include='all').T)
  print("\n👀 SAMPLE ROWS (HEAD)")
  print(separator)
  print(df.head())
  print("\n✅ END OF EXPLORATION")
  print(separator)

**3) clean_and_correct_data() - Drops unwanted columns, converts the chosen columns to numeric, and fills empty or non-numeric values in the DataFrame**

In [None]:
def clean_and_correct_data(df, dropped_columns=None, numeric_columns=None, fill_value=0, verbose=True):
  """Return a cleaned copy of ``df`` with normalised names and numeric columns.

  Steps, applied to a copy (the input is not modified):
    1. Column names are stripped and lower-cased.
    2. ``dropped_columns`` are removed.
    3. Each numeric column is coerced with ``pd.to_numeric(errors='coerce')``
       and the resulting missing values are replaced by ``fill_value``.

  Args:
    df: Source DataFrame.
    dropped_columns: Column names to drop (matched after strip/lower-case).
      ``None`` drops nothing.
    numeric_columns: Column names to coerce to numeric. ``None`` auto-detects
      the columns that already have a numeric dtype.
    fill_value: Replacement for values that are missing or not parseable as
      numbers in the numeric columns.
    verbose: When true, print a summary of the cleaning.

  Returns:
    The cleaned DataFrame.

  Raises:
    KeyError: if a name in ``dropped_columns`` or an explicitly requested
      numeric column does not exist in the DataFrame.
  """
  # Work on a copy so the caller's DataFrame is left untouched.
  df1 = df.copy()

  # str() keeps non-string labels (e.g. integer column names) from crashing.
  df1.columns = [str(c).strip().lower() for c in df1.columns]

  if dropped_columns is not None:
    dropped_columns_list = [str(c).strip().lower() for c in dropped_columns]
  else:
    dropped_columns_list = []

  if dropped_columns_list:
    df1 = df1.drop(dropped_columns_list, axis=1)

  if numeric_columns is not None:
    # A column that was just dropped cannot be converted; leave it out
    # instead of failing with a KeyError below.
    numeric_columns_list = [
        name for name in (str(c).strip().lower() for c in numeric_columns)
        if name not in dropped_columns_list
    ]
  else:
    # Detected after the drop, so dropped columns are never included.
    numeric_columns_list = df1.select_dtypes(include='number').columns.tolist()
    if verbose:
      print(f"Auto-detected possible numeric columns: {numeric_columns_list}")

  # Unparseable values become NaN through errors='coerce' and are then filled.
  for col in numeric_columns_list:
    df1[col] = pd.to_numeric(df1[col], errors='coerce')
    df1[col] = df1[col].fillna(fill_value)

  null_summary = df1.isnull().sum()
  has_null = df1.isnull().any().any()

  if verbose:
    print("\nData Cleaning Summary:")
    print("-" * 50)
    print("Dropped Columns:")
    if dropped_columns:
      # Names are echoed exactly as the caller supplied them.
      for position, col in enumerate(dropped_columns, start=1):
        print(f"{position}) {col}")
    else:
      print("(none)")
    print("-" * 50)
    print(f"Null values remaining in the DataFrame: {has_null}")
    print("-" * 50)
    print(f"Shape: {df1.shape[0]} rows x {df1.shape[1]} columns")
    if has_null:
      print("\nColumns with remaining nulls:\n", null_summary[null_summary > 0])
    print("-" * 50)
    print(f"Updated Column list:")
    for position, col in enumerate(df1.columns.tolist(), start=1):
      print(f"{position}) {col}, dtype: {df1[col].dtype}")
    print("-" * 50)

  return df1

**4) create_churn_count_dictionaries() - Creates the zero-initialised nested dictionaries that hold the churn counts for each categorical column.**

In [None]:
def _empty_churn_counter(base):
    """Build one zeroed gender -> seniority -> answer -> outcome counter tree.

    Each leaf is a single-entry dict whose key spells out the full path,
    e.g. ``"<base>_Yes_male_senior_churn_count"``, with a starting value of 0.
    """
    return {
        gender: {
            seniority: {
                answer: {
                    outcome: {f"{base}_{answer}_{gender}_{seniority}_{outcome}_count": 0}
                    for outcome in ("churn", "not_churn")
                }
                for answer in ("Yes", "No")
            }
            for seniority in ("senior", "non_senior")
        }
        for gender in ("male", "female")
    }


def create_churn_count_dictionaries(df, verbose=True):
    """Create an empty churn counter for every Yes/No-style column of ``df``.

    A column qualifies when it has object dtype and contains the exact value
    ``"No"``; the ``'churn'`` column itself is excluded. For each qualifying
    column a nested dictionary of zeroed counters is created and stored under
    the key ``"<column lower-cased>ChurnCount"``.

    Args:
        df: DataFrame to inspect.
        verbose: When true, print how many dictionaries were created and
            their names.

    Returns:
        dict mapping ``"<column>ChurnCount"`` to its nested counter dictionary.
    """
    # Only object columns that actually contain a literal "No" are targets;
    # 'churn' is the outcome being counted, so it never gets its own counter.
    column_list = [
        col for col in df.select_dtypes(include=['object']).columns
        if (df[col] == "No").any() and col != 'churn'
    ]

    log_of_dictionaries = {
        f"{col.lower()}ChurnCount": _empty_churn_counter(col.lower())
        for col in column_list
    }

    if verbose:
        print(f"Number of non-numeric dictionaries created: {len(log_of_dictionaries)}")
        print("Dictionary names:")
        print("-" * 50)
        for idx, name in enumerate(log_of_dictionaries, start=1):
            print(f"{idx}) {name}")
        print("-" * 50)
        print(f"Type: {type(log_of_dictionaries)}")

    return log_of_dictionaries

**5) calculate_dictionary_churn_count() - Tallies the churn counts for the specified target column**

In [None]:
def calculate_dictionary_churn_count(df, dictionaries_list, target_column=None):
    """Tally churn counts for one target column into its nested counter.

    Every row is classified by gender, seniority (``seniorcitizen == 1``),
    the target column's answer and whether ``churn == 'Yes'``, and the
    matching leaf counter is incremented by one.

    The counter is updated in place and counts accumulate: calling this twice
    with the same data doubles the totals.

    Args:
        df: DataFrame containing the columns ``seniorcitizen``, ``gender``,
            ``churn`` and the target column.
        dictionaries_list: Mapping that holds the counter under the key
            ``"<target column lower-cased>ChurnCount"``.
        target_column: Name of the column to tally (stripped and lower-cased
            before use).

    Returns:
        The updated nested counter dictionary for the target column.

    Raises:
        ValueError: if ``target_column`` is not provided.
        KeyError: if a mandatory column is missing from ``df`` or the
            counter is missing from ``dictionaries_list``.
    """
    if target_column is None:
        raise ValueError("target_column must be provided.")

    target_column_lower = target_column.strip().lower()
    mandatory_columns = ['seniorcitizen', 'gender', 'churn', target_column_lower]

    missing = [col for col in mandatory_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Mandatory columns missing from DataFrame: {', '.join(missing)}")

    dict_key = f"{target_column_lower}ChurnCount"
    if dict_key not in dictionaries_list:
        raise KeyError(f"Dictionary '{dict_key}' not found in dictionaries_list. "
                       "Ensure create_churn_count_dictionaries() was run correctly.")

    dictionary = dictionaries_list[dict_key]
    base = target_column_lower
    skipped_rows = 0

    # Iterate the four needed columns directly. Attribute access on
    # itertuples() rows fails for column names that are not valid Python
    # identifiers (e.g. names containing spaces).
    rows = zip(df['seniorcitizen'], df['gender'], df['churn'], df[target_column_lower])

    for seniority_val, gender_val, churn_val, target_val in rows:

        # Missing values (NaN/None) or other non-text entries cannot be
        # classified; skip them instead of crashing on .lower() / "in".
        if not isinstance(gender_val, str) or not isinstance(target_val, str):
            skipped_rows += 1
            continue

        gender_key = gender_val.lower()
        seniority_key = "senior" if seniority_val == 1 else "non_senior"
        churn_key = "churn" if churn_val == 'Yes' else "not_churn"

        # Any value without "No" in it (including "Yes" itself) counts as
        # "Yes"; values starting with "No" (e.g. "No phone service") count as
        # "No"; anything else containing "No" is ignored.
        if "No" not in target_val:
            target_key = "Yes"
        elif target_val.startswith('No'):
            target_key = "No"
        else:
            continue

        key_string = f"{base}_{target_key}_{gender_key}_{seniority_key}_{churn_key}_count"

        try:
            dictionary[gender_key][seniority_key][target_key][churn_key][key_string] += 1
        except KeyError:
            # E.g. a gender value for which the counter has no branch.
            print(f"Warning: Skipping row. Could not find key path for: {key_string}")

    if skipped_rows:
        print(f"Warning: Skipped {skipped_rows} row(s) with missing or non-text "
              f"'gender' or '{target_column_lower}' values.")

    return dictionary

# **Initial exploration and Data Cleaning/Corrections**

**Loading the data**

In [None]:
# Download the customer CSV; load_data() prints the error and returns None
# instead of a DataFrame if the file cannot be parsed.
df = load_data("https://raw.githubusercontent.com/AyushmanRaha/customer_churn_prediction/refs/heads/main/data/customer_data.csv")

Succesfully loaded data from: https://raw.githubusercontent.com/AyushmanRaha/customer_churn_prediction/refs/heads/main/data/customer_data.csv


**Calling the initial_exploration() function on the dataFrame.**

In [None]:
# Print the shape, info(), summary statistics and first rows of the raw data.
initial_exploration(df)


üîç DATAFRAME OVERVIEW
Shape: 7043 rows x 21 columns

üìã INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 

**Creating the modified dataFrame**

In [None]:
# Drop the customer identifier and coerce the charge/tenure columns to numeric;
# values that cannot be parsed are replaced by the default fill_value of 0.
df = clean_and_correct_data(df, dropped_columns=['customerid'], numeric_columns=['totalcharges', 'monthlycharges', "tenure"])


Data Cleaning Summary:
--------------------------------------------------
Dropped Columns:
1) customerid
--------------------------------------------------
Null values remaining in the DataFrame: False
--------------------------------------------------
Shape: 7043 rows x 20 columns
--------------------------------------------------
Updated Column list:
1) gender, dtype: object
2) seniorcitizen, dtype: int64
3) partner, dtype: object
4) dependents, dtype: object
5) tenure, dtype: int64
6) phoneservice, dtype: object
7) multiplelines, dtype: object
8) internetservice, dtype: object
9) onlinesecurity, dtype: object
10) onlinebackup, dtype: object
11) deviceprotection, dtype: object
12) techsupport, dtype: object
13) streamingtv, dtype: object
14) streamingmovies, dtype: object
15) contract, dtype: object
16) paperlessbilling, dtype: object
17) paymentmethod, dtype: object
18) monthlycharges, dtype: float64
19) totalcharges, dtype: float64
20) churn, dtype: object
------------------------

In [None]:
df.head()  # Preview the first 5 rows of the DataFrame after cleaning

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


**Iterating through each column of the DataFrame**

In [None]:
# Percentage distribution of the values in every column, one section each.
for col_name in df.columns:
    heading = f"Value Counts for '{col_name}':\n"
    print(heading)

    percentages = df[col_name].value_counts(normalize=True) * 100
    print(percentages)

    # The rule is as wide as the heading text around the column name.
    rule_width = len(col_name) + 24
    print("-" * rule_width)
    print(" \n ")

Value Counts for 'gender':

gender
Male      50.47565
Female    49.52435
Name: proportion, dtype: float64
------------------------------
 
 
Value Counts for 'seniorcitizen':

seniorcitizen
0    83.785319
1    16.214681
Name: proportion, dtype: float64
-------------------------------------
 
 
Value Counts for 'partner':

partner
No     51.69672
Yes    48.30328
Name: proportion, dtype: float64
-------------------------------
 
 
Value Counts for 'dependents':

dependents
No     70.041176
Yes    29.958824
Name: proportion, dtype: float64
----------------------------------
 
 
Value Counts for 'tenure':

tenure
1     8.703677
72    5.139855
2     3.379242
3     2.839699
4     2.498935
        ...   
28    0.809314
39    0.795116
44    0.724123
36    0.709925
0     0.156183
Name: proportion, Length: 73, dtype: float64
------------------------------
 
 
Value Counts for 'phoneservice':

phoneservice
Yes    90.316626
No      9.683374
Name: proportion, dtype: float64
------------------------

**Creating all the dictionaries required to store churn count as per various columns.**

In [None]:
# One zero-initialised nested counter per object column that contains the
# value 'No' (the 'churn' column itself is excluded).
target_column_churn_count_dictionaries = create_churn_count_dictionaries(df)

Number of non-numeric dictionaries created: 12
Dictionary names:
--------------------------------------------------
1) partnerChurnCount
2) dependentsChurnCount
3) phoneserviceChurnCount
4) multiplelinesChurnCount
5) internetserviceChurnCount
6) onlinesecurityChurnCount
7) onlinebackupChurnCount
8) deviceprotectionChurnCount
9) techsupportChurnCount
10) streamingtvChurnCount
11) streamingmoviesChurnCount
12) paperlessbillingChurnCount
--------------------------------------------------
Type: <class 'dict'>


**Calculating churn for each of the above mentioned dictionaries.**

In [None]:
def calculate_churn():
  """Tally churn counts for every dictionary in target_column_churn_count_dictionaries.

  The columns are derived from the dictionary names rather than from the
  module-level target_column_list: that list is never filled in (the
  assignment inside create_churn_count_dictionaries() is local), so looping
  over it would tally nothing.

  Reads the module-level ``df`` and updates the counters in place; nothing
  is returned. Counts accumulate, so re-running this without recreating the
  dictionaries adds to the existing totals.
  """
  suffix = "ChurnCount"
  for dictionary_name in target_column_churn_count_dictionaries:
    # Dictionary names are "<column>ChurnCount"; recover the column name.
    if dictionary_name.endswith(suffix):
      column = dictionary_name[:-len(suffix)]
    else:
      column = dictionary_name
    calculate_dictionary_churn_count(df, target_column_churn_count_dictionaries, target_column=column)

In [None]:
# Run the tallying pass; any counts are written in place into
# target_column_churn_count_dictionaries and nothing is returned.
calculate_churn()

# **Exploratory Data Analysis**

EDA 1: Identify high risk "exit points" based on how customers are billed.

EDA 2: Determine the "Danger Zone" - the specific time frame and price point where customers are most likely to drop off.

EDA 3: Quantify how "bundled services" affect customer loyalty.