# Doctoralia Assessment

# Questions

# Table of Contents

1. [How many facilities do we have?](#1-how-many-facilities-do-we-have)
2. [How many premium facilities do we have?](#2-how-many-premium-facilities-do-we-have)
3. [What’s the facilities distribution by size?](#3-whats-the-facilities-distribution-by-size)
4. [How many valid phones do we have?](#4-how-many-valid-phones-do-we-have)
5. [What’s the state with the most premium facilities as of today?](#5-whats-the-state-with-the-most-premium-facilities-as-of-today)
6. [How many facilities have churned since June 2022?](#6-how-many-facilities-have-churned-since-june-2022)
7. [What’s the top 3 states with the highest churn percentage?](#7-whats-the-top-3-states-with-the-highest-churn-percentage)
8. [What’s the churn probability for facilities in its 5th month (at national level)?](#8-whats-the-churn-probability-for-facilities-in-its-5th-month-at-national-level)
9. [What’s the top 3 states with the most invalid phone number proportion?](#9-whats-the-top-3-states-with-the-most-invalid-phone-number-proportion)
10. [What’s the average facilities lifespan?](#10-whats-the-average-facilities-lifespan)
11. [What’s the top 3 states with the biggest facilities?](#11-whats-the-top-3-states-with-the-biggest-facilities)
12. [Do we have duplicated phone numbers?](#12-do-we-have-duplicated-phone-numbers)
13. [What’s the top 3 valid duplicated numbers?](#13-whats-the-top-3-valid-duplicated-numbers)
14. [What’s the top 3 states with the most valid duplicated numbers?](#14-whats-the-top-3-states-with-the-most-valid-duplicated-numbers)
15. [What is the relation between valid/invalid phones with churn propensity?](#15-what-is-the-relation-between-validinvalid-phones-with-churn-propensity)


# Data Acquisition

In [53]:
import pandas as pd
import numpy as np
import logging
from datetime import datetime
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [54]:
from pathlib import Path

# Layout of every log record: timestamp and level on the first line, the
# message on its own line, then a blank separator line.
log_format = '%(asctime)s %(levelname)s: \n%(message)s\n'

# The file handler installed by basicConfig opens the log file right away and
# raises FileNotFoundError when the parent folder is missing, so create it first.
Path("logs").mkdir(parents=True, exist_ok=True)

# Configure the logger with the custom format
logging.basicConfig(filename="logs/LoggerDoctoralia.log",
                    level=logging.INFO,
                    format=log_format)

In [None]:
# Load the assessment dataset (CSV, path relative to the working directory) into the DataFrame used by every later cell
df = pd.read_csv("data/CS_Ops_Assessment_dataset_JuanReyes.csv")

In [None]:
# Print the column names, dtypes and non-null counts of the raw data
df.info()

In [None]:
# Preview the first rows of the raw data
df.head()

In [None]:
# Preview the last rows of the raw data
df.tail()

In [None]:
def _strip_if_text(cell):
    """Return *cell* without surrounding whitespace when it is a string; otherwise return it unchanged."""
    return cell.strip() if isinstance(cell, str) else cell


# Remove leading/trailing whitespace from the column names ...
df.columns = df.columns.str.strip()
# ... and from every string cell of the frame (non-string cells are left alone)
df = df.map(_strip_if_text)

In [None]:
# Show complete frames in cell output: no limit on displayed rows or columns,
# no wrapping of wide frames, and no shortening of long cell values.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping in the display
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Collect the distinct values of each column of interest, then print them in one report
unique_categories = df['facility_category'].unique()
unique_sizes = df['facility_size'].unique()
unique_states = df['state'].unique()
unique_cities = df['city'].unique()
unique_premium_flags = df["is_premium"].unique()

print(f"""
Unique Facility Categories:
{unique_categories}

Unique Facility Sizes:
{unique_sizes}

Unique States:
{unique_states}

Unique Cities:
{unique_cities}

Unique Is_premium:
{unique_premium_flags}
""")


# Initial Data Analysis (IDA)

In [None]:
# Treat the literal marker 'na' as missing data. The \b word boundaries keep the
# pattern from matching 'na' when it is only part of a longer word (e.g. 'nan').
df = df.replace(to_replace=r'\bna\b', value=np.nan, regex=True)

In [None]:
# Turn raw date cells into timestamps; missing or unparseable cells become NaT
def parse_date(date_str):
    """Convert one raw date cell to a pandas timestamp.

    Args:
        date_str: Cell value, expected in ``MM/DD/YYYY`` form, or a missing value.

    Returns:
        The parsed timestamp, or ``pd.NaT`` when the cell is missing or does
        not match the expected format (a message is printed in that case).
    """
    if not pd.isna(date_str):
        try:
            return pd.to_datetime(date_str, format="%m/%d/%Y")
        except ValueError:
            # Report the offending value and fall through to NaT
            print(f"Failed to parse '{date_str}'")
    return pd.NaT

# Parse both date columns cell by cell with parse_date
for date_column in ('churn_since', 'premium_since'):
    df[date_column] = df[date_column].apply(parse_date)

In [None]:
# Nullable integer dtype: keeps missing values (NaN) while storing the flag as integers
df['is_churn'] = df['is_churn'].astype('Int64')

In [None]:
# Store the premium flag as a plain integer column
df['is_premium'] = df['is_premium'].astype(int)

In [None]:
# Cast to int first (the author notes that leaving the column as float causes
# mistakes), then to str so that digit counts and prefixes can be inspected.
df['phone'] = df['phone'].astype(int).astype(str)

In [None]:
# Re-check dtypes and non-null counts after the type conversions
df.info()

# EDA

In [None]:
# Summary statistics of the frame
df.describe()

# 1. How many facilities do we have?

In [None]:
# facility_id is assumed to be the primary key, so the number of facilities
# equals the number of distinct, non-missing ids.
distinct_facility_ids = df["facility_id"].dropna().unique()
total_facilities = len(distinct_facility_ids)

print(total_facilities)


# 2. How many premium facilities do we have?

In [None]:
# Approach 1: look up the frequency of the flag value 1 (0 when no facility is premium)
premium_flag_counts = df['is_premium'].value_counts()
total_premium_facilities = premium_flag_counts.get(1, 0)

# Approach 2: count the rows selected by an explicit filter
total_premium_facilities_query = len(df.query('is_premium == 1'))


In [None]:
# Both approaches should report the same number, one per line
print(total_premium_facilities, total_premium_facilities_query, sep="\n")

# 3. What’s the facilities distribution by size?

In [None]:
# Number of facilities per size category (value_counts gives the same tally a groupby/count would)
facility_size_distribution = df['facility_size'].value_counts()
print(facility_size_distribution)


# 4. How many valid phones do we have?


In [57]:

# Every valid phone number in MX has exactly 10 digits: https://telmex.com/10digitos
# Work on a copy so that columns added later do not touch df.
ten_digit_mask = df['phone'].str.len().eq(10)
df_ten_digits_phones = df[ten_digit_mask].copy()


#Surprisingly hard to find a reliable source for a complete list of LADA. Got it from https://www.bajaregroup.com/pdf/mexican_area_codes.pdf

with open('data/TodasLadaMexico.md', 'r', encoding='utf-8') as file:
    md_content = file.read()

lines = md_content.strip().splitlines()

# Each usable line looks like "<city with state>. <lada>"; lines without the
# '. ' separator are skipped.
data = []
for line in lines:
    # Drop trailing whitespace first so it cannot end up inside the code,
    # then split on the last occurrence of '. '
    parts = line.rstrip().rsplit('. ', 1)
    if len(parts) == 2:
        city_with_state, lada = parts
        data.append({'city_with_state': city_with_state.strip(), 'lada': lada.strip()})

# Naming the columns explicitly keeps the 'lada' column available even when
# no line could be parsed.
df_lada = pd.DataFrame(data, columns=['city_with_state', 'lada'])

df_lada['lada'] = df_lada['lada'].astype(str)

# Set of known LADA (area) codes, used for prefix membership checks
lada_numbers = set(df_lada['lada'])

def check_lada_vectorized(phone_numbers):
    """Flag the phone numbers that start with a known LADA code.

    Args:
        phone_numbers: pandas Series of phone numbers stored as strings.

    Returns:
        Boolean Series, True where the first two or the first three
        characters of the phone are found in ``lada_numbers``.
    """
    digits = phone_numbers.str
    return digits[:2].isin(lada_numbers) | digits[:3].isin(lada_numbers)

# Flag every 10-digit phone whose prefix is a known LADA code
df_ten_digits_phones['lada_check'] = check_lada_vectorized(df_ten_digits_phones['phone'])

# Log each 10-digit phone that failed the LADA check
lada_mismatch = ~df_ten_digits_phones['lada_check']
invalid_lada_phones = df_ten_digits_phones.loc[lada_mismatch, 'phone']
for phone in invalid_lada_phones:
    logging.error(f"Phone number does not match LADA pattern: {phone}")


In [58]:
# lada_check is already a boolean column, so it is used directly as the row
# mask instead of being compared with True.
valid_phones_df = df_ten_digits_phones.loc[df_ten_digits_phones['lada_check']]
number_of_valid_phones = valid_phones_df.shape[0]

print(f"Number of phone numbers with exactly 10 digits & valid LADA codes: {number_of_valid_phones}")

Number of phone numbers with exactly 10 digits & valid LADA codes: 296


# 5. What’s the state with the most premium facilities as of today?

In [None]:
# Tally the premium facilities (is_premium == 1) per state
is_premium_mask = df['is_premium'] == 1
premium_counts_by_state = df.loc[is_premium_mask, 'state'].value_counts()
print(f"Counts of premium facilities by state:{premium_counts_by_state}")

# The index label holding the largest count is the leading state
most_premium_state = premium_counts_by_state.idxmax()
print(f"\nThe state with the most premium facilities is: {most_premium_state} with {premium_counts_by_state.max()} premium facilities.")


# 6. How many facilities have churned since June 2022?

In [None]:
# Churned facilities whose churn date is on or after June 1st, 2022.
# The comparison is inclusive so that churns dated exactly 2022-06-01 count as "since June 2022".
churn_since_june_df = df[(df['is_churn'] == 1) & (df['churn_since'] >= pd.to_datetime('2022-06-01'))]


In [None]:
# Preview the facilities selected as churned since June 2022
churn_since_june_df.head()

In [None]:
# Number of distinct facilities among the selected churn rows
churn_since_june_df["facility_id"].nunique()

# 7. What’s the top 3 states with the highest churn percentage?

In [None]:

# Mean of the is_churn flag per state, expressed as a percentage
churn_percentage_by_state = df.groupby('state')['is_churn'].mean().mul(100)

# Highest churn percentage first; the first three entries are the answer
sorted_states = churn_percentage_by_state.sort_values(ascending=False)
top_3_states = sorted_states.iloc[:3]

print(sorted_states, "\n\n", top_3_states)

In [None]:
# Inspect the rows of a single state ("campeche") to look at the data behind its churn percentage
campeche = df.query('state == "campeche"')
campeche

# 8. What’s the churn probability for facilities in its 5th month (at national level)?


In [None]:
# Preview the frame before deriving the tenure columns
df.head()

In [None]:
# Facilities without a churn date are measured up to the moment this cell runs
reference_date = datetime.now()

# Tenure in days: premium start -> churn date when a churn date exists,
# otherwise premium start -> reference_date
df['tenure_days'] = np.where(
    pd.notnull(df['churn_since']),
    (df['churn_since'] - df['premium_since']).dt.days,
    (reference_date - df['premium_since']).dt.days
)

# Fill NaN values that result from NaT entries with the placeholder -1
df['tenure_days'] = df['tenure_days'].fillna(-1)

# 30.44 is presumably the average month length in days (365.25 / 12).
# NOTE(review): astype(int) truncates, so the -1 placeholder ends up as month 0.
df['tenure_months'] = (df['tenure_days'] / 30.44).astype(int)


#Starts on 0 so 5th month is 4. Also, this includes facilities that churned exactly on the 5th month mark
facilities_in_5th_month = df[(df['tenure_months'] == 4)]
facilities_in_5th_month.describe()

In [None]:
# Facilities that churned while in their 5th month (tenure_months == 4)
churned_in_5th_month = facilities_in_5th_month[facilities_in_5th_month['is_churn'] == 1].shape[0]

# Facilities at risk during the 5th month: they have a premium start date and
# reached at least the 5th month. Facilities that left earlier, or that have
# not been premium long enough, cannot churn in month 5 and are excluded.
# A dedicated name is used so the premium count from question 2 is not overwritten.
at_risk_in_5th_month = df[pd.notnull(df['premium_since']) & (df['tenure_months'] >= 4)].shape[0]

# Guard against an empty at-risk group instead of raising ZeroDivisionError
if at_risk_in_5th_month:
    churn_probability_5th_month = churned_in_5th_month / at_risk_in_5th_month
else:
    churn_probability_5th_month = float('nan')

print(f"The churn probability for a facility in its 5th month is: {churn_probability_5th_month:.2%}")


# 9. What’s the top 3 states with the most invalid phone number proportion?

In [None]:
# A phone is valid only when it has exactly 10 digits AND starts with a known
# LADA code -- the same rule used to count valid phones in question 4.
# Everything else is flagged as invalid (1).
has_ten_digits = df['phone'].str.len() == 10
valid_phone_mask = has_ten_digits & check_lada_vectorized(df['phone'])

df['invalid_phone'] = (~valid_phone_mask).astype(int)

# Per state: number of invalid phones ('sum') and number of facilities ('count')
state_phone_stats = df.groupby('state')['invalid_phone'].agg(['sum', 'count'])

state_phone_stats["invalid_phone_proportion"] = state_phone_stats["sum"] / state_phone_stats["count"]

# Highest proportion of invalid phones first
top_invalid_phone_states = state_phone_stats.sort_values(by='invalid_phone_proportion', ascending=False)

top_invalid_phone_states.head(3)

# 10. What’s the average facilities lifespan?


In [None]:
# Summary statistics, now including the derived tenure columns
df.describe()

In [None]:
# tenure_days holds the placeholder -1 for facilities without a premium start
# date; those rows have no lifespan and would drag the mean down, so only
# facilities with a premium_since date are averaged.
known_tenure_days = df.loc[df["premium_since"].notna(), "tenure_days"]
avg_lifespan = known_tenure_days.mean()

print(f"The average facilities lifespan is {avg_lifespan:.0f} days.")

# 11. What’s the top 3 states with the biggest facilities?


In [None]:
# Size categories regarded as "big" facilities
big_facility_categories = ['51 a 100 personas', '101 a 250 personas', '251 y mas personas']
big_facilities_df = df[df['facility_size'].isin(big_facility_categories)]

# Rows: states, columns: big size categories, cells: number of facilities
state_category_crosstab = pd.crosstab(big_facilities_df['state'], big_facilities_df['facility_size'])

state_category_crosstab['Total'] = state_category_crosstab.sum(axis=1)

# crosstab orders its rows by state label, so rank by Total before taking the
# first three; otherwise head(3) shows the first states in label order.
state_category_crosstab = state_category_crosstab.sort_values(by='Total', ascending=False)

print(state_category_crosstab.head(3))

# 12. Do we have duplicated phone numbers?


In [None]:
# Leave out the placeholder phone '0'
phones_not_0 = df.loc[df['phone'] != '0']

# keep=False marks every row of a repeated phone, the first occurrence included
duplicated_phones = phones_not_0['phone'].duplicated(keep=False)

df_duplicated_phones = phones_not_0.loc[duplicated_phones]

# Number of rows whose phone appears more than once
number_of_duplicated_phones = duplicated_phones.sum()

print(f"There are {number_of_duplicated_phones} duplicated phone numbers, excluding the placeholder '0'.\n{df_duplicated_phones['phone']}")

# 13. What’s the top 3 valid duplicated numbers?


In [None]:
# Keep the duplicated phones that are not flagged as invalid
not_flagged_invalid = df_duplicated_phones['invalid_phone'].eq(0)
df_valid_duplicated_phones = df_duplicated_phones.loc[not_flagged_invalid]

# value_counts orders by frequency, most repeated number first
top_valid_duplicated_phones = df_valid_duplicated_phones["phone"].value_counts()

top_valid_duplicated_phones.iloc[:3]

# 14. What’s the top 3 states with the most valid duplicated numbers?


In [None]:


# Rows: states, columns: valid duplicated phone numbers, cells: occurrences
valid_duplicated_phones_crosstab = pd.crosstab(df_valid_duplicated_phones['state'], df_valid_duplicated_phones['phone'])

valid_duplicated_phones_crosstab['Total'] = valid_duplicated_phones_crosstab.sum(axis=1)

# crosstab orders its rows by state label, so rank by Total before taking the
# first three; otherwise head(3) shows the first states in label order.
valid_duplicated_phones_crosstab = valid_duplicated_phones_crosstab.sort_values(by='Total', ascending=False)

valid_duplicated_phones_crosstab.head(3)

# 15. What is the relation between valid/invalid phones with churn propensity?

## Chi-squared test

In [None]:


# Build the sample for the test:
#   * every non-placeholder phone is kept once (first occurrence), so a number
#     shared by several facilities is not counted several times;
#   * rows with the placeholder phone '0' are all kept (presumably they stand
#     for "no phone on file" rather than one shared number).
# 'phone' holds strings, so the placeholder must be compared as '0', not 0.
df_no_zeros = df[df['phone'] != '0']
df_no_zeros_dropped = df_no_zeros.drop_duplicates(subset='phone', keep='first')
df_zeros = df[df['phone'] == '0']
df_chi = pd.concat([df_no_zeros_dropped, df_zeros], ignore_index=True)

df_chi = df_chi.sort_values(by='facility_id')

# Drop rows without a churn flag from the prepared sample (not from the raw
# df, which would discard the de-duplication done just before).
df_chi = df_chi.dropna(subset=['is_churn'])

# info() prints its report itself and returns None, so it is not wrapped in print()
df_chi.info()
print("\n\n\n")

invalid_phone_counts = df_chi['invalid_phone'].value_counts()
is_churn_counts = df_chi['is_churn'].value_counts()


print(f"invalid_phone_counts: {invalid_phone_counts}\n")
print(f"is_churn_counts: {is_churn_counts}\n")

# Cross-tabulation of invalid_phone (rows) against is_churn (columns)
contingency_table = pd.crosstab(df_chi['invalid_phone'], df_chi['is_churn'])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print("Chi-squared test results:\n")
print(f"Chi-squared: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")
print("Expected frequencies:")
print(expected)

# Significance level used to decide on the relationship
alpha = 0.05
if p < alpha:
    print("There is a significant relationship between phone validity and churn propensity.")
else:
    print("There is no significant relationship between phone validity and churn propensity.")


## Logistic Regression

In [None]:
# Preview the frame that feeds the logistic regression
df.head()

In [None]:
# Work on a copy of df with one row per distinct phone (first occurrence kept).
# Rows with the placeholder phone '0' are not filtered out here.
df_logit = df.copy()
df_logit = df_logit.drop_duplicates(subset='phone', keep='first')

# Target: the churn flag, without the rows where it is missing
y = df_logit['is_churn'].dropna()

# Feature: the invalid-phone flag, restricted to the rows that kept a target value
X = df_logit.loc[y.index, ['invalid_phone']]

# Class balance of the feature and of the target
X_counts = X.value_counts()
y_counts = y.value_counts()

print(f"X_counts: {X_counts}\n\ny_counts: {y_counts}")



In [None]:



# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
_split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split_parts

# Fit a logistic regression with the library defaults on the training part
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Fitted coefficient(s) of the single feature
print(f"Coefficient for invalid_phone: {model.coef_[0]}")