# Customer Retention Analysis 

### Read in the data and take a peek

In [1]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('dummy_customer_file.csv')

# Display the first few rows to verify the data loaded correctly
print(df.head())



               oid provider  total_charges signup_date is_canceled is_active  \
0  273624174263463    apple          10.94  2021-04-20        True     False   
1  532534633483904    apple          10.92  2021-04-20        True     False   
2  623527156674739    apple           0.00  2021-04-20       False     False   
3  441616357320658    apple          90.30  2021-04-20       False      True   
4  189644194376891    apple          79.98  2021-04-20       False      True   

  is_delinquent conversion_date              cancellation_date  current_mrr  \
0         False      2021-04-20  2021-05-20T12:00:00.000+00:00         0.00   
1         False      2021-04-20  2021-06-20T12:00:00.000+00:00         0.00   
2         False             NaN                            NaN         0.00   
3         False      2021-04-27                            NaN         3.47   
4         False      2021-05-04                            NaN         3.33   

                         personal_person_geo

  df = pd.read_csv('dummy_customer_file.csv')


In [2]:
# Basic information about the DataFrame
print("\nDataFrame Info:")
print(df.info())


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134612 entries, 0 to 134611
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   oid                          134610 non-null  object 
 1   provider                     134610 non-null  object 
 2   total_charges                134610 non-null  float64
 3   signup_date                  134610 non-null  object 
 4   is_canceled                  134610 non-null  object 
 5   is_active                    134610 non-null  object 
 6   is_delinquent                134610 non-null  object 
 7   conversion_date              80759 non-null   object 
 8   cancellation_date            33518 non-null   object 
 9   current_mrr                  134610 non-null  float64
 10  personal_person_geo_country  105503 non-null  object 
 11  converted                    134610 non-null  object 
dtypes: float64(2), object(10)
memory usage: 1

### Look and clean up NaN values
- Clean up only the rows where all columns are null

In [3]:
# Check for any remaining missing values
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
oid                                 2
provider                            2
total_charges                       2
signup_date                         2
is_canceled                         2
is_active                           2
is_delinquent                       2
conversion_date                 53853
cancellation_date              101094
current_mrr                         2
personal_person_geo_country     29109
converted                           2
dtype: int64


In [4]:
#Look at rows where oid is invalid, entire row shows up invalid
#null_rows = df[df.isnull().any(axis=1)]
null_rows = df[df['oid'].isnull()]
print("\nRows with missing values:")
print(null_rows)


Rows with missing values:
        oid provider  total_charges signup_date is_canceled is_active  \
134610  NaN      NaN            NaN         NaN         NaN       NaN   
134611  NaN      NaN            NaN         NaN         NaN       NaN   

       is_delinquent conversion_date cancellation_date  current_mrr  \
134610           NaN             NaN               NaN          NaN   
134611           NaN             NaN               NaN          NaN   

       personal_person_geo_country converted  
134610                         NaN       NaN  
134611                         NaN       NaN  


In [5]:
#Remove the 2 rows that are completely invalid, bad data sourcing 
# Drop rows where 'oid' is null, keep all columns
df2 = df.copy()

df2 = df2.dropna(subset=['oid'])

# Optional: verify the change
print(f"Original DataFrame shape: {df.shape}")
print(f"New DataFrame shape: {df2.shape}")

Original DataFrame shape: (134612, 12)
New DataFrame shape: (134610, 12)


### Now lets make all the date time columns converted to make easier
- Also removed the uniform timestamp to simplify 

In [6]:
# Convert date columns to proper datetime format, ditch the timestamp on cancellation date
df2['signup_date'] = pd.to_datetime(df2['signup_date'])
df2['conversion_date'] = pd.to_datetime(df2['conversion_date'])
df2['cancellation_date'] = pd.to_datetime(df2['cancellation_date'])
df2['cancellation_date'] = df2['cancellation_date'].dt.date
df2['cancellation_date'] = pd.to_datetime(df2['cancellation_date'])


### Lets standardize the cuountries in the personal_person_geo_country column

- Just going to make all of them equal to the more popular one
- Had AI scan the csv of the columna and return mapping logic


In [7]:
# Set pandas display options to show all rows
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Don't wrap long strings
pd.set_option('display.max_colwidth', None)  # Show full content of each column

# Now print your value counts
print(df2['personal_person_geo_country'].value_counts())
print(df2['personal_person_geo_country'].nunique())

# Optional: Reset to defaults after viewing
pd.reset_option('display.max_rows')

personal_person_geo_country
United States of America                                39021
United Kingdom of Great Britain and Northern Ireland     8541
Germany                                                  8362
France                                                   4301
Canada                                                   4018
Italy                                                    3126
Australia                                                3105
Russian Federation                                       3048
Japan                                                    2166
Switzerland                                              1653
Netherlands                                              1607
Spain                                                    1531
China                                                    1482
Sweden                                                   1259
United States                                            1206
Brazil                                    

In [8]:


# Create a mapping of non-standard country names to their standardized versions
# Using the more popular version of each country name
country_mapping = {
    'United States': 'United States of America',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'Russia': 'Russian Federation',
    'Turkey': 'Türkiye',  # Note: Here 'Turkey' is more popular, but 'Türkiye' is the official name now
    'Vietnam': 'Viet Nam',
    'Taiwan': 'Taiwan, Province of China',
    'Bolivia': 'Bolivia (Plurinational State of)',
    'Venezuela': 'Venezuela (Bolivarian Republic of)',
    'Macedonia (FYROM)': 'North Macedonia',
    'Bahamas': 'The Bahamas',
    'South Korea': 'Korea (Republic of)'
}

# Apply the mapping to standardize country names
df2['personal_person_geo_country'] = df2['personal_person_geo_country'].replace(country_mapping)
df2['personal_person_geo_country'].nunique()



160

In [9]:
df2['oid'].nunique()

134602

In [10]:
# Find all rows where 'oid' appears more than once
duplicate_rows = df2[df2['oid'].duplicated(keep=False)]

# Display these duplicate rows
print(duplicate_rows.count())
print("\nRows with duplicate 'oid' values:")
print(duplicate_rows)


# Alternatively, to sort them by 'oid' for easier viewing:
print(duplicate_rows.sort_values('oid'))

oid                            14
provider                       14
total_charges                  14
signup_date                    14
is_canceled                    14
is_active                      14
is_delinquent                  14
conversion_date                12
cancellation_date               3
current_mrr                    14
personal_person_geo_country    14
converted                      14
dtype: int64

Rows with duplicate 'oid' values:
             oid provider  total_charges signup_date is_canceled is_active  \
59545   1.68E+15    apple          71.91  2022-05-04       False      True   
59645  1.736E+15    apple           0.00  2022-05-04       False     False   
59670  1.736E+15    apple           9.85  2022-05-04        True     False   
59747   1.68E+15    apple           0.74  2022-05-05       False      True   
59783   1.68E+15    apple          59.99  2022-05-05       False      True   
61994  1.743E+15    apple          55.93  2022-05-17       False      True  

### This seems like good general cleaning, can come back and add to this file if we find any other data quality concerns

- If any of the above code was to be added to we should document change to understand how the report tables changed over time

In [11]:
import pandas as pd
import sqlite3
from datetime import datetime
import os

# Assuming df2 is your DataFrame with the processed country data
# First, get the latest sign up date to use in the table name
latest_signup_date = df2['signup_date'].max()  # Adjust column name if needed
formatted_date = latest_signup_date.strftime('%Y%m%d')  # Format as YYYYMMDD

# Define the table name with the date
table_name = f"subscription_data_{formatted_date}"

# Path to your SQLite database
db_path = "Subscriptions.db"  # Adjust to your actual database path

# Check if database exists
db_exists = os.path.exists(db_path)

# Connect to the SQLite database (creates it if it doesn't exist)
conn = sqlite3.connect(db_path)

# Check if the table exists and drop it if it does
cursor = conn.cursor()
cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
conn.commit()

# Write the DataFrame to the SQLite table
df2.to_sql(name=table_name, con=conn, if_exists='replace', index=False)

# Close the connection
conn.close()

print(f"Data successfully written to table '{table_name}' in {db_path}")

Data successfully written to table 'subscription_data_20230116' in Subscriptions.db


## Now just check to make sure table was uploaded correctly 

In [12]:
import sqlite3

# Path to your SQLite database
db_path = "Subscriptions.db"  # Adjust to your actual database path

# Connect to the database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# 1. List all tables in the database
print("All tables in the database:")
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
for table in tables:
    print(table[0])

# 2. Get the head of your newly created table
# (assuming the table name pattern is subscription_data_YYYYMMDD)
# Find the subscription_data table
subscription_tables = [table[0] for table in tables if table[0].startswith('subscription_data_')]

if subscription_tables:
    # Get the most recent table (should be the one you just created)
    latest_table = sorted(subscription_tables)[-1]
    
    print(f"\nFirst 5 rows from table '{latest_table}':")
    cursor.execute(f"SELECT * FROM {latest_table} LIMIT 5;")
    
    # Get column names
    column_names = [description[0] for description in cursor.description]
    print("Columns:", column_names)
    
    # Print rows
    rows = cursor.fetchall()
    for row in rows:
        print(row)
    
    # Also show a count of total rows
    cursor.execute(f"SELECT COUNT(*) FROM {latest_table};")
    row_count = cursor.fetchone()[0]
    print(f"\nTotal rows in '{latest_table}': {row_count}")
    
     # Add column count
    column_count = len(column_names)
    print(f"Total columns in '{latest_table}': {column_count}")
else:
    print("No subscription_data tables found.")

# Close the connection
conn.close()

All tables in the database:
subscription_data_20230116

First 5 rows from table 'subscription_data_20230116':
Columns: ['oid', 'provider', 'total_charges', 'signup_date', 'is_canceled', 'is_active', 'is_delinquent', 'conversion_date', 'cancellation_date', 'current_mrr', 'personal_person_geo_country', 'converted']
('273624174263463', 'apple', 10.94, '2021-04-20 00:00:00', 1, 0, 0, '2021-04-20 00:00:00', '2021-05-20 00:00:00', 0.0, 'France', 1)
('532534633483904', 'apple', 10.92, '2021-04-20 00:00:00', 1, 0, 0, '2021-04-20 00:00:00', '2021-06-20 00:00:00', 0.0, 'United Kingdom of Great Britain and Northern Ireland', 1)
('623527156674739', 'apple', 0.0, '2021-04-20 00:00:00', 0, 0, 0, None, None, 0.0, 'Germany', 0)
('441616357320658', 'apple', 90.3, '2021-04-20 00:00:00', 0, 1, 0, '2021-04-27 00:00:00', None, 3.47, 'Switzerland', 1)
('189644194376891', 'apple', 79.98, '2021-04-20 00:00:00', 0, 1, 0, '2021-05-04 00:00:00', None, 3.33, 'United States of America', 1)

Total rows in 'subscrip