In [1]:
# import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Load '../_data/renewal_app_df.csv' into 'renewal_raw_df' and, in the same
# chain, rewrite 'customer_id' as a string left-padded with zeros to 8 characters.
renewal_raw_df = (
    pd.read_csv('../_data/renewal_app_df.csv')
    .assign(customer_id=lambda frame: frame['customer_id'].astype(str).str.zfill(8))
)

In [3]:
# Keep a single row per customer_id: every row that has a later row with the
# same customer_id is flagged as a duplicate and filtered out (last one wins).
is_superseded = renewal_raw_df.duplicated(subset='customer_id', keep='last')
renewal_raw_df = renewal_raw_df[~is_superseded]

In [4]:
# Rows of 'renewal_raw_df' whose 'MOST_RECENT_ADD_DATE' compares below '2017-01-01'.
# NOTE(review): the comparison is against a string literal; presumably the column
# holds ISO-formatted (YYYY-MM-DD) date strings — confirm.
joined_before_2017 = renewal_raw_df['MOST_RECENT_ADD_DATE'] < '2017-01-01'
new_joined_before_2017_df = renewal_raw_df.loc[joined_before_2017]

In [5]:
# Rows of 'renewal_raw_df' whose 'cycle_begin_date' is exactly the string '0'.
has_zero_cycle_begin = renewal_raw_df['cycle_begin_date'].eq('0')
one_renewal_date_df = renewal_raw_df.loc[has_zero_cycle_begin]

In [6]:
# Build the exclusion list as the UNION of the two id sets: customer_ids from
# 'new_joined_before_2017_df' and customer_ids from 'one_renewal_date_df'.
# drop_duplicates() keeps one copy of an id that appears in both sets;
# keep=False would discard such ids from the list entirely, so customers
# matching BOTH criteria would not be excluded.
exclude_ids = pd.concat(
    [new_joined_before_2017_df['customer_id'], one_renewal_date_df['customer_id']]
).drop_duplicates()

# Keep only the rows of 'renewal_raw_df' whose 'customer_id' is not in 'exclude_ids'.
renewal_filtered_df = renewal_raw_df[~renewal_raw_df['customer_id'].isin(exclude_ids)]

In [7]:
# Build 'renewal_df' from 'renewal_filtered_df' with the columns listed below removed.
columns_to_discard = [
    'MOST_RECENT_ADD_DATE', 'CYCLE_BEGIN_DATE', 'CYCLE_END_DATE', 'GRACE_DATE',
    'Group Member Dues', 'PAYMENT_STATUS', 'AS_OF_DATE', 'ABASET_SUBCODE_DESCR',
    'cycle_begin_date', 'Member Dues', 'cycle_end_date', 'product_code',
    'order_no', 'order_line_no', 'grace_date', 'DOB',
    'the_rank', 'member_renewal_indicator', 'earliest_begin_date', 'order_count',
]
renewal_df = renewal_filtered_df.drop(columns=columns_to_discard)

In [8]:
# Each bundled '<group>_order' column is the row-wise sum of the raw columns
# listed for it; the raw columns are dropped once they have been summed.
order_bundles = {
    'article_order': ['Article Download', 'Journal', 'Magazine', 'Newsletter', 'Single Issue'],
    'books_order': ['Book', 'E-Book', 'Chapter Download'],
    'contribution_order': ['Contribution', 'Donation'],
    'digital_education_order': ['Webinar', 'On-Demand'],
    'ecd_misc_order': ['Course Materials Download'],
    'events_misc_order': ['Product', 'Exhibitor', 'Sponsorship Non-UBIT', 'Sponsorship UBIT'],
    'inventory_misc_order': ['Brochure', 'CD-ROM', 'Directory', 'Errata', 'Letter',
                             'Loose Leaf', 'Pamphlet', 'Standing Order'],
    'meeting_order': ['Meeting', 'Virtual Meeting', 'Invite Only Meeting', 'ABA Midyear', 'In-Person'],
    'merchandise_order': ['General Merchandise', 'Clothing'],
    'misc_order': ['Audio Download', 'Inventory Product Package'],
}

# Flat list of every raw column that feeds one of the bundles above.
bundled_source_columns = [
    source_col for source_cols in order_bundles.values() for source_col in source_cols
]

renewal_df = renewal_df.assign(
    **{
        bundle_name: renewal_df[source_cols].sum(axis=1)
        for bundle_name, source_cols in order_bundles.items()
    }
).drop(columns=bundled_source_columns)

In [9]:
# Define categories
# Two-letter codes grouped into the buckets returned by categorize_state.
territories = ['AS', 'FM', 'GU', 'MH', 'MP', 'PW', 'PR', 'VI']
# USPS military codes: Armed Forces Americas (AA), Pacific (AP) and Europe (AE).
military = ['AA', 'AP', 'AE']
# Codes in this list are kept as their own category instead of being bucketed.
top_5_states = ['CA', 'NY', 'TX', 'IL', 'FL']
northeast = ['CT', 'ME', 'MA', 'NH', 'RI', 'VT', 'NJ', 'PA']
midwest = ['IN', 'IA', 'KS', 'MI', 'MN', 'MO', 'NE', 'ND', 'OH', 'SD', 'WI']
south = ['DE', 'GA', 'KY', 'MD', 'NC', 'SC', 'TN', 'VA', 'WV', 'AL', 'MS', 'AR', 'LA', 'OK', 'DC']
west = ['AK', 'AZ', 'CO', 'HI', 'ID', 'MT', 'NV', 'NM', 'OR', 'UT', 'WA', 'WY']

# Function to categorize
def categorize_state(state):
    """Map a raw state value to its category label.

    The value is converted to ``str``, stripped of surrounding whitespace and
    upper-cased before lookup, so inputs such as ``' ca '`` or ``'ny'`` match.

    Parameters
    ----------
    state : object
        Raw value from the STATE column (missing values are allowed).

    Returns
    -------
    str
        The normalised code itself when it is in ``top_5_states``; otherwise
        one of 'Northeast', 'Midwest', 'South', 'West', 'Territories',
        'Overseas military'; or 'International' for anything unrecognised
        (including missing values).
    """
    # Every membership test uses the normalised code, not the raw value.
    clean_state = str(state).strip().upper()
    if clean_state in top_5_states:
        return clean_state
    elif clean_state in northeast:
        return 'Northeast'
    elif clean_state in midwest:
        return 'Midwest'
    elif clean_state in south:
        return 'South'
    elif clean_state in west:
        return 'West'
    elif clean_state in territories:
        return 'Territories'
    elif clean_state in military:
        return 'Overseas military'
    else:
        return 'International'

# Overwrite STATE with the category label of each raw value.
# Note: re-running this cell feeds the labels back through categorize_state,
# which returns 'International' for anything it does not recognise.
state_labels = renewal_df['STATE'].apply(categorize_state)
renewal_df['STATE'] = state_labels

In [10]:
# Practice labels that are kept as-is; every other value is collapsed to 'Other'.
practice_type = ['0', 'Solo Practitioner', 'Government', 'Small Firm (2-5 Attorneys)', 'Corporate',
            'Private Practice (6+ Attorneys)', 'Non-Profit','Public Interest/Legal Aid']
# Function to categorize practice
def categorize_practice(practice):
    """Map a raw practice description to a label in ``practice_type`` or 'Other'.

    The value is converted to ``str`` and compared to each known label
    ignoring surrounding whitespace and letter case, so ``' government '``
    matches ``'Government'``.

    Parameters
    ----------
    practice : object
        Raw value from the ABASET_CODE_DESCR column (missing values are allowed).

    Returns
    -------
    str
        The matching label exactly as spelled in ``practice_type``, or
        'Other' when nothing matches.
    """
    clean_practice = str(practice).strip().lower()
    for label in practice_type:
        # Lower-case the reference label too; the list itself is mixed-case.
        if label.lower() == clean_practice:
            return label
    return 'Other'
    
# Overwrite ABASET_CODE_DESCR with the label categorize_practice returns for each value.
practice_labels = renewal_df['ABASET_CODE_DESCR'].apply(categorize_practice)
renewal_df['ABASET_CODE_DESCR'] = practice_labels

In [11]:
# Normalise column names: lower-case, then spaces and hyphens become underscores.
renewal_df.columns = (
    renewal_df.columns
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('-', '_')
)

In [12]:
# Columns removed because the EDA report flagged them as imbalanced.
imbalanced_columns = [
    'events_cle', 'misc_order', 'disability_indicator', 'ethnicity_code',
    'auto_enroll_section_count', 'gender_code', 'descr',
]
renewal_df = renewal_df.drop(columns=imbalanced_columns)

#### Frequency Encoding

In [14]:
# Categorical columns that receive a '<name>_encoded' companion column.
categorical_cols = ['abaset_code_descr', 'state']

# Frequency encoding: each category is replaced by its relative frequency
# (share of rows) in the column.
for col in categorical_cols:
    frequency = renewal_df[col].value_counts(normalize=True)
    renewal_df[f'{col}_encoded'] = renewal_df[col].map(frequency)

In [15]:
# Numeric columns whose skewness is inspected before the log transform.
columns_to_check = [
    'dues_required_section_count',
    'no_charge_section_count',
    'member_groups',
    'article',
    'books',
    'on_demand_video',
    'news_aba',
    'podcast',
    'aba_advantage',
    'article_order',
    'age',
    'books_order',
    'contribution_order',
    'digital_education_order',
    'ecd_misc_order',
    'events_misc_order',
    'inventory_misc_order',
    'meeting_order',
    'merchandise_order',
]

In [16]:
# Rule of thumb used for the skewness values below:
#   |skew| < 0.5 : fairly symmetrical (no transformation needed)
#   0.5 to 1     : moderately skewed (may need transformation)
#   > 1          : highly skewed (log transformation)
skewness = renewal_df[columns_to_check].skew()

# Select every column whose skewness exceeds 0.5.
columns_to_log_transform = skewness.loc[skewness.gt(0.5)].index.tolist()
print(
    "Columns selected for log transformation based on skewness > 0.5:",
    columns_to_log_transform,
    sep="\n",
)

# Work on a copy so 'renewal_df' keeps the untransformed values.
renewal_df_log = renewal_df.copy()
log_scaled = renewal_df_log[columns_to_log_transform].apply(np.log1p)
renewal_df_log[columns_to_log_transform] = log_scaled

Columns selected for log transformation based on skewness > 0.5:
['dues_required_section_count', 'no_charge_section_count', 'member_groups', 'article', 'books', 'on_demand_video', 'news_aba', 'podcast', 'aba_advantage', 'article_order', 'age', 'books_order', 'contribution_order', 'digital_education_order', 'ecd_misc_order', 'events_misc_order', 'inventory_misc_order', 'meeting_order', 'merchandise_order']


In [17]:
# Raw categorical columns; their '_encoded' counterparts stay in the frame.
already_encoded_columns = ['abaset_code_descr', 'state']
# Columns that ranked low on feature importance in the RFE feature selection.
low_importance_columns = [
    'ecd_misc_order', 'events_misc_order', 'inventory_misc_order', 'merchandise_order',
    'article_order', 'news_aba', 'contribution_order', 'podcast', 'books_order',
    'on_demand_video',
]
renewal_df_log = renewal_df_log.drop(columns=already_encoded_columns + low_importance_columns)

In [18]:
# Preview the first five rows of the transformed frame.
renewal_df_log.head(n=5)

Unnamed: 0,customer_id,dues_required_section_count,no_charge_section_count,member_renewed_indicator,member_groups,article,books,aba_advantage,age,digital_education_order,meeting_order,abaset_code_descr_encoded,state_encoded
0,4388002,0.0,0.0,1.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.103893,0.066856
1,5844918,0.693147,0.693147,1.0,1.94591,2.397895,0.0,0.0,0.0,1.098612,0.0,0.101171,0.280795
2,3240565,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,3.637586,0.0,0.0,0.0878,0.280795
3,3215169,0.693147,0.0,0.0,1.098612,0.0,0.0,0.0,3.496508,0.0,0.0,0.103893,0.113123
5,3208841,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,3.401197,0.0,0.0,0.083895,0.10555
