# Customer Analysis - Part 2

## Activity 1 - General cleaning

Import the libraries

In [2]:
import pandas as pd

Import the data from the file:

In [51]:
# Load the raw customer data; the file is comma-separated despite its .txt extension.
DATA_PATH = "Data_Marketing_Customer_Analysis_Round2.txt"
df = pd.read_csv(DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10910 entries, 0 to 10909
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     10910 non-null  int64  
 1   Customer                       10910 non-null  object 
 2   State                          10279 non-null  object 
 3   Customer Lifetime Value        10910 non-null  float64
 4   Response                       10279 non-null  object 
 5   Coverage                       10910 non-null  object 
 6   Education                      10910 non-null  object 
 7   Effective To Date              10910 non-null  object 
 8   EmploymentStatus               10910 non-null  object 
 9   Gender                         10910 non-null  object 
 10  Income                         10910 non-null  int64  
 11  Location Code                  10910 non-null  object 
 12  Marital Status                 10910 non-null 

Dropping the "Unnamed: 0" and "Customer" columns

In [52]:
# Discard the leftover CSV index and the customer identifier; rebinding instead of
# mutating in place keeps the step readable as "old frame -> new frame".
df = df.drop(columns=["Unnamed: 0", "Customer"])

Standardizing header names

In [53]:
# Lower-case every header and replace spaces with underscores.
# Headers without spaces (e.g. "EmploymentStatus") are only lower-cased.
new_columns = df.columns.map(lambda name: name.lower().replace(" ", "_"))
df.columns = new_columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10910 entries, 0 to 10909
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          10279 non-null  object 
 1   customer_lifetime_value        10910 non-null  float64
 2   response                       10279 non-null  object 
 3   coverage                       10910 non-null  object 
 4   education                      10910 non-null  object 
 5   effective_to_date              10910 non-null  object 
 6   employmentstatus               10910 non-null  object 
 7   gender                         10910 non-null  object 
 8   income                         10910 non-null  int64  
 9   location_code                  10910 non-null  object 
 10  marital_status                 10910 non-null  object 
 11  monthly_premium_auto           10910 non-null  int64  
 12  months_since_last_claim        10277 non-null 

Drop duplicates, and reset index

In [54]:
# Keep the first occurrence of each fully identical row. The surviving rows keep
# their original labels, so the index has gaps until it is reset.
# NOTE(review): the customer id column was dropped before this step, so rows are
# compared on attributes only — confirm the removed rows are true duplicates.
df = df.drop_duplicates(keep="first")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10776 entries, 0 to 10909
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          10163 non-null  object 
 1   customer_lifetime_value        10776 non-null  float64
 2   response                       10163 non-null  object 
 3   coverage                       10776 non-null  object 
 4   education                      10776 non-null  object 
 5   effective_to_date              10776 non-null  object 
 6   employmentstatus               10776 non-null  object 
 7   gender                         10776 non-null  object 
 8   income                         10776 non-null  int64  
 9   location_code                  10776 non-null  object 
 10  marital_status                 10776 non-null  object 
 11  monthly_premium_auto           10776 non-null  int64  
 12  months_since_last_claim        10153 non-null 

In [55]:
# Renumber the rows 0..n-1. With drop=False the old labels are kept as an "index"
# column, which the following cell removes again.
# NOTE(review): reset_index(drop=True) would make that follow-up drop unnecessary.
df = df.reset_index(drop=False)

In [56]:
# Remove the "index" column created by reset_index(), then display the frame.
df = df.drop(columns=["index"])
df

Unnamed: 0,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,location_code,...,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size,vehicle_type
0,Arizona,4809.216960,No,Basic,College,2/18/11,Employed,M,48029,Suburban,...,0.0,9,Corporate Auto,Corporate L3,Offer3,Agent,292.800000,Four-Door Car,Medsize,
1,California,2228.525238,No,Basic,College,1/18/11,Unemployed,F,0,Suburban,...,0.0,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,
2,Washington,14947.917300,No,Basic,Bachelor,2/10/11,Employed,M,22139,Suburban,...,0.0,2,Personal Auto,Personal L3,Offer3,Call Center,480.000000,SUV,Medsize,A
3,Oregon,22332.439460,Yes,Extended,College,1/11/11,Employed,M,49078,Suburban,...,0.0,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A
4,Oregon,9025.067525,No,Premium,Bachelor,1/17/11,Medical Leave,F,23675,Suburban,...,,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10771,Nevada,15563.369440,No,Premium,Bachelor,1/19/11,Unemployed,F,0,Suburban,...,,7,Personal Auto,Personal L1,Offer3,Web,1214.400000,Luxury Car,Medsize,A
10772,Oregon,5259.444853,No,Basic,College,1/6/11,Employed,F,61146,Urban,...,0.0,6,Personal Auto,Personal L3,Offer2,Branch,273.018929,Four-Door Car,Medsize,A
10773,Arizona,23893.304100,No,Extended,Bachelor,2/6/11,Employed,F,39837,Rural,...,0.0,2,Corporate Auto,Corporate L3,Offer1,Web,381.306996,Luxury SUV,Medsize,
10774,California,11971.977650,No,Premium,College,2/13/11,Employed,F,64195,Urban,...,4.0,6,Personal Auto,Personal L1,Offer1,Branch,618.288849,SUV,Medsize,A


In [57]:
# vehicle_type shows missing values in the preview above — presumably too sparse
# to be useful; TODO confirm the null share before dropping.
df = df.drop(columns=["vehicle_type"])

### Replacing null values

In [58]:
# Replace missing complaint counts with 0.
open_complaints = df["number_of_open_complaints"]
df["number_of_open_complaints"] = open_complaints.fillna(0)

In [59]:
# Impute missing "months since last claim" with the column mean, rounded to a
# whole number of months.
months_since_claim = df["months_since_last_claim"]
mean_months = round(months_since_claim.mean())
df["months_since_last_claim"] = months_since_claim.fillna(mean_months)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10776 entries, 0 to 10775
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          10163 non-null  object 
 1   customer_lifetime_value        10776 non-null  float64
 2   response                       10163 non-null  object 
 3   coverage                       10776 non-null  object 
 4   education                      10776 non-null  object 
 5   effective_to_date              10776 non-null  object 
 6   employmentstatus               10776 non-null  object 
 7   gender                         10776 non-null  object 
 8   income                         10776 non-null  int64  
 9   location_code                  10776 non-null  object 
 10  marital_status                 10776 non-null  object 
 11  monthly_premium_auto           10776 non-null  int64  
 12  months_since_last_claim        10776 non-null 

## Activity 2

Bucketing the data - Write a function that maps the "State" column to different zones: California as West Region, Oregon as North West, Washington as East, and Arizona and Nevada as Central.

In [72]:
# Work on a copy so the cleaned frame from Activity 1 stays untouched.
df1 = df.copy()

In [73]:
# List the distinct states to bucket; value_counts() leaves out missing values
# by default, so rows with a null state are not counted here.
df1["state"].value_counts()

California    3512
Oregon        2861
Arizona       1918
Nevada         987
Washington     885
Name: state, dtype: int64

In [74]:
def region(x):
    """Map a state name to its sales region.

    Parameters
    ----------
    x : str or missing value
        State name as found in the ``state`` column.

    Returns
    -------
    str or missing value
        The region label for ``x``. Missing inputs (NaN, None, pd.NA) are
        returned unchanged so that nulls stay nulls in the new column.

    Raises
    ------
    KeyError
        If ``x`` is a non-missing value with no region assigned.
    """
    # pd.isna covers NaN, None and pd.NA; the previous `x != x` test only
    # recognised float NaN and failed on the other missing-value markers.
    if pd.isna(x):
        return x

    regions = {
        "California": "West Region",
        "Oregon": "North West",
        "Washington": "East",
        "Arizona": "Central",
        "Nevada": "Central",
    }
    return regions[x]
    
# Derive the region for every row from its state and store it in a new column.
state_series = df1["state"]
df1["region"] = state_series.map(region)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10776 entries, 0 to 10775
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          10163 non-null  object 
 1   customer_lifetime_value        10776 non-null  float64
 2   response                       10163 non-null  object 
 3   coverage                       10776 non-null  object 
 4   education                      10776 non-null  object 
 5   effective_to_date              10776 non-null  object 
 6   employmentstatus               10776 non-null  object 
 7   gender                         10776 non-null  object 
 8   income                         10776 non-null  int64  
 9   location_code                  10776 non-null  object 
 10  marital_status                 10776 non-null  object 
 11  monthly_premium_auto           10776 non-null  int64  
 12  months_since_last_claim        10776 non-null 

In [75]:
# Check the bucketing: "Central" should equal the Arizona and Nevada counts combined.
df1["region"].value_counts()

West Region    3512
Central        2905
North West     2861
East            885
Name: region, dtype: int64

## Activity 3

Datetime format - Extract the months from the dataset and store them in a separate column. Then filter the data to show only the information for the first quarter, i.e. January, February and March. Hint: If data from March does not exist, consider only January and February.

In [81]:
# Work on a copy so the date columns added below do not alter df1.
df2 = df1.copy()


In [83]:
# Inspect the raw date strings (dtype object) before parsing them.
df2["effective_to_date"]

0        2/18/11
1        1/18/11
2        2/10/11
3        1/11/11
4        1/17/11
          ...   
10771    1/19/11
10772     1/6/11
10773     2/6/11
10774    2/13/11
10775     1/8/11
Name: effective_to_date, Length: 10776, dtype: object

In [94]:
# Parse the "M/D/YY" strings into timestamps and store them in a new column.
# Passing the format explicitly avoids per-value format inference and removes
# any day/month ambiguity for values such as "1/6/11".
df2["date_column"] = pd.to_datetime(df2["effective_to_date"], format="%m/%d/%y")
df2["date_column"].value_counts()

2011-01-27    238
2011-01-10    236
2011-01-17    220
2011-01-26    217
2011-02-14    217
2011-01-31    217
2011-01-19    212
2011-01-03    211
2011-02-27    202
2011-01-28    199
2011-02-04    198
2011-01-21    198
2011-02-26    197
2011-01-20    195
2011-02-19    195
2011-02-07    194
2011-01-11    193
2011-02-03    193
2011-01-05    190
2011-02-22    189
2011-02-28    188
2011-01-02    186
2011-02-10    184
2011-02-11    183
2011-02-18    183
2011-02-25    182
2011-01-18    182
2011-02-12    181
2011-02-02    181
2011-01-23    180
2011-01-29    179
2011-01-08    179
2011-01-14    178
2011-02-01    178
2011-02-05    178
2011-01-09    177
2011-02-09    177
2011-01-24    176
2011-02-21    175
2011-01-07    175
2011-02-23    173
2011-01-15    172
2011-01-16    172
2011-01-13    171
2011-02-06    170
2011-01-01    168
2011-01-30    168
2011-02-13    167
2011-02-16    166
2011-01-25    166
2011-02-24    162
2011-02-17    160
2011-01-06    160
2011-02-20    156
2011-02-15    155
2011-02-08

In [95]:
# Extract the calendar month (1-12) with the vectorised .dt accessor instead of
# calling a Python lambda once per row.
df2["month_column"] = df2["date_column"].dt.month

In [96]:
# Count rows per month to see which months are present in the data.
df2["month_column"].value_counts()

1    5737
2    5039
Name: month_column, dtype: int64