In [80]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [81]:
# Read the five raw CSV exports into df1..df5, in the order the files are listed
df1, df2, df3, df4, df5 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'leads_basic_details.csv',
        'leads_demo_watched_details.csv',
        'leads_interaction_details.csv',
        'leads_reasons_for_no_interest.csv',
        'sales_managers_assigned_leads_details.csv',
    )
)

# Data Cleaning

In [82]:
# Report (rows, columns) for each of the five datasets
print('SHAPE OF DATASETS \n')
for dataset_label, frame in (
    ('leads_basic_details: ', df1),
    ('leads_demo_watched_details', df2),
    ('leads_interaction_details', df3),
    ('leads_reasons_for_no_interest', df4),
    ('sales_managers_assigned_leads_details', df5),
):
    print(dataset_label, frame.shape)

SHAPE OF DATASETS 

leads_basic_details:  (360, 7)
leads_demo_watched_details (194, 4)
leads_interaction_details (2192, 6)
leads_reasons_for_no_interest (294, 4)
sales_managers_assigned_leads_details (360, 5)


In [83]:
# Count fully identical rows in each of the five datasets
print('DUPLICATES COUNT \n')
for dataset_label, frame in (
    ('leads_basic_details: ', df1),
    ('leads_demo_watched_details: ', df2),
    ('leads_interaction_details: ', df3),
    ('leads_reasons_for_no_interest: ', df4),
    ('sales_managers_assigned_leads_details: ', df5),
):
    print(dataset_label, frame.duplicated().sum())

DUPLICATES COUNT 

leads_basic_details:  0
leads_demo_watched_details:  0
leads_interaction_details:  438
leads_reasons_for_no_interest:  0
sales_managers_assigned_leads_details:  0


> Notice that there are duplicate records in leads_interaction_details dataset.
> Will need to drop these records before using the data for analysis.

In [84]:
# Discard fully identical interaction rows, then confirm none are left
df3 = df3.drop_duplicates()
print('leads_interaction_details: ', df3.duplicated().sum())

leads_interaction_details:  0


In [None]:
# Per-column count of missing values for each of the five datasets
print('NULL VALUE COUNT\n')
for dataset_label, frame in (
    ('leads_basic_details:\n\n', df1),
    ('leads_demo_watched_details:\n\n', df2),
    ('leads_interaction_details:\n\n', df3),
    ('leads_reasons_for_no_interest:\n\n', df4),
    ('sales_managers_assigned_leads_details:\n\n', df5),
):
    print(dataset_label, frame.isnull().sum())

> Look for null values, duplicates, and inconsistencies in the data
> Start with dataset from df1 through df5

### Dataframe df1

In [None]:
# Column names, non-null counts and dtypes of df1 (leads_basic_details)
df1.info()

In [None]:
# Summary statistics (count, mean, quartiles, min/max) for df1
df1.describe()

> There is a huge gap between the 75% value and the max value. An age of 211 years is unrealistic and appears to be an outlier. Let's see whether the plot confirms this.

In [None]:
# Box plot of lead ages on a 13x5 figure; y ticks are fixed at every 20 units
# before the plot is drawn (the order of these calls is kept on purpose).
plt.figure(figsize=(13, 5))
plt.suptitle("Distribution of age", fontsize=20, fontweight=5, color='Red')
ticks = np.arange(0, 200, 20)
label = [str(tick) for tick in ticks]
plt.yticks(ticks, label)
sns.boxplot(y=df1['age'])

plt.show()

> The plot above shows two outliers. These must be removed before the data can be used for further analysis.

In [None]:
# Separating age outliers into their own dataset.
# Use the same rule the box plot uses for its fliers: values above the upper
# whisker fence, Q3 + 1.5 * IQR. A fixed 90th-percentile cut-off selects rows
# by construction (up to ~10% of the data) whether or not they are implausible,
# and only isolates the true outliers when many ages tie at that percentile.
age_q1, age_q3 = df1['age'].quantile([0.25, 0.75])
age_upper_fence = age_q3 + 1.5 * (age_q3 - age_q1)
df1_outlier = df1[df1['age'] > age_upper_fence]
# Displaying the outliers
df1_outlier

In [None]:
# Removing the outliers from the df1 dataset.
# Drop every row flagged in df1_outlier by its index label, rather than only
# index[0] and index[1]: that raised IndexError when fewer than two rows were
# flagged and left outliers behind when more than two were.
df1 = df1.drop(index=df1_outlier.index)

In [None]:
# (rows, columns) of df1 after the outlier rows were dropped
df1.shape

In [None]:
# Box plot of lead ages again, now that the outlier rows are gone.
# Ticks are fixed at every integer from 0 to 39 before drawing.
plt.figure(figsize=(13, 5))
plt.suptitle("Distribution of age", fontsize=20, fontweight=5, color='Red')
ticks = np.arange(0, 40, 1)
label = [str(tick) for tick in ticks]
plt.yticks(ticks, label)
sns.boxplot(y=df1['age'])
plt.show()

> The above shows that outliers are no longer present in df1 data.

> Majority of the students are between the ages 18 and 24 years.

In [None]:
# Number of df1 rows for each distinct gender value
df1.gender.value_counts()

In [None]:
# Column labels available in df1
df1.columns

In [None]:
# Number of df1 rows for each distinct current_education value
df1.current_education.value_counts()

> 'Intermediate' and 'Intermediate Completed' are the same.

> Therefore rename one of these values to draw more accurate results in further analysis.

In [None]:
# Fold 'Intermediate Completed' into 'Intermediate' so both spellings count as one category.
# Assign the result back to the column: calling replace(inplace=True) on the
# selected column is chained in-place assignment, which pandas warns about and
# which has no effect on df1 once Copy-on-Write is enabled.
df1['current_education'] = df1['current_education'].replace(
    {'Intermediate Completed': 'Intermediate'}
)

### Dataframe df2

In [None]:
# Column names, non-null counts and dtypes of df2 (leads_demo_watched_details)
df2.info()

In [None]:
# Summary statistics (count, mean, quartiles, min/max) for df2
df2.describe()

> The maximum watched_percentage is above 100, which is not a realistic value. Let's see whether the plot confirms the outliers in this field.

In [None]:
# Box plot of watched_percentage on a 13x5 figure; y ticks are fixed at every
# 20 units before the plot is drawn (the order of these calls is kept on purpose).
plt.figure(figsize=(13, 5))
plt.suptitle("Distribution of Watched_Percentage", fontsize=25, fontweight=5, color='Red')
ticks = np.arange(0, 200, 20)
label = [str(tick) for tick in ticks]
plt.yticks(ticks, label)
sns.boxplot(y=df2['watched_percentage'])

plt.show()

> The plot shows two outliers above 100%. These outliers must be removed from the dataset before it can be used for analysis.

In [None]:
# A watched percentage above 100 is treated as an outlier; collect those rows
watched_above_100 = df2['watched_percentage'] > 100
df2_outlier = df2.loc[watched_above_100]
df2_outlier

In [None]:
# Removing the outliers from the df2 dataset.
# Drop every row flagged in df2_outlier by its index label, rather than only
# index[0] and index[1]: that raised IndexError when fewer than two rows were
# flagged and left outliers behind when more than two were.
df2 = df2.drop(index=df2_outlier.index)
df2.shape

In [None]:
# Box plot of watched_percentage again, now that the >100 rows are gone.
# Ticks are fixed at every 20 units before drawing.
plt.figure(figsize=(13, 5))
plt.suptitle("Distribution of Watched_Percentage", fontsize=25, fontweight=5, color='Black')
ticks = np.arange(0, 200, 20)
label = [str(tick) for tick in ticks]
plt.yticks(ticks, label)
sns.boxplot(y=df2['watched_percentage'])

plt.show()

In [None]:
# demo_watched_date is not used in the rest of the analysis; remove the column
df2 = df2.drop(columns=['demo_watched_date'])

In [None]:
# Display df2 as it stands at this point (renders the whole frame)
df2

### Dataframe df3

In [None]:
# Column names, non-null counts and dtypes of df3 (leads_interaction_details)
df3.info()

In [None]:
# Remove the call_done_date column, then preview the first rows
df3 = df3.drop(columns=['call_done_date'])
df3.head()

In [None]:
# Count the rows that are now exact duplicates, print that count, then discard them
duplicate_row_count = df3.duplicated().sum()
print(duplicate_row_count)
df3 = df3.drop_duplicates()

In [None]:
# Renaming the stages according to the last stage each lead was in.
# Assign the result back to the column: calling replace(inplace=True) on the
# selected column is chained in-place assignment, which pandas warns about and
# which has no effect on df3 once Copy-on-Write is enabled.
lead_stage_labels = {
    'lead': 'Lead Stage',
    'lead-awareness': 'Awareness Stage',
    'lead-awareness-consideration': 'Consideration Stage',
    'lead-awareness-consideration-conversion': 'Conversion Stage',
}
df3['lead_stage'] = df3['lead_stage'].replace(lead_stage_labels)

In [None]:
# Verify the renaming: number of df3 rows in each lead_stage value
df3.lead_stage.value_counts()

### Dataframe df4

In [None]:
# Column names, non-null counts and dtypes of df4 (leads_reasons_for_no_interest)
df4.info()

> Observe that the leads_reasons_for_no_interest dataset has null values in its columns.

> On closer inspection, these columns all describe why the lead did not show interest at a given stage (i.e. the reason for not being interested).

> In this scenario, it is advisable to combine the values of these three columns into a single column named 'reason_for_not_interested' and carry out further analysis.

In [None]:
# Turn missing reasons into empty strings in all three reason columns
reason_columns = [
    'reasons_for_not_interested_in_demo',
    'reasons_for_not_interested_to_consider',
    'reasons_for_not_interested_to_convert',
]
df4[reason_columns] = df4[reason_columns].fillna('')

In [None]:
# Concatenate the three stage-specific reason columns, row by row, into one new column
demo_reason = df4['reasons_for_not_interested_in_demo']
consider_reason = df4['reasons_for_not_interested_to_consider']
convert_reason = df4['reasons_for_not_interested_to_convert']
df4['Reason_for_not_interested'] = demo_reason + consider_reason + convert_reason

In [None]:
# The three stage-specific reason columns are now redundant; remove them
df4 = df4.drop(
    columns=[
        'reasons_for_not_interested_in_demo',
        'reasons_for_not_interested_to_consider',
        'reasons_for_not_interested_to_convert',
    ]
)

In [None]:
# Re-inspect df4 after the three stage-specific reason columns were merged into
# 'Reason_for_not_interested' and dropped.
df4.info()

In [None]:
# Number of df4 rows for each distinct Reason_for_not_interested value
df4.Reason_for_not_interested.value_counts()

> There are two values in 'Reason_for_not_interested' column that mean the same.

> One is 'Can't afford' and another is 'Cannot afford'

> Rename one of these values for more accurate results.

In [None]:
# Fold "Can't afford" into 'Cannot afford' so both spellings count as one reason.
# Assign the result back to the column: calling replace(inplace=True) on the
# selected column is chained in-place assignment, which pandas warns about and
# which has no effect on df4 once Copy-on-Write is enabled.
df4['Reason_for_not_interested'] = df4['Reason_for_not_interested'].replace(
    {"Can't afford": 'Cannot afford'}
)

### Dataframe df5

In [None]:
# Column names, non-null counts and dtypes of df5 (sales_managers_assigned_leads_details)
df5.info()

In [None]:
# Remove the assigned_date column, then show the resulting (rows, columns)
df5 = df5.drop(columns=['assigned_date'])
df5.shape

In [None]:
# Removing the leads flagged as outliers in df1 and df2 from df5.
# Match on lead_id instead of row index labels: df2 has fewer rows than df5, so
# a df2 index label does not point at the same lead in df5, and dropping by
# label removed unrelated leads. This also handles any number of flagged rows.
# NOTE(review): assumes df5 carries a lead_id column, as df1 and df2 do -- confirm.
outlier_lead_ids = pd.concat([df1_outlier['lead_id'], df2_outlier['lead_id']])
df5 = df5[~df5['lead_id'].isin(outlier_lead_ids)]

In [None]:
# Report (rows, columns) for each dataset after cleaning
print('SHAPE OF DATASETS \n')
for dataset_label, frame in (
    ('leads_basic_details: ', df1),
    ('leads_demo_watched_details', df2),
    ('leads_interaction_details', df3),
    ('leads_reasons_for_no_interest', df4),
    ('sales_managers_assigned_leads_details', df5),
):
    print(dataset_label, frame.shape)

In [None]:
# Inner-join df1 with df3 on whatever column names the two frames share
L1 = df1.merge(df3)
L1.info()

In [None]:
# Inner-join the running result with df5 on whatever column names the two frames share
L2 = L1.merge(df5)
L2.info()

In [None]:
# Left-join df4 on lead_id so rows without a matching df4 entry are kept
L3 = L2.merge(df4, on='lead_id', how='left')
L3.info()

In [None]:
# Rows with no reason after the left join are labelled 'Shown Interest'
reason_filled = L3['Reason_for_not_interested'].fillna('Shown Interest')
L3['Reason_for_not_interested'] = reason_filled

In [None]:
# Re-check non-null counts of L3 after filling Reason_for_not_interested
L3.info()

In [None]:
# Left-join df2 on lead_id so rows without a matching df2 entry are kept
L4 = L3.merge(df2, on='lead_id', how='left')
L4.info()

In [None]:
# Rows with no language after the left join are labelled 'Info Unavailable'
language_filled = L4['language'].fillna('Info Unavailable')
L4['language'] = language_filled

In [None]:
# Rows with no watched_percentage after the left join get 0.
# Fill with the number 0, not the string '0': a string fill value turns the
# numeric column into mixed object dtype and breaks later arithmetic and plots.
L4['watched_percentage'] = L4['watched_percentage'].fillna(0)

In [None]:
# Re-check non-null counts and dtypes of L4 after the fills
L4.info()

In [None]:
# Number of fully identical rows in the merged frame L4
L4.duplicated().sum()

In [None]:
# Rows in 'Conversion Stage' are marked as converted; every other row as yet to convert
reached_conversion = L4['lead_stage'].eq('Conversion Stage')
L4['Lead_Status'] = np.where(reached_conversion, 'Converted Lead', 'Yet to convert Lead')

In [None]:
# Preview the first rows of the merged frame L4
L4.head()

## Analysis

In [None]:
# Age distribution of leads.
# L4 can hold several rows per lead (one per interaction record merged in from
# df3), so keep one row per lead_id before counting; otherwise the
# 'No of Leads' axis counts interaction rows.
# The palette argument was dropped: seaborn ignores it when no hue is given.
unique_leads = L4.drop_duplicates(subset='lead_id')
sns.displot(unique_leads["age"])
plt.title('Age Distribution', fontsize=18)
plt.xlabel('Age', fontsize=16)
plt.ylabel('No of Leads', fontsize=16)

From the above plot it can be observed that the largest number of leads are 18 to 20 years old.

In [None]:
# Gender split of leads.
# L4 can hold several rows per lead (one per interaction record merged in from
# df3), so keep one row per lead_id before counting; otherwise leads with more
# interactions weigh more in the pie.
# y='lead_id' was dropped: it has no effect when plotting a Series.
unique_leads = L4.drop_duplicates(subset='lead_id')
ax = unique_leads.groupby('gender').size().plot(kind='pie', autopct='%.2f', ylabel='')
ax.set_title("Gender")
unique_leads["gender"].value_counts()

In [None]:
# Current-education split of leads.
# L4 can hold several rows per lead (one per interaction record merged in from
# df3), so keep one row per lead_id before counting; otherwise leads with more
# interactions weigh more in the pie.
# y='lead_id' was dropped: it has no effect when plotting a Series.
unique_leads = L4.drop_duplicates(subset='lead_id')
ax = unique_leads.groupby('current_education').size().plot(kind='pie', autopct='%.2f', ylabel='')
ax.set_title("Current Education")

> From the above chart we observe majority of leads are pursuing higher education or are in search of job.

In [None]:
# Lead-generation-source split of leads.
# L4 can hold several rows per lead (one per interaction record merged in from
# df3), so keep one row per lead_id before counting; otherwise leads with more
# interactions weigh more in the pie.
# y='lead_id' was dropped: it has no effect when plotting a Series.
# NOTE(review): percentages quoted in the narrative were read off the
# per-interaction chart; re-check them against this per-lead chart.
unique_leads = L4.drop_duplicates(subset='lead_id')
ax = unique_leads.groupby('lead_gen_source').size().plot(kind='pie', autopct='%.2f', ylabel='')
ax.set_title("Lead Source")

Around 24% of the leads came to know about the firm through social media. Other major sources contributing are SEO and email_marketing which contribute around 20% each.

In [None]:
# Histogram with a KDE curve of how much of the demo video was watched (df2).
# The palette argument was dropped: seaborn ignores it when no hue is given.
# The x-axis label typo ('Lenght') is corrected.
sns.displot(df2['watched_percentage'], kde=True)
plt.title('Watched Percentage', fontsize=18)
plt.xlabel('Demo Video Length in %', fontsize=16)
plt.ylabel('No of Leads watching video', fontsize=16)

From the above graph we can say that more than 35 leads watched around 70-80% of the video, and around 10 leads watched the whole demo video. The shape of the curve shows that the majority of leads watched more than 40% of the demo video.

In [None]:
# Inner-join demo-watch records (df2) with basic lead details (df1) on lead_id, then preview 5 rows
merged_leads_demo = df2.merge(df1, on='lead_id', how='inner')
merged_leads_demo.head(5)

In [None]:
# Row count for each (language, current_city) combination in merged_leads_demo
merged_leads_demo.groupby(['language', 'current_city']).size()

English language is dominant while Telugu is mainly being used in Hyderabad and Vishakapatnam

In [None]:
# Pie chart of merged_leads_demo row counts per demo language, labelled with percentages
language_counts = merged_leads_demo.groupby('language').size()
ax = language_counts.plot(kind='pie', autopct='%.2f', y='lead_id', ylabel='')
ax.set_title("Demo Language")

From the above pie chart, it can be seen that around 58% of leads prefer watching the demo video in English, making it the major language. About 32% of leads prefer watching the demo in Telugu, and the remaining 10% prefer Hindi.

In [None]:
# Inner-join demo-watch records (df2) with interaction records (df3) on lead_id and display the result
merged_interaction_demo = df2.merge(df3, on='lead_id', how='inner')
merged_interaction_demo

In [None]:
# Row count for each (call_status, call_reason) combination in merged_interaction_demo
merged_interaction_demo.groupby([ 'call_status','call_reason']).size()

**Sales managers need to focus on the earlier stages of the acquisition process. Work needs to be done at the primary stages to instill curiosity among the leads so that they watch the demo videos.**

In [None]:
# Summary of df4; for non-numeric columns describe() reports count, unique, top and freq
df4.describe()

From the above table it can be observed that there are 5 unique reasons for students not being interested, of which the most frequent is that they cannot afford the course. Leads are dropping out mainly because of affordability issues.

In [None]:
# Number of df5 rows for each (snr_sm_id, jnr_sm_id) pair
df5.groupby([ 'snr_sm_id','jnr_sm_id']).size()

In [None]:
# Histogram of df5 rows per senior sales manager id on a 12x8 figure.
# NOTE(review): the bars count df5 rows, which may be assigned leads rather than
# distinct junior managers -- confirm the y-axis label matches what is counted.
fig, ax = plt.subplots(figsize=(12, 8))
ax = sns.histplot(data=df5["snr_sm_id"], bins=4, color="skyblue", ax=ax)
ax.set(
    title="Average distribution Juniour Sales Managers to Senior Sales Managers",
    ylabel="No of Junior Sales Managers",
    xlabel="Senior Managers",
);

Number of Junior Sales Managers are almost evenly assigned among Senior Sales Managers. Here we have total of 4 senior Managers

In [None]:
# Percentage of leads acquired: first count the distinct leads that have at
# least one row whose call_reason is "successful_conversion".

is_successful_conversion = L4['call_reason'] == "successful_conversion"
acquired_leads = L4.loc[is_successful_conversion, 'lead_id'].nunique()
acquired_leads

In [None]:
# Number of distinct leads present in the merged frame
total_no_of_leads = L4['lead_id'].nunique()
total_no_of_leads

In [None]:
# Share of distinct leads that were acquired, expressed as a percentage
acquired_ratio = acquired_leads / total_no_of_leads
percent_of_leads_acquired = acquired_ratio * 100
print(percent_of_leads_acquired)

In [None]:
# Pie chart of acquired leads versus the rest.
# The second slice is total minus acquired, so it is labelled as the leads that
# were not acquired (it was previously mislabelled "total_no_of_leads").
no_of_leads_acquired = np.array([acquired_leads, total_no_of_leads - acquired_leads])
fig, ax = plt.subplots(figsize=(12, 6))
ax.pie(
    no_of_leads_acquired,
    labels=["acquired_leads", "not_acquired_leads"],
    autopct='%.2f',
    colors=["#D1D0CE", "#2B547E"],
    startangle=90,
)
ax.set_title("Percentage of Leads")

Approximately 18.08% (i.e. about 18%) of the leads were successfully acquired as customers.