In [80]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [81]:
df = pd.read_csv('course_lead_scoring.csv')

In [82]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [83]:
len(df)

1462

In [84]:
df.head().T

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [85]:
df.isnull().any()

lead_source                  True
industry                     True
number_of_courses_viewed    False
annual_income                True
employment_status            True
location                     True
interaction_count           False
lead_score                  False
converted                   False
dtype: bool

In [86]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [87]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [88]:
df['industry'].unique()

array([nan, 'retail', 'healthcare', 'education', 'manufacturing',
       'technology', 'other', 'finance'], dtype=object)

In [89]:
for n in categorical:
    df[n] = df[n].fillna('NA')

In [90]:
for n in numerical:
    df[n] = df[n].fillna(0.0)

In [91]:
df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [92]:
# Question 1 
# What is the most frequent observation (mode) for the column industry?

In [93]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [94]:
# Question 2 
# Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.
# What are the two features that have the biggest correlation?

In [95]:
df[['interaction_count', 'number_of_courses_viewed' ]].corrwith(df.lead_score).to_frame('correlation')

Unnamed: 0,correlation
interaction_count,0.009888
number_of_courses_viewed,-0.004879


In [96]:
df[['number_of_courses_viewed', 'annual_income']].corrwith(df.interaction_count).to_frame('correlation')

Unnamed: 0,correlation
number_of_courses_viewed,-0.023565
annual_income,0.027036


In [97]:
# Split the data
# Split your data in train/val/test sets with 60%/20%/20% distribution.
# Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
# Make sure that the target value y is not in your dataframe.

In [98]:
from sklearn.model_selection import train_test_split

In [99]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [100]:
len(df_train_full), len(df_test)

(1169, 293)

In [101]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [102]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [103]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [105]:
y_train = df_train.converted.values
y_val = df_val.converted.values

In [None]:
# Question 3
# Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
# Round the scores to 2 decimals using round(score, 2).

In [106]:
from sklearn.metrics import mutual_info_score

In [107]:
def calculate_mi(series):
    return mutual_info_score(series, df_train.converted)

df_mi = df_train[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


display(df_mi.head())

Unnamed: 0,MI
lead_source,0.035396
employment_status,0.012938
industry,0.011575
location,0.004464


In [108]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [109]:
# Question 4
# Now let's train a logistic regression.
# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [110]:
from sklearn.feature_extraction import DictVectorizer

In [111]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [112]:
train_dict[0]

{'lead_source': 'paid_ads',
 'industry': 'retail',
 'employment_status': 'student',
 'location': 'middle_east',
 'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'interaction_count': 5,
 'lead_score': 0.03}

In [113]:
dv = DictVectorizer(sparse=False)
#When initializing a DictVectorizer in scikit-learn, 
#setting the sparse parameter to False instructs the vectorizer to return a dense NumPy array as output, 
#rather than a sparse SciPy matrix.

In [114]:
dv.fit(train_dict)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [115]:
# The categorical values are hot encoded to 0 or 1
# The numerical values will remain intact
# dv.fit_transform combines fit and transform

# Feature matrix for train data
X_train = dv.transform(train_dict)

In [116]:
X_train.shape

(876, 31)

In [117]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [118]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')

In [119]:
# Feature matrix for validation data
X_val = dv.transform(val_dict)

In [120]:
from sklearn.linear_model import LogisticRegression

In [121]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [122]:
y_pred = model.predict_proba(X_val)[:, 1]

In [123]:
convert_decision = (y_pred >= 0.5).astype(int)

In [124]:
(y_val == convert_decision).mean().round(2)

np.float64(0.7)

In [125]:
from sklearn.metrics import accuracy_score

In [126]:
accuracy = accuracy_score(y_val, convert_decision)

In [127]:
accuracy

0.6996587030716723

In [128]:
# Question 5
# Let's find the least useful feature using the feature elimination technique.
# Train a model using the same features and parameters as in Q4 (without rounding).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
# Which of following feature has the smallest difference?

# 'industry'
# 'employment_status'
# 'lead_score'
# Note: The difference doesn't have to be positive.

In [131]:
accuracy_with_all_features = (y_val == convert_decision).mean()
accuracy_with_all_features

np.float64(0.6996587030716723)

In [133]:
def calc_accuracy_without_feature(feat):
    
    df_train_without_feature = df_train.loc[:, df_train.columns != feat]
    df_val_without_feature   = df_val.loc[:, df_val.columns != feat]

    
    train_dict_without_feature = df_train_without_feature.to_dict(orient='records')
    dv.fit(train_dict_without_feature) 
    X_train_without_feature = dv.transform(train_dict_without_feature)

    val_dict_without_feature = df_val_without_feature.to_dict(orient='records')
    X_val_without_feature = dv.transform(val_dict_without_feature)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_without_feature, y_train)

    y_pred_without_feature = model.predict_proba(X_val_without_feature)[:, 1]
    convert_decision = (y_pred_without_feature >= 0.5).astype(int)
    
    return accuracy_score(y_val, convert_decision)






In [135]:
features_to_be_removed = ['industry', 'employment_status', 'lead_score']
results = {}

for feature in features_to_be_removed:
    accuracy_without_feature = calc_accuracy_without_feature(feature)
    difference = accuracy_with_all_features - accuracy_without_feature
    results[feature] = {
        'accuracy': accuracy_without_feature,
        'difference': difference
    }
    print(f"Without {feature}: Accuracy = {accuracy_without_feature:.6f}, Difference = {difference:.6f}")

# Find the feature with the smallest difference
min_feature = min(results, key=lambda x: abs(results[x]['difference']))
print(f"\nFeature with smallest impact: {min_feature}, Difference: {results[min_feature]['difference']:.6f}")


Without industry: Accuracy = 0.699659, Difference = 0.000000
Without employment_status: Accuracy = 0.696246, Difference = 0.003413
Without lead_score: Accuracy = 0.706485, Difference = -0.006826

Feature with smallest impact: industry, Difference: 0.000000
