In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load data

data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [3]:
!wget $data wget

--2025-10-17 01:56:30--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.1’


2025-10-17 01:56:30 (53.7 MB/s) - ‘course_lead_scoring.csv.1’ saved [80876/80876]

--2025-10-17 01:56:30--  http://wget/
Resolving wget (wget)... failed: Name or service not known.
wget: unable to resolve host address ‘wget’
FINISHED --2025-10-17 01:56:30--
Total wall clock time: 0.07s
Downloaded: 1 files, 79K in 0.001s (53.7 MB/s)


In [4]:
df = pd.read_csv(data)

In [5]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


### Preparing the dataset

Use only the following columns:

'engine_displacement',
'horsepower',
'vehicle_weight',
'model_year',
'fuel_efficiency_mpg'


In [6]:
# Replace NaN with NA

df.isnull().sum()

Unnamed: 0,0
lead_source,128
industry,134
number_of_courses_viewed,0
annual_income,181
employment_status,100
location,63
interaction_count,0
lead_score,0
converted,0


In [7]:
df.dtypes

Unnamed: 0,0
lead_source,object
industry,object
number_of_courses_viewed,int64
annual_income,float64
employment_status,object
location,object
interaction_count,int64
lead_score,float64
converted,int64


In [8]:
cat_list = ['lead_source', 'industry',
       'employment_status', 'location']

num_list = ['number_of_courses_viewed', 'annual_income',
            'interaction_count', 'lead_score']

df[cat_list] = df[cat_list].fillna('NA')
df[num_list] = df[num_list].fillna(0)

In [9]:
df.isnull().sum()

Unnamed: 0,0
lead_source,0
industry,0
number_of_courses_viewed,0
annual_income,0
employment_status,0
location,0
interaction_count,0
lead_score,0
converted,0


### Question 1

What is the most frequent observation (mode) for the column industry?

- NA
- technology
- healthcare
- retail

In [10]:
df.industry.mode()

Unnamed: 0,industry
0,retail


### Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- interaction_count and lead_score
- number_of_courses_viewed and lead_score
- number_of_courses_viewed and interaction_count
- annual_income and interaction_count

In [11]:
df[num_list].corr() *100

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,100.0,0.977029,-2.356522,-0.4879
annual_income,0.977029,100.0,2.703647,1.560955
interaction_count,-2.356522,2.703647,100.0,0.988818
lead_score,-0.4879,1.560955,0.988818,100.0


### Split the data

- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value converted is not in your dataframe.

In [12]:
from sklearn.model_selection import train_test_split

help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
    Split arrays or matrices into random train and test subsets.

    Quick utility that wraps input validation,
    ``next(ShuffleSplit().split(X, y))``, and application to input data
    into a single call for splitting (and optionally subsampling) data into a
    one-liner.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        com

In [13]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [14]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

### Question 3

Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?

- industry
- location
- lead_source
- employment_status

In [15]:
from sklearn.metrics import mutual_info_score, accuracy_score

def mutual_info_convert_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [16]:
mi = df_full_train[cat_list].apply(mutual_info_convert_score)
mi.sort_values(ascending=False)

Unnamed: 0,0
lead_source,0.024562
employment_status,0.01269
industry,0.008173
location,0.001212


### Question 4
Now let's train a logistic regression.

What accuracy did you get?

- 0.64
- 0.74
- 0.84
- 0.94

In [17]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_train[cat_list + num_list].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[cat_list + num_list].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [18]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# solver='lbfgs' is the default solver in newer version of sklearn
model.fit(X_train, y_train)

In [19]:
y_pred = model.predict_proba(X_val)[:, 1]

converted_decision = (y_pred >= 0.5)

In [20]:
(y_val == converted_decision).mean().round(2)

np.float64(0.7)

### Question 5
- Let's find the least useful feature using the feature elimination technique.
- Train a model using the same features and parameters as in Q4 (without rounding).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?

- 'industry'
- 'employment_status'
- 'lead_score'

In [21]:
total_colums = df_train.columns.tolist()
total_colums

['lead_source',
 'industry',
 'number_of_courses_viewed',
 'annual_income',
 'employment_status',
 'location',
 'interaction_count',
 'lead_score']

In [22]:
original_accuracy = 0.7
# Now test removing each feature one by one
accuracy_differences = {}

for feature_to_remove in ['industry', 'employment_status', 'lead_score']:
    # Create feature set without the current feature
    features_without = [f for f in total_colums if f != feature_to_remove]

    # Train model without this feature
    train_dict = df_train[features_without].to_dict(orient='records')
    dv_without = DictVectorizer(sparse=False)
    X_train = dv_without.fit_transform(train_dict)

    val_dict = df_val[features_without].to_dict(orient='records')
    X_val = dv_without.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    converted_decision = (y_pred >= 0.5)
    accuracy_without = (y_val == converted_decision).mean()

    # Calculate the difference
    difference = original_accuracy - accuracy_without
    accuracy_differences[feature_to_remove] = difference

    print(f"Without '{feature_to_remove}':")
    print(f"  Accuracy: {accuracy_without:.4f}")
    print(f"  Difference from original: {difference:.4f}\n")

Without 'industry':
  Accuracy: 0.6997
  Difference from original: 0.0003

Without 'employment_status':
  Accuracy: 0.7031
  Difference from original: -0.0031

Without 'lead_score':
  Accuracy: 0.6997
  Difference from original: 0.0003



### Question 6
- Now let's train a regularized logistic regression.
- Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
- Train models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

In [23]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_train[cat_list + num_list].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[cat_list + num_list].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [26]:
C = [0.01, 0.1, 1, 10, 100]
for c in C:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    converted_decision = (y_pred >= 0.5)
    accuracy = (y_val == converted_decision).mean()
    rounded_accuracy = round(accuracy, 3)

    print(f"C = {c:5}: Accuracy = {rounded_accuracy}")

C =  0.01: Accuracy = 0.7
C =   0.1: Accuracy = 0.7
C =     1: Accuracy = 0.7
C =    10: Accuracy = 0.7
C =   100: Accuracy = 0.7
