In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-13 18:10:51--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.4’


2025-10-13 18:10:51 (35.2 MB/s) - ‘course_lead_scoring.csv.4’ saved [80876/80876]



In [2]:
import pandas as pd

## Data preparation

In [3]:
# Read the downloaded lead-scoring CSV from the working directory into a DataFrame.
df = pd.read_csv("course_lead_scoring.csv")

In [4]:
# Overview of column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [5]:
# Number of missing values per column.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
# Label column; every other column is treated as a feature.
target_col = "converted"

# Classify by "is it numeric?" rather than "is the dtype exactly object?":
# pandas can store text in a dedicated string dtype, and an `== "object"` check
# would then send those text columns to the numerical list.
feature_cols = [col for col in df.columns if col != target_col]
numerical_cols = [col for col in feature_cols if pd.api.types.is_numeric_dtype(df[col])]
categorical_cols = [col for col in feature_cols if col not in numerical_cols]


In [7]:
# Display the feature columns classified as numerical.
numerical_cols

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [8]:
# Display the feature columns classified as categorical.
categorical_cols

['lead_source', 'industry', 'employment_status', 'location']

## Fill missing values

In [9]:
# Placeholder per feature type: 0 for numerical columns, the literal string "NA"
# for categorical ones. The target column is in neither list and is left untouched.
fill_values = {
    **dict.fromkeys(numerical_cols, 0),
    **dict.fromkeys(categorical_cols, "NA"),
}
for name, placeholder in fill_values.items():
    df[name] = df[name].fillna(placeholder)

In [10]:
# Re-count missing values per column to check the fill step.
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

## Question 1

In [11]:
# Frequency of each industry value; rows that were missing now appear as the "NA" category.
df["industry"].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [12]:
# Most frequent value(s) of the industry column, returned as a Series.
df["industry"].mode()

0    retail
Name: industry, dtype: object

Answer: retail

## Question 2

In [13]:
# Feature pairs whose absolute correlation is compared.
candidate_pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

print("Correlations:")
max_corr, feat1_max_corr, feat2_max_corr = 0, "", ""
for left, right in candidate_pairs:
    # Only the magnitude matters, so the sign is dropped.
    strength = abs(df[left].corr(df[right]))
    print(f" - {left} and {right}: {strength:.3f}")
    # Strict comparison: on a tie the earlier pair is kept.
    if strength > max_corr:
        max_corr, feat1_max_corr, feat2_max_corr = strength, left, right

print(f"\nAnswer: the two features with the biggest correlation are {feat1_max_corr} and {feat2_max_corr}.")

Correlations:
 - interaction_count and lead_score: 0.010
 - number_of_courses_viewed and lead_score: 0.005
 - number_of_courses_viewed and interaction_count: 0.024
 - annual_income and interaction_count: 0.027

Answer: the two features with the biggest correlation are annual_income and interaction_count.


## Split the data

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# 60/20/20 split: hold out 20% for test, then take 0.2/0.8 = 25% of the
# remaining 80% (i.e. 20% of the full data) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.2 / 0.8, random_state=42)

# Sanity check: the three parts together account for every row.
sum(map(len, (df_train, df_val, df_test))) == len(df)

True

In [16]:
# Copy the label column out of each split before it is removed from the feature frames.
y_full_train, y_train, y_val, y_test = (
    split[target_col].copy()
    for split in (df_full_train, df_train, df_val, df_test)
)

In [17]:
# Remove the label from every feature frame so it cannot be used as a feature.
for split in (df_full_train, df_train, df_val, df_test):
    del split[target_col]

## Question 3

In [18]:
from sklearn.metrics import mutual_info_score

In [19]:
def mutual_info_score_(series):
    """Return the mutual information between `series` and the training target.

    Reads the notebook-level `y_train`, so `series` must be aligned with the
    training split.
    """
    score = mutual_info_score(labels_true=series, labels_pred=y_train)
    return score

In [20]:
# Mutual information between each categorical feature and the training target,
# indexed by feature name.
mi = pd.Series({name: mutual_info_score_(df_train[name]) for name in categorical_cols})
mi.sort_values(ascending=False)

lead_source          0.035396
employment_status    0.012938
industry             0.011575
location             0.004464
dtype: float64

## Question 4

In [21]:
from sklearn.feature_extraction import DictVectorizer

In [22]:
# One dict per row for each split.
dicts_train = df_train.to_dict(orient="records")
val_dicts = df_val.to_dict(orient="records")

# Fit the encoder on the training records only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(dicts_train)
X_val = dv.transform(val_dicts)

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Baseline model on all features; `model_accuracy` is kept unrounded for later comparisons.
model = LogisticRegression(
    solver="liblinear",
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

y_pred = model.predict(X_val)
hits = y_val == y_pred
model_accuracy = hits.mean()
print(f"Model accuracy: {model_accuracy.round(2)}")

Model accuracy: 0.7


## Question 5

In [25]:
def accuracy_without(feature):
    """Return validation accuracy of the baseline model trained without `feature`.

    Uses the notebook-level `df_train`, `df_val`, `y_train` and `y_val`. The
    encoder, matrices and model are local, so the notebook-level `dv`,
    `X_train`, `X_val`, `model` and `y_pred` are left as the baseline cell
    defined them.

    Raises:
        ValueError: if `feature` is not a column of `df_train`.
    """
    if feature not in df_train.columns:
        raise ValueError(f"{feature!r} is not a column of df_train")
    kept_cols = [col for col in df_train.columns if col != feature]

    # Fit a fresh encoder on the reduced training records, then reuse it for validation.
    vectorizer = DictVectorizer()
    X_train_reduced = vectorizer.fit_transform(df_train[kept_cols].to_dict(orient="records"))
    X_val_reduced = vectorizer.transform(df_val[kept_cols].to_dict(orient="records"))

    # Same hyper-parameters as the baseline so that only the feature set differs.
    reduced_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    reduced_model.fit(X_train_reduced, y_train)
    return (y_val == reduced_model.predict(X_val_reduced)).mean()


features_to_eliminate = ["industry", "employment_status", "lead_score"]
accuracy_diffs = {}
for feat in features_to_eliminate:
    new_model_accuracy = accuracy_without(feat)
    print(f"Model accuracy without the feature '{feat}': {new_model_accuracy.round(4)}")
    # Compare against the unrounded baseline accuracy from the Question 4 cell.
    accuracy_diffs[feat] = abs(model_accuracy - new_model_accuracy)
    print(f"Absolute difference in model accuracy: {accuracy_diffs[feat].round(4)}")

print(f"\nFeature with smallest accuracy difference: {min(accuracy_diffs, key=accuracy_diffs.get)}")

Model accuracy without the feature 'industry': 0.6997
Absolute difference in model accuracy: 0.0
Model accuracy without the feature 'employment_status': 0.6962
Absolute difference in model accuracy: 0.0034
Model accuracy without the feature 'lead_score': 0.7065
Absolute difference in model accuracy: 0.0068

Feature with smallest accuracy difference: industry


## Question 6

In [26]:
# Rebuild the full-feature matrices so this cell does not depend on what earlier cells left behind.
dicts_train = df_train.to_dict(orient="records")
val_dicts = df_val.to_dict(orient="records")
dv = DictVectorizer()
X_train = dv.fit_transform(dicts_train)
X_val = dv.transform(val_dicts)

# Validation accuracy (rounded to 3 decimals) for each regularisation strength.
model_accuracies = {}
for C in (0.01, 0.1, 1, 10, 100):
    model = LogisticRegression(
        solver="liblinear", C=C, max_iter=1000, random_state=42
    ).fit(X_train, y_train)
    y_pred = model.predict(X_val)
    model_accuracies[C] = (y_val == y_pred).mean().round(3)

# max() returns the first maximal key, so ties go to the smallest C.
best_C = max(model_accuracies, key=model_accuracies.get)
print(f"\nC with best accuracy: {best_C}")


C with best accuracy: 0.01
