In [2]:
!rm course_lead_scoring.csv

In [3]:
# Download the lead-scoring dataset CSV into the current working directory.
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-25 21:39:09--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-25 21:39:09 (57.3 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the downloaded CSV into the working DataFrame.
csv_path = 'course_lead_scoring.csv'
df = pd.read_csv(csv_path)

In [6]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [7]:
# Split the column names by dtype: object columns are treated as
# categorical, everything else as numerical.
object_mask = (df.dtypes == 'object').to_numpy()
categorical = df.columns[object_mask].tolist()
numerical = df.columns[~object_mask].tolist()

# 'converted' is the target, not a feature, so drop it from the numerical list.
numerical.remove('converted')

In [8]:
# Show the list of numerical feature columns.
numerical

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [9]:
# Missing values in categorical columns become the literal label 'NA'.
df[categorical] = df[categorical].fillna('NA')

# Missing values in numerical columns become 0.0.
df[numerical] = df[numerical].fillna(0.0)

In [10]:
# Count the remaining missing values per column; every count should be 0.
missing_per_column = df.isna().sum()
print(missing_per_column)

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [11]:
# Most frequent value of the 'industry' column (first entry if there are ties).
industry_modes = df['industry'].mode()
industry_modes.iloc[0]

'retail'

In [12]:
# Correlation matrix of the numerical feature columns.
df.loc[:, numerical].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [13]:
# Correlation matrix over all numerical features.
correlation_matrix = df[numerical].corr()

# Look up the four feature pairs the question asks about.
pair_1 = correlation_matrix.at['interaction_count', 'lead_score']
pair_2 = correlation_matrix.at['number_of_courses_viewed', 'lead_score']
pair_3 = correlation_matrix.at['number_of_courses_viewed', 'interaction_count']
pair_4 = correlation_matrix.at['annual_income', 'interaction_count']

# Print one line per pair, rounded to four decimals.
print("--- Correlations for Question 2 ---")
pair_report = (
    ('interaction_count', 'lead_score', pair_1),
    ('number_of_courses_viewed', 'lead_score', pair_2),
    ('number_of_courses_viewed', 'interaction_count', pair_3),
    ('annual_income', 'interaction_count', pair_4),
)
for first_feature, second_feature, corr_value in pair_report:
    print(f"'{first_feature}' and '{second_feature}': {corr_value:.4f}")

--- Correlations for Question 2 ---
'interaction_count' and 'lead_score': 0.0099
'number_of_courses_viewed' and 'lead_score': -0.0049
'number_of_courses_viewed' and 'interaction_count': -0.0236
'annual_income' and 'interaction_count': 0.0270


In [14]:
from sklearn.model_selection import train_test_split

# Features are the categorical + numerical columns; the target is 'converted'.
df_features = df[categorical + numerical]
df_target = df['converted']

# Step 1: hold out 20% of all rows as the test set; the remaining 80% is
# "train_full". random_state=42 keeps the split reproducible.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df_features, df_target, test_size=0.2, random_state=42
)

# Step 2: carve the validation set out of train_full. 25% of the remaining
# 80% is 20% of all rows, which leaves 60% of all rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=42
)

# Report the shape of every set to confirm the 60/20/20 proportions.
shape_report = [
    ("Full dataset shape: ", df_features),
    ("Training set shape:   ", X_train),
    ("Validation set shape: ", X_val),
    ("Test set shape:       ", X_test),
]
for label, frame in shape_report:
    print(f"{label}{frame.shape}")

Full dataset shape: (1462, 8)
Training set shape:   (876, 8)
Validation set shape: (293, 8)
Test set shape:       (293, 8)


In [15]:
from sklearn.metrics import mutual_info_score

# Mutual information between each categorical feature and the target,
# computed on the training split only. A higher score means the feature
# tells us more about the outcome.
print("--- Mutual Information Scores ---")
mi_by_feature = [
    (feature_name, mutual_info_score(X_train[feature_name], y_train))
    for feature_name in categorical
]
for feature_name, mi_value in mi_by_feature:
    print(f"'{feature_name}': {round(mi_value, 2)}")

--- Mutual Information Scores ---
'lead_source': 0.04
'industry': 0.01
'employment_status': 0.01
'location': 0.0


In [16]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# One-hot encoder; sparse=False makes it return a dense NumPy array.
dv = DictVectorizer(sparse=False)

# DictVectorizer consumes a list of {column: value} records, one per row.
# The encoder is fitted on the training rows only.
X_train_dicts = X_train.to_dict(orient='records')
X_train_encoded = dv.fit_transform(X_train_dicts)

# The validation rows are only transformed, reusing the mapping learned
# from the training rows.
X_val_dicts = X_val.to_dict(orient='records')
X_val_encoded = dv.transform(X_val_dicts)

# Build the model with the homework parameters and fit it in one step
# (fit returns the estimator itself).
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train_encoded, y_train)

# Accuracy on the validation split.
accuracy = model.score(X_val_encoded, y_val)

print(f"The accuracy on the validation dataset is: {round(accuracy, 2)}")

The accuracy on the validation dataset is: 0.7


In [17]:
def _validation_accuracy(train_df, val_df, train_target, val_target):
    """Encode, train and score one logistic-regression model.

    A fresh DictVectorizer is fitted on ``train_df`` so that the encoding
    matches exactly the columns present in the frame it is given.

    Parameters:
        train_df: DataFrame of training features.
        val_df: DataFrame of validation features (same columns as train_df).
        train_target: target values aligned with train_df.
        val_target: target values aligned with val_df.

    Returns:
        Accuracy of the fitted model on the validation data.
    """
    encoder = DictVectorizer(sparse=False)
    train_matrix = encoder.fit_transform(train_df.to_dict(orient='records'))
    val_matrix = encoder.transform(val_df.to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(train_matrix, train_target)
    return clf.score(val_matrix, val_target)


# Baseline accuracy with all features. It is computed here rather than read
# from the global `accuracy`, because that name is reassigned by the C-tuning
# loop; re-running this cell afterwards would otherwise use the wrong baseline.
original_accuracy = _validation_accuracy(X_train, X_val, y_train, y_val)

# The list of features we need to test
features_to_test = ['industry', 'employment_status', 'lead_score']

print(f"Original Accuracy: {original_accuracy:.4f}\n")

# Remove one feature at a time, retrain, and compare against the baseline
for feature in features_to_test:
    new_accuracy = _validation_accuracy(
        X_train.drop(columns=[feature]),
        X_val.drop(columns=[feature]),
        y_train,
        y_val,
    )
    difference = original_accuracy - new_accuracy

    print(f"Model without '{feature}':")
    print(f"  New Accuracy = {new_accuracy:.4f}")
    print(f"  Difference (Original - New) = {difference:.4f}\n")

Original Accuracy: 0.6997

Model without 'industry':
  New Accuracy = 0.6997
  Difference (Original - New) = 0.0000

Model without 'employment_status':
  New Accuracy = 0.6962
  Difference (Original - New) = 0.0034

Model without 'lead_score':
  New Accuracy = 0.7065
  Difference (Original - New) = -0.0068



In [18]:
# The list of C values (inverse regularization strength) to test
C_values = [0.01, 0.1, 1, 10, 100]

print("--- Tuning the 'C' parameter ---")

# Train and score one model per C value on the full-feature encoded data.
# Loop-specific names are used on purpose: reusing `model` / `accuracy` would
# overwrite the baseline results that the feature-elimination cell reads.
for C in C_values:
    tuned_model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    tuned_model.fit(X_train_encoded, y_train)

    # Accuracy on the validation set for this C
    tuned_accuracy = tuned_model.score(X_val_encoded, y_val)

    print(f"C = {C}: \t Accuracy = {round(tuned_accuracy, 4)}") # \t adds a tab for alignment

--- Tuning the 'C' parameter ---
C = 0.01: 	 Accuracy = 0.6997
C = 0.1: 	 Accuracy = 0.6997
C = 1: 	 Accuracy = 0.6997
C = 10: 	 Accuracy = 0.6997
C = 100: 	 Accuracy = 0.6997
