In [37]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the lead-scoring dataset straight from its public GitHub location
DATA_URL = (
    'https://raw.githubusercontent.com/alexeygrigorev/datasets/'
    'master/course_lead_scoring.csv'
)
df = pd.read_csv(DATA_URL)

# Preview the first rows
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [38]:
# Q1 What is the most frequent observation (mode) for the column industry?

# Replace missing values: numeric columns get 0.0, every other column gets the
# 'NA' placeholder. The branch is chosen with is_numeric_dtype rather than a
# comparison against 'object', so text columns are handled the same way
# whether pandas loads them as object or as its dedicated string dtype.
# NOTE: this updates df in place; re-running the cell is harmless because
# already-filled columns contain no missing values.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(0.0)
    else:
        df[col] = df[col].fillna('NA')

# Find the most frequent observation (mode) for 'industry'.
# mode() returns a Series of all tied modes; [0] takes the first one.
mode_industry = df['industry'].mode()[0]
print(mode_industry)

retail


In [39]:
# Q2 Create the correlation matrix for the numerical features of your dataset.
# In a correlation matrix, you compute the correlation coefficient between every pair of features.
# What are the two features that have the biggest correlation?

# Pairwise correlation, restricted to the numerical columns
corr_matrix = df.corr(numeric_only=True)

# Show the heading and the matrix, one per line
print("Correlation matrix:", corr_matrix, sep="\n")

Correlation matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  


In [40]:
# Feature pairs whose correlation strength is compared
candidate_pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

# Absolute correlation per pair, keyed by a readable "<a> and <b>" label
pairs = {
    f"{first} and {second}": abs(corr_matrix.loc[first, second])
    for first, second in candidate_pairs
}

# Report every pair with four decimals
print("\nSelected Pairs Correlation:")
for label, strength in pairs.items():
    print(f"{label}: {strength:.4f}")

# The label with the largest absolute correlation is the cell's output
max_pair = max(pairs, key=pairs.get)
max_pair


Selected Pairs Correlation:
interaction_count and lead_score: 0.0099
number_of_courses_viewed and lead_score: 0.0049
number_of_courses_viewed and interaction_count: 0.0236
annual_income and interaction_count: 0.0270


'annual_income and interaction_count'

In [41]:
# Work on a copy so the frame loaded above is left as it is
df_full = df.copy()

# 'converted' is the target; every other column is a feature
X = df_full.drop(columns='converted')
y = df_full['converted']

# First hold out 40% of the rows, then cut that holdout in half to obtain
# the validation and test sets (same seed for both splits)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Row counts of the three partitions
tuple(len(part) for part in (X_train, X_val, X_test))

(877, 292, 293)

In [42]:
# Q3 Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only.
# Round the scores to 2 decimals using round(score, 2).

# Categorical columns are the non-numeric ones. Selecting by "not a number"
# works whether text columns are stored as object or as a string dtype.
cat_cols = X_train.select_dtypes(exclude='number').columns

# Label-encode each categorical column into integer codes. The result gets its
# own name so it cannot be confused with the one-hot matrix X_train_enc that
# the Q4 cell builds.
X_train_cat_codes = X_train[cat_cols].apply(lambda col: col.astype('category').cat.codes)

# Mutual information between each encoded column and the target.
# mutual_info_classif is imported from sklearn.feature_selection in the
# notebook's import cell; discrete_features=True tells it to treat the
# integer codes as categories rather than continuous values.
mi_scores = mutual_info_classif(
    X_train_cat_codes, y_train, discrete_features=True, random_state=42
)

# Tabulate the scores, rounded to 2 decimals as the question asks
mi_df = pd.DataFrame({'Feature': cat_cols, 'MI_Score': mi_scores})
mi_df['MI_Score'] = mi_df['MI_Score'].round(2)

# Display results, highest score first
print(mi_df.sort_values(by='MI_Score', ascending=False))

             Feature  MI_Score
0        lead_source      0.03
1           industry      0.02
2  employment_status      0.02
3           location      0.00


In [43]:
# Q4 Now let's train a logistic regression.
# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
# What accuracy did you get?

# DictVectorizer one-hot encodes string values and passes numbers through
dv = DictVectorizer(sparse=False)

# One record (dict) per row, for both the training and validation features
train_dicts, val_dicts = (
    part.to_dict(orient='records') for part in (X_train, X_val)
)

# Learn the feature layout on the training records only, then reuse it
X_train_enc = dv.fit_transform(train_dicts)
X_val_enc = dv.transform(val_dicts)


In [44]:
# Build the logistic regression with the prescribed parameters and fit it
# on the encoded training matrix (fit returns the estimator itself)
model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_train_enc, y_train)

# Hard class predictions for the validation rows
y_pred = model.predict(X_val_enc)

# Share of validation rows predicted correctly, shown with 2 decimals
val_accuracy = accuracy_score(y_val, y_pred)
print(round(val_accuracy, 2))

0.74


In [45]:
# Q5 Let's find the least useful feature using the feature elimination technique.
# Train a model using the same features and parameters as in Q4 (without rounding).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

def validation_accuracy(train_df, val_df, train_target, val_target):
    """Fit the Q4 model on train_df and return its accuracy on val_df.

    Both frames are one-hot encoded with a fresh DictVectorizer fitted on
    train_df only, then a LogisticRegression with the Q4 parameters is
    trained and scored.

    Parameters:
        train_df: feature DataFrame used for fitting.
        val_df: feature DataFrame used for scoring (same columns as train_df).
        train_target: labels aligned with train_df.
        val_target: labels aligned with val_df.

    Returns:
        The unrounded accuracy on val_df as returned by accuracy_score.
    """
    vectorizer = DictVectorizer(sparse=False)
    train_matrix = vectorizer.fit_transform(train_df.to_dict(orient='records'))
    val_matrix = vectorizer.transform(val_df.to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(train_matrix, train_target)
    return accuracy_score(val_target, clf.predict(val_matrix))


# Baseline: accuracy with every feature present. It is computed here, in this
# cell, so the comparison below does not rely on a variable left over from
# another cell.
base_acc = validation_accuracy(X_train, X_val, y_train, y_val)

features_to_check = ['industry', 'employment_status', 'lead_score']

# For each candidate, retrain without that single column and record how much
# accuracy is lost (positive) or gained (negative) relative to the baseline.
diffs = {
    feature: base_acc - validation_accuracy(
        X_train.drop(columns=[feature]),
        X_val.drop(columns=[feature]),
        y_train,
        y_val,
    )
    for feature in features_to_check
}

# Display results
for f, d in diffs.items():
    print(f"{f}: {d:.4f}")

industry: 0.0000
employment_status: -0.0034
lead_score: 0.0000


In [46]:
# Q6 Now let's train a regularized logistic regression.
# Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
# Train models using all the features as in Q4.
# Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
# Which of these C leads to the best accuracy on the validation set?

# Candidate values for C and the rounded validation accuracy found for each
C_values = [0.01, 0.1, 1, 10, 100]
acc_scores = {}

for c in C_values:
    # Same setup as Q4; only C changes between iterations
    model = LogisticRegression(
        solver='liblinear', C=c, max_iter=1000, random_state=42
    ).fit(X_train_enc, y_train)

    # Score on the validation matrix and keep three decimals
    y_pred = model.predict(X_val_enc)
    acc = accuracy_score(y_val, y_pred)
    acc_scores[c] = round(acc, 3)

# Mapping of C -> rounded validation accuracy
acc_scores

{0.01: 0.743, 0.1: 0.743, 1: 0.743, 10: 0.743, 100: 0.743}