In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-15 14:03:42--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-15 14:03:42 (3.61 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [20]:
import pandas as pd

# Read the data from the CSV file
data = pd.read_csv('course_lead_scoring.csv')

# Display the first few rows of the dataframe
data.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [2]:
data['industry'].mode()[0]

'retail'

In [6]:
# Select only numerical columns
numerical_data = data.select_dtypes(include=['number', 'float64', 'int64'])

# Compute the correlation matrix for numerical features
correlation_matrix = numerical_data.corr()

# Extract the correlation values for the specified pairs
pairs = {
    "interaction_count and lead_score": correlation_matrix.loc["interaction_count", "lead_score"],
    "number_of_courses_viewed and lead_score": correlation_matrix.loc["number_of_courses_viewed", "lead_score"],
    "number_of_courses_viewed and interaction_count": correlation_matrix.loc["number_of_courses_viewed", "interaction_count"],
    "annual_income and interaction_count": correlation_matrix.loc["annual_income", "interaction_count"]
}

# Find the pair with the highest correlation
max_pair = max(pairs, key=pairs.get)
max_pair

'annual_income and interaction_count'

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

# Split the data into train, validation, and test sets (60%/20%/20%)
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Calculate mutual information scores for categorical variables in the training set
categorical_columns = ['industry', 'location', 'lead_source', 'employment_status']
mutual_info_scores = {}

for col in categorical_columns:
    mutual_info_scores[col] = round(mutual_info_score(train_data['converted'], train_data[col].astype(str)), 2)

# Find the variable with the highest mutual information score
max_mutual_info_var = max(mutual_info_scores, key=mutual_info_scores.get)
max_mutual_info_var, mutual_info_scores[max_mutual_info_var]

('lead_source', 0.03)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Create a pipeline with preprocessing and logistic regression
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='drop'
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
X_train = train_data.drop(columns=['converted'])


y_train = train_data['converted']
X_val = val_data.drop(columns=['converted'])
y_val = val_data['converted']

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the validation data
y_pred = pipeline.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy



0.6267123287671232