In [1]:
import pandas as pd
df=pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")

In [2]:
df.sample(5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
168,referral,finance,2,67321.0,employed,south_america,3,0.02,1
1255,social_media,technology,2,43833.0,unemployed,middle_east,3,0.28,0
1399,paid_ads,finance,3,70455.0,student,europe,3,0.23,1
898,social_media,healthcare,1,54955.0,employed,north_america,5,0.16,0
227,events,technology,2,53037.0,self_employed,,0,0.15,0


In [3]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [5]:
string_cols = df.select_dtypes(include=['object']).columns
df[string_cols]=df[string_cols].fillna("NA")

In [6]:
df["annual_income"]=df["annual_income"].fillna(0.0)

In [7]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [8]:
#Q1 What is the most frequent observation (mode) for the column industry?
df["industry"].mode()

0    retail
Name: industry, dtype: object

In [9]:
#Q2 Question Create the correlation matrix for the numerical features of your dataset. In a correlation matrix,What are the two features that have the biggest correlation?
numerical_col=df.select_dtypes(include=["number"])
corr_matrix=numerical_col.corr()
corr_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [10]:
from sklearn.model_selection import train_test_split
X=df.drop(columns=["converted"])
y=df["converted"]

In [11]:
X

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94
1,social_media,retail,1,46992.0,employed,south_america,1,0.80
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69
3,paid_ads,retail,2,83843.0,,australia,1,0.87
4,referral,education,3,85012.0,self_employed,europe,3,0.62
...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53
1458,referral,technology,3,65259.0,student,europe,2,0.24
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02
1460,referral,,5,71016.0,self_employed,north_america,0,0.25


In [12]:
X.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
dtype: object

In [13]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [14]:
#Q4
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import mutual_info_classif

categorical_cols = ['lead_source', 'industry', 'employment_status', 'location']
X_train_sub = X_train[categorical_cols].copy()
y_train_sub = y_train.copy()

# encode categorical features into integers
encoder = OrdinalEncoder(dtype=int)
X_encoded = encoder.fit_transform(X_train_sub)

# compute mutual information
mi_scores = mutual_info_classif(X_encoded, y_train_sub, discrete_features=True, random_state=42)

# round to 2 decimals
mi_scores_rounded = [round(score, 2) for score in mi_scores]

# pair results
mi_results = dict(zip(categorical_cols, mi_scores_rounded))

# display
print("Mutual Information Scores:")
for col, score in mi_results.items():
    print(f"{col}: {score}")

# find highest MI column
best_feature = max(mi_results, key=mi_results.get)
print(f"\n The feature with the highest mutual information is '{best_feature}' "
      f"with a score of {mi_results[best_feature]}")

Mutual Information Scores:
lead_source: 0.03
industry: 0.02
employment_status: 0.02
location: 0.0

 The feature with the highest mutual information is 'lead_source' with a score of 0.03


In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define categorical and numerical columns
categorical_cols = ['lead_source', 'industry', 'employment_status', 'location']
numeric_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough')

# Model Definition 
model = Pipeline([
    ('preproc', preprocessor),
    ('clf', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))])

# Train 
model.fit(X_train, y_train)

# Validate 
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("Validation accuracy (rounded to 2 decimals):", round(val_accuracy, 2))

Validation accuracy (rounded to 2 decimals): 0.74


In [16]:
#Q5
original_accuracy = accuracy_score(y_val, y_val_pred)
print("Original accuracy:", original_accuracy)

Original accuracy: 0.7431506849315068


In [17]:
def accuracy_without_feature(feature_to_remove):
    X_temp = X.drop(columns=[feature_to_remove])
    
    # Update categorical and numeric columns dynamically
    cat_temp = [col for col in categorical_cols if col != feature_to_remove]
    num_temp = [col for col in numeric_cols if col != feature_to_remove]
    
    # Redefine preprocessor and model
    preproc_temp = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_temp)],
        remainder='passthrough')
    
    model_temp = Pipeline([
        ('preproc', preproc_temp),
        ('clf', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))])
    
    X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(X_temp, y, test_size=0.25, random_state=42)
    model_temp.fit(X_train_t, y_train_t)
    y_pred_t = model_temp.predict(X_val_t)
    
    return accuracy_score(y_val_t, y_pred_t)

In [18]:
features_to_test = ['industry', 'employment_status', 'lead_score']
accuracy_diffs = {}

for f in features_to_test:
    acc = accuracy_without_feature(f)
    diff = acc - original_accuracy
    accuracy_diffs[f] = diff
    print(f"{f}: Accuracy without feature = {acc:.4f}, Difference = {diff:.4f}")

industry: Accuracy without feature = 0.7377, Difference = -0.0054
employment_status: Accuracy without feature = 0.7268, Difference = -0.0164
lead_score: Accuracy without feature = 0.7377, Difference = -0.0054


In [19]:
least_useful = min(accuracy_diffs, key=lambda k: abs(accuracy_diffs[k]))
print("\nLeast useful feature:", least_useful)


Least useful feature: industry


In [20]:
#Q6
# Try different C values 
C_values = [0.01, 0.1, 1, 10, 100]
results = {}

for c in C_values:
    model = Pipeline([
        ('preproc', preprocessor),
        ('clf', LogisticRegression(
            solver='liblinear', 
            C=c, 
            max_iter=1000, 
            random_state=42
        ))
    ])
    
    # Train
    model.fit(X_train, y_train)
    
    # Validate
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    results[c] = round(acc, 3)
    print(f"C={c}: Validation accuracy = {results[c]}")

# Find best C 
best_c = max(results, key=lambda c: (results[c], -c))
print("\nBest C:", best_c)

C=0.01: Validation accuracy = 0.743
C=0.1: Validation accuracy = 0.743
C=1: Validation accuracy = 0.743
C=10: Validation accuracy = 0.743
C=100: Validation accuracy = 0.743

Best C: 0.01
