In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Load and prepare data

df = pd.read_csv('data/course_lead_scoring.csv')

# Fill missing values: numeric columns get 0.0, every other column gets the
# literal string 'NA'.
# The branch asks "is this column numeric?" instead of "is its dtype object?"
# because text columns are not guaranteed to be reported as object (pandas can
# load them with a dedicated string dtype). Such a column must still receive
# 'NA' rather than 0.0.
for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].fillna(0.0)
    else:
        df[column] = df[column].fillna('NA')

In [3]:
# Question 1: Count values in 'industry'

# Number of rows per distinct 'industry' value.
industry_counts = df["industry"].value_counts()

print("Q1 - Value counts for 'industry':")
print(industry_counts, "\n")

Q1 - Value counts for 'industry':
industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64 



In [4]:
# Question 2: Find the most correlated numeric pair

# Pairwise correlations between the numeric feature columns (target excluded).
numeric_columns = df.select_dtypes(include=['float64', 'int64']).drop(columns=['converted'])
correlation_matrix = numeric_columns.corr()

# Rank pairs by absolute correlation so a strong negative relationship counts
# as much as a strong positive one, and so the reported pair and the reported
# value always come from the same matrix entry.
abs_correlation = correlation_matrix.abs()

# Keep only the entries strictly above the diagonal: this drops the trivial
# self-correlations and lists each unordered pair exactly once. Masking (rather
# than writing zeros through `.values`) leaves `correlation_matrix` intact and
# does not depend on `.values` being a writable view of the frame.
above_diagonal = np.triu(np.ones(abs_correlation.shape, dtype=bool), k=1)
pair_strengths = abs_correlation.where(above_diagonal).stack()

max_corr_pair = pair_strengths.idxmax()
max_corr_value = pair_strengths.max()

print("Q2 - Highest correlation:")
print(f"Pair: {max_corr_pair}")
print(f"Correlation: {max_corr_value:.3f}\n")

Q2 - Highest correlation:
Pair: ('annual_income', 'interaction_count')
Correlation: 0.027



In [5]:
# Split the data

# Target is the 'converted' column; the features are everything else.
y = df['converted']
X = df.drop(columns='converted')

# Step 1: hold out 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Step 2: take 25% of the remaining 80% as validation (0.25 * 0.8 = 20% of the
# full data), which leaves 60% for training.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=42, test_size=0.25)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}\n")

Training set size: 876
Validation set size: 293
Test set size: 293



In [6]:
# Question 3: Mutual information for categorical features

categorical = ['lead_source', 'industry', 'employment_status', 'location']

# Score every categorical column against the training target first, then
# report the scores in the same order as `categorical`.
mi_scores = {col: mutual_info_score(X_train[col], y_train) for col in categorical}

print("Q3 - Mutual Information Scores:")
for col, mi in mi_scores.items():
    print(f"{col:20s}: {mi:.2f}")
print()

Q3 - Mutual Information Scores:
lead_source         : 0.04
industry            : 0.01
employment_status   : 0.01
location            : 0.00



In [11]:
# Question 4: Model training (Logistic Regression)

# Feature set = the categorical columns plus every int64/float64 column of the
# training frame.
numerical = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
features = categorical + numerical

# Encode row dicts into a dense matrix. The vectorizer is fitted on the
# training rows only and then reused unchanged for the validation rows.
dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(X_train[features].to_dict(orient='records'))
X_val_encoded = dv.transform(X_val[features].to_dict(orient='records'))

model = LogisticRegression(C=1.0, solver='liblinear', random_state=42, max_iter=1000)
model.fit(X_train_encoded, y_train)

# Share of validation rows whose predicted class matches the true label.
val_acc = accuracy_score(y_val, model.predict(X_val_encoded))

print(f"Q4 - Validation Accuracy: {val_acc:.3f}\n")

Q4 - Validation Accuracy: 0.700



In [14]:
# Question 5: Feature elimination (Δaccuracy)


def _fit_and_score(feature_subset):
    """Train a logistic regression on `feature_subset` and score it on validation.

    Reads the cell-level `X_train`, `y_train`, `X_val` and `y_val`. A fresh
    DictVectorizer is fitted on the training rows and reused for the
    validation rows, so nothing is shared between calls.

    Parameters:
        feature_subset: list of column names to use as model inputs.

    Returns:
        Tuple `(accuracy, vectorizer, classifier, X_train_matrix, X_val_matrix)`
        where `accuracy` is the validation accuracy of the fitted classifier.
    """
    vectorizer = DictVectorizer(sparse=False)
    X_train_matrix = vectorizer.fit_transform(X_train[feature_subset].to_dict(orient='records'))
    X_val_matrix = vectorizer.transform(X_val[feature_subset].to_dict(orient='records'))

    classifier = LogisticRegression(solver='liblinear', C=1.0, max_iter=10000, random_state=42)
    classifier.fit(X_train_matrix, y_train)

    accuracy = accuracy_score(y_val, classifier.predict(X_val_matrix))
    return accuracy, vectorizer, classifier, X_train_matrix, X_val_matrix


# Baseline: all features. These names keep pointing at the full-feature
# artefacts after the cell finishes; the ablation loop below works only on
# values local to the helper and never rebinds them.
base_acc, dv, model, X_train_encoded, X_val_encoded = _fit_and_score(features)
y_pred = model.predict(X_val_encoded)
print(f"Q5 - Base accuracy: {base_acc:.4f}\n")

# Drop one feature at a time and record how far the validation accuracy moves
# from the baseline. abs() means an improvement and a degradation of the same
# size are treated as equally "useful".
diffs = {}
for f in features:
    remaining = [col for col in features if col != f]
    acc = _fit_and_score(remaining)[0]

    diffs[f] = abs(base_acc - acc)
    print(f"{f:20s}  Δaccuracy = {diffs[f]:.4f}")

# Smallest change wins; on a tie, min() keeps the feature listed first.
least_useful = min(diffs, key=diffs.get)
print(f"\nLeast useful feature: {least_useful}\n")

Q5 - Base accuracy: 0.6997

lead_source           Δaccuracy = 0.0034
industry              Δaccuracy = 0.0000
employment_status     Δaccuracy = 0.0034
location              Δaccuracy = 0.0102
number_of_courses_viewed  Δaccuracy = 0.1433
annual_income         Δaccuracy = 0.1536
interaction_count     Δaccuracy = 0.1433
lead_score            Δaccuracy = 0.0068

Least useful feature: industry



In [13]:
# Question 6: Regularization parameter tuning (C)

C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

# Fit one model per candidate C on the already-encoded training matrix and
# record its validation accuracy.
for C in C_values:
    model = LogisticRegression(C=C, solver='liblinear', random_state=42, max_iter=10000)
    model.fit(X_train_encoded, y_train)
    acc = accuracy_score(y_val, model.predict(X_val_encoded))
    accuracies[C] = acc
    print(f"C={C:<6}  Validation Accuracy = {acc:.3f}")

# max() returns the first maximal entry, so on equal accuracies the C that was
# tried first is the one reported.
best_C, best_acc = max(accuracies.items(), key=lambda item: item[1])

print(f"\nQ6 - Best C: {best_C}  (Accuracy = {best_acc:.3f})")

C=0.01    Validation Accuracy = 0.700
C=0.1     Validation Accuracy = 0.700
C=1       Validation Accuracy = 0.700
C=10      Validation Accuracy = 0.700
C=100     Validation Accuracy = 0.700

Q6 - Best C: 0.01  (Accuracy = 0.700)
