### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

### Dataset

In [2]:
# Course lead-scoring CSV, fetched over HTTPS from raw.githubusercontent.com (needs network access).
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Preview the first five rows to check the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


### Checking for any missing values

In [4]:
# Number of missing entries in each column (isnull is the pandas alias of isna).
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

### Missing values handling

In [5]:
# Imputation policy: categorical gaps become the literal 'NA', numeric gaps become 0.0.
cat_cols = df.select_dtypes(include=['object', 'category']).columns.to_list()

# 'converted' is numeric but it is the target, so it is kept out of the feature list.
num_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns.to_list()
    if col != 'converted'
]

print("Categorical columns:", cat_cols)
print("Numerical columns:", num_cols)

# Apply each fill value to its own group of columns (categoricals first, then numerics).
for group, fill_value in ((cat_cols, 'NA'), (num_cols, 0.0)):
    df[group] = df[group].fillna(fill_value)

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


### Question 1
What is the most frequent observation (mode) of the `industry` column?

In [6]:
# Q1: most frequent value(s) of 'industry'. Missing entries were already replaced by the
# string 'NA' in the imputation cell, so 'NA' competes like any other category here.
industry_mode = df['industry'].mode(dropna=False)
print("\nQ1 - industry mode(s):", industry_mode.to_list())

# mode() returns a Series because several values can tie; report the first entry.
industry_mode_value = industry_mode.iat[0]
print("Q1 answer (mode):", industry_mode_value)


Q1 - industry mode(s): ['retail']
Q1 answer (mode): retail


### Question 2
Compute the correlation matrix of the numerical features (pairwise correlations) and find the candidate pair with the strongest correlation.

In [7]:
# Q2: pairwise correlations between the numeric columns.
# select_dtypes(np.number) also picks up the 0/1 target 'converted'; that is harmless
# because only the candidate pairs listed below are reported.
num_df = df.select_dtypes(include=[np.number])

# Correlation matrix (DataFrame.corr uses Pearson by default), indexed by column name.
corr = num_df.corr()

# The four candidate pairs to compare.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Pairs whose columns are both present in the matrix. The report and the ranking
# share this list, so a missing column is reported instead of raising KeyError
# when the strongest pair is selected.
valid_pairs = [(a, b) for a, b in pairs if a in corr.columns and b in corr.columns]

print("\nQ2 - Correlations for the candidate pairs:")
for a, b in pairs:
    if (a, b) in valid_pairs:
        print(f"{a:30s} & {b:30s} : {corr.loc[a,b]:.4f}")
    else:
        print(f"{a} or {b} not found among numeric columns")

# Rank by absolute value so a strong negative correlation counts as strong.
if valid_pairs:
    best_pair = max(valid_pairs, key=lambda pair: abs(corr.loc[pair[0], pair[1]]))
    best_value = corr.loc[best_pair[0], best_pair[1]]
    print(f"\nQ2 answer → Strongest correlation is between {best_pair[0]} and {best_pair[1]} "
          f"({best_value:.4f})")
else:
    # Nothing to rank: keep the result names defined so later cells do not hit NameError.
    best_pair, best_value = None, float('nan')
    print("\nQ2 answer → none of the candidate pairs could be evaluated")



Q2 - Correlations for the candidate pairs:
interaction_count              & lead_score                     : 0.0099
number_of_courses_viewed       & lead_score                     : -0.0049
number_of_courses_viewed       & interaction_count              : -0.0236
annual_income                  & interaction_count              : 0.0270

Q2 answer → Strongest correlation is between annual_income and interaction_count (0.0270)


### Data Splitting

In [8]:
# 60/20/20 train/validation/test split, done in two stratified steps so every part
# keeps the class balance of 'converted'.
y = df['converted'].copy()
X = df.drop('converted', axis=1).copy()

# Step 1: hold out 40% of the rows; the remaining 60% becomes the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
# Step 2: cut the hold-out in half -> validation and test, each 20% of the total.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print("\nData split sizes (rows):")
for label, part in (("Train:", X_train), ("Val:  ", X_val), ("Test: ", X_test)):
    print(label, len(part))


Data split sizes (rows):
Train: 877
Val:   292
Test:  293


### Question 3

In [9]:
# Q3: mutual information between each categorical feature and 'converted',
# computed on the training rows only.
cat_cols_train = [
    col
    for col in X_train.columns
    if X_train[col].dtype == 'object' or str(X_train[col].dtype).startswith('category')
]
print("\nCategorical columns used for MI:", cat_cols_train)

# mutual_info_classif expects numeric input, so each column's labels are mapped to
# integer codes with pd.factorize (only the codes are kept, not the uniques).
X_train_cat_encoded = pd.DataFrame(
    {col: pd.factorize(X_train[col])[0] for col in cat_cols_train}
)

# discrete_features=True tells the estimator to treat the integer codes as categories.
mi = mutual_info_classif(
    X_train_cat_encoded.to_numpy(),
    y_train.to_numpy(),
    discrete_features=True,
    random_state=42,
)
mi_series = pd.Series(mi, index=cat_cols_train).sort_values(ascending=False)
mi_rounded = mi_series.round(2)
print("\nQ3 - Mutual information scores (training set only):")
print(mi_rounded)

# Scores for the answer options, listed in the order the question gives them.
candidates_q3 = ['industry','location','lead_source','employment_status']
print("\nQ3 - candidate scores:")
for name in candidates_q3:
    print(name, mi_rounded.get(name, None))


Categorical columns used for MI: ['lead_source', 'industry', 'employment_status', 'location']

Q3 - Mutual information scores (training set only):
lead_source          0.03
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

Q3 - candidate scores:
industry 0.01
location 0.0
lead_source 0.03
employment_status 0.01


### Question 4: 
Train logistic regression with one-hot encoding for categoricals, on training set; evaluate on validation set.

In [10]:

# Split the training columns into categorical (object / category dtype) and everything else.
all_cat_cols = [
    col
    for col in X_train.columns
    if str(X_train[col].dtype).startswith('category') or X_train[col].dtype == 'object'
]
all_num_cols = [col for col in X_train.columns if col not in all_cat_cols]

# One-hot encode the categorical columns; remainder='passthrough' forwards the other
# columns unchanged. handle_unknown='ignore' avoids an error on categories that
# appear only at predict time.
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[('ohe', one_hot, all_cat_cols)],
    remainder='passthrough',
)

# Baseline classifier for the later questions.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Chain preprocessing and classifier so both are fitted on the training data together.
pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
pipe.fit(X_train, y_train)

# Score on the validation split; acc_val is reused as the baseline in Question 5.
y_val_pred = pipe.predict(X_val)
acc_val = accuracy_score(y_val, y_val_pred)
acc_val_rounded = round(acc_val, 2)

print("\nQ4 - Validation accuracy (rounded to 2 decimals):", acc_val_rounded)
print("Q4 - Validation accuracy (full):", acc_val)



Q4 - Validation accuracy (rounded to 2 decimals): 0.68
Q4 - Validation accuracy (full): 0.6815068493150684


### Question 5

In [11]:

# Features whose removal is tested, one at a time.
candidates_q5 = ['industry', 'employment_status', 'lead_score']

# Reference point: validation accuracy of the Q4 model trained on all features.
baseline_acc = acc_val
print(f"\nBaseline validation accuracy: {baseline_acc:.6f}")

# Function to train & evaluate without a specific feature
def train_and_eval_without_feature(X_tr, y_tr, X_v, y_v, drop_feature):
    """Fit the one-hot + logistic-regression pipeline without one feature and score it.

    Parameters
    ----------
    X_tr, y_tr
        Training features (a DataFrame) and training labels.
    X_v, y_v
        Validation features and labels; ``X_v`` must contain the same columns
        that remain in ``X_tr`` after the drop.
    drop_feature
        Name of the column of ``X_tr`` to leave out.

    Returns
    -------
    float
        Accuracy of the fitted pipeline on the validation data.

    Raises
    ------
    ValueError
        If ``drop_feature`` is not a column of ``X_tr``. Without this check a
        misspelled name would be ignored, the model would train on every
        feature, and the result would look like "removing it changes nothing".
    """
    if drop_feature not in X_tr.columns:
        raise ValueError(
            f"drop_feature {drop_feature!r} is not a column of X_tr; "
            f"available columns: {list(X_tr.columns)}"
        )

    features_subset = [f for f in X_tr.columns if f != drop_feature]
    # Categorical = object or category dtype; every other column is passed through as-is.
    cat_cols_sub = [c for c in features_subset if X_tr[c].dtype == 'object' or str(X_tr[c].dtype).startswith('category')]

    pre = ColumnTransformer(transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols_sub)
    ], remainder='passthrough')

    # Same classifier settings as the Q4 baseline so the accuracies are comparable.
    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    p = Pipeline([('pre', pre), ('clf', clf)])
    p.fit(X_tr[features_subset], y_tr)

    yv_pred = p.predict(X_v[features_subset])
    return accuracy_score(y_v, yv_pred)

# Retrain once per candidate and record baseline minus ablated accuracy
# (a negative value means accuracy went up without the feature).
differences = {}
for dropped in candidates_q5:
    acc_dropped = train_and_eval_without_feature(X_train, y_train, X_val, y_val, dropped)
    differences[dropped] = baseline_acc - acc_dropped
    print(f"Removing '{dropped}': val_acc={acc_dropped:.6f}, diff={differences[dropped]:.6f}")

print("\nQ5 - Differences (baseline - without_feature):")
print(differences)

# Pick the entry with the smallest signed difference (first one wins on ties).
# NOTE(review): because the sign is kept, a feature whose removal raises accuracy
# ranks below one whose removal changes nothing; confirm that the signed (not the
# absolute) difference is what the question intends.
least_useful = min(differences.items(), key=lambda item: item[1])[0]
print(f"\nQ5 answer → Least useful feature: '{least_useful}' (smallest difference)")



Baseline validation accuracy: 0.681507
Removing 'industry': val_acc=0.688356, diff=-0.006849
Removing 'employment_status': val_acc=0.681507, diff=0.000000
Removing 'lead_score': val_acc=0.674658, diff=0.006849

Q5 - Differences (baseline - without_feature):
{'industry': -0.006849315068493178, 'employment_status': 0.0, 'lead_score': 0.006849315068493067}

Q5 answer → Least useful feature: 'industry' (smallest difference)


### Question 6: 
Different regularization strengths (C values)

In [12]:
# Q6: sweep the logistic-regression C parameter and compare validation accuracy.
Cs = [0.01, 0.1, 1, 10, 100]
val_accs = {}

for C in Cs:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    # NOTE(review): `preprocessor` is the same object used by the Q4 pipeline and is
    # fitted again on every pass; the training data is identical each time, so results
    # are unaffected, but sklearn.base.clone would keep each run isolated.
    pipe = Pipeline([('pre', preprocessor), ('clf', model)])
    pipe.fit(X_train, y_train)
    acc = accuracy_score(y_val, pipe.predict(X_val))
    val_accs[C] = acc
    print(f"C={C}: validation accuracy = {acc:.6f} (rounded to 3 decimals → {round(acc,3)})")

# Winner = highest validation accuracy; when several C values tie, take the smallest.
best_acc = max(val_accs.values())
best_Cs = [c_value for c_value in Cs if val_accs[c_value] == best_acc]
chosen_C = min(best_Cs)

print("\nQ6 - Validation accuracies by C value:")
print(val_accs)
print(f"\nQ6 answer → Best C: {chosen_C} (validation accuracy = {best_acc:.6f})")


C=0.01: validation accuracy = 0.688356 (rounded to 3 decimals → 0.688)
C=0.1: validation accuracy = 0.681507 (rounded to 3 decimals → 0.682)
C=1: validation accuracy = 0.681507 (rounded to 3 decimals → 0.682)
C=10: validation accuracy = 0.681507 (rounded to 3 decimals → 0.682)
C=100: validation accuracy = 0.681507 (rounded to 3 decimals → 0.682)

Q6 - Validation accuracies by C value:
{0.01: 0.6883561643835616, 0.1: 0.6815068493150684, 1: 0.6815068493150684, 10: 0.6815068493150684, 100: 0.6815068493150684}

Q6 answer → Best C: 0.01 (validation accuracy = 0.688356)
