<a href="https://colab.research.google.com/github/Aderonke25/Machine-Learning-Zoomcamp-2025/blob/main/DataTalksModule3(Classification).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import Ridge
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

In [3]:
df = pd.read_csv(url)

In [4]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Replace missing values:
# Categorical features → 'NA'
# Numerical features → 0.0
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna('NA')
    else:
        df[col] = df[col].fillna(0.0)

# Verify no missing values remain
print("\nMissing values after replacement:")
print(df.isnull().sum())

Missing values per column:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

Missing values after replacement:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


#Question 1


In [6]:
# Find the mode (most frequent value) for the 'industry' column
mode_value = df['industry'].mode()[0]

print("Most frequent observation (mode) for 'industry':", mode_value)

Most frequent observation (mode) for 'industry': retail


#Question 2

In [7]:
corr_matrix = df.corr(numeric_only=True)
print(corr_matrix)


                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  


#Question 3

In [8]:
# Separate features (X) and target (y)
X = df.drop(columns=['converted'])
y = df['converted']

# First split: 60% train, 40% temp (val + test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Second split: 20% val, 20% test (split the remaining 40%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Print shapes to confirm the split
print("Train set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

Train set: (877, 8) (877,)
Validation set: (292, 8) (292,)
Test set: (293, 8) (293,)


In [9]:
# Select only categorical features from the training set
categorical_features = X_train.select_dtypes(include=['object']).columns
X_train_cat = X_train[categorical_features]

# Convert categorical variables to numeric using label encoding
from sklearn.preprocessing import LabelEncoder

X_train_encoded = X_train_cat.copy()
for col in X_train_encoded.columns:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train_encoded[col])

# Compute mutual information between categorical features and target
mi_scores = mutual_info_classif(X_train_encoded, y_train, random_state=42)

# Create a DataFrame for better display
mi_df = pd.DataFrame({
    'Feature': categorical_features,
    'Mutual Information Score': [round(score, 2) for score in mi_scores]
}).sort_values(by='Mutual Information Score', ascending=False)

print(mi_df)

             Feature  Mutual Information Score
0        lead_source                      0.04
1           industry                      0.03
2  employment_status                      0.02
3           location                      0.02


#Question 4

In [11]:
# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
numeric_cols = X_train.select_dtypes(exclude=['object']).columns

# Preprocess: One-hot encode categorical features, pass numerical as-is
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # keep numerical features unchanged
)

# Define the logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Build the pipeline (preprocessing + model)
clf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('model', model)])

# Fit on the training data
clf.fit(X_train, y_train)

# Predict on the validation data
y_pred = clf.predict(X_val)

# Calculate accuracy
val_accuracy = accuracy_score(y_val, y_pred)
val_accuracy = round(val_accuracy, 2)

print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.74


#Question 5

In [12]:
# Step 1: Train the baseline model using all features
clf.fit(X_train, y_train)
base_acc = accuracy_score(y_val, clf.predict(X_val))
print("Base accuracy:", base_acc)

# Step 2: Evaluate by removing each feature
feature_diffs = {}

for feature in X_train.columns:
    # Drop one feature
    X_train_reduced = X_train.drop(columns=[feature])
    X_val_reduced = X_val.drop(columns=[feature])

    # Define same preprocessing and model
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'),
             X_train_reduced.select_dtypes(include=['object']).columns)
        ],
        remainder='passthrough'
    )

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf_reduced = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # Fit and evaluate
    clf_reduced.fit(X_train_reduced, y_train)
    acc = accuracy_score(y_val, clf_reduced.predict(X_val_reduced))

    # Record difference
    feature_diffs[feature] = base_acc - acc

# Step 3: Sort features by smallest difference
sorted_diffs = sorted(feature_diffs.items(), key=lambda x: x[1])
for f, d in sorted_diffs:
    print(f"{f}: {d:.4f}")


Base accuracy: 0.7431506849315068
annual_income: -0.1130
employment_status: -0.0034
industry: 0.0000
location: 0.0000
lead_score: 0.0000
lead_source: 0.0137
number_of_courses_viewed: 0.0651
interaction_count: 0.0685


#Question 6


In [13]:
# List of C values to test
C_values = [0.01, 0.1, 1, 10, 100]

# Store results
results = []

for C in C_values:
    # Define model with the same preprocessing pipeline as before
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Train model
    clf.fit(X_train, y_train)

    # Predict and compute accuracy on validation set
    y_pred = clf.predict(X_val)
    acc = accuracy_score(y_val, y_pred)

    # Round accuracy to 3 decimal digits
    results.append((C, round(acc, 3)))

# Display results
for c, acc in results:
    print(f"C={c}: Validation Accuracy={acc}")

C=0.01: Validation Accuracy=0.743
C=0.1: Validation Accuracy=0.743
C=1: Validation Accuracy=0.743
C=10: Validation Accuracy=0.743
C=100: Validation Accuracy=0.743
