In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings('ignore')


url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (1462, 9)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
print("Missing values before processing:")
print(df.isnull().sum())

data = df.copy()

categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()


if 'converted' in numerical_cols:
    numerical_cols.remove('converted')

print(f"\nCategorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

for col in categorical_cols:
    data[col] = data[col].fillna('NA')

for col in numerical_cols:
    data[col] = data[col].fillna(0.0)

print("\nMissing values after processing:")
print(data.isnull().sum())

Missing values before processing:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

Missing values after processing:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [4]:
# Cell 3: Question 1 - Most frequent observation for industry
industry_mode = data['industry'].mode()[0]
print(f"Question 1: Most frequent observation for 'industry': {industry_mode}")

Question 1: Most frequent observation for 'industry': retail


In [5]:
correlation_matrix = data[numerical_cols].corr()
print("Correlation Matrix:")
print(correlation_matrix)

corr_pairs = []
for i in range(len(numerical_cols)):
    for j in range(i+1, len(numerical_cols)):
        col1, col2 = numerical_cols[i], numerical_cols[j]
        corr_value = correlation_matrix.loc[col1, col2]
        corr_pairs.append(((col1, col2), abs(corr_value)))

corr_pairs_sorted = sorted(corr_pairs, key=lambda x: x[1], reverse=True)

print("\nTop correlated pairs:")
for (col1, col2), corr_value in corr_pairs_sorted[:5]:
    print(f"{col1} & {col2}: {corr_value:.4f}")

specific_pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

print("\nSpecific pairs mentioned in question:")
for col1, col2 in specific_pairs:
    if col1 in numerical_cols and col2 in numerical_cols:
        corr_value = correlation_matrix.loc[col1, col2]
        print(f"{col1} & {col2}: {corr_value:.4f}")

Correlation Matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  

Top correlated pairs:
annual_income & interaction_count: 0.0270
number_of_courses_viewed & interaction_count: 0.0236
annual_income & lead_score: 0.0156
interaction_count & lead_score: 0.0099
number_of_courses_viewed & annual_income: 0.0098

Specific pairs mentioned in question:
interaction_count & lead_score: 0.0099
number_of_courses_viewed 

In [6]:
X = data.drop('converted', axis=1)
y = data['converted']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 877 samples
Validation set: 292 samples
Test set: 293 samples


In [10]:
from sklearn.preprocessing import LabelEncoder

categorical_features = X_train[categorical_cols].copy()

categorical_encoded = categorical_features.copy()
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    categorical_encoded[col] = le.fit_transform(categorical_features[col].astype(str))
    label_encoders[col] = le

mi_scores = mutual_info_classif(categorical_encoded, y_train, random_state=42, discrete_features=True)
mi_results = pd.DataFrame({
    'feature': categorical_cols,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)

print("Mutual Information Scores (rounded to 2 decimals):")
mi_results['mi_score_rounded'] = mi_results['mi_score'].round(2)
print(mi_results[['feature', 'mi_score_rounded']])

max_mi_feature = mi_results.iloc[0]['feature']
max_mi_score = mi_results.iloc[0]['mi_score_rounded']
print(f"\nQuestion 3: Variable with biggest mutual information score: {max_mi_feature} (score: {max_mi_score})")

Mutual Information Scores (rounded to 2 decimals):
             feature  mi_score_rounded
0        lead_source              0.03
2  employment_status              0.02
1           industry              0.02
3           location              0.00

Question 3: Variable with biggest mutual information score: lead_source (score: 0.03)


In [11]:
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols)
X_val_encoded = pd.get_dummies(X_val, columns=categorical_cols)

X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

print(f"Training features after encoding: {X_train_encoded.shape}")
print(f"Validation features after encoding: {X_val_encoded.shape}")

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

y_val_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_val_pred)

print(f"\nQuestion 4: Accuracy on validation set: {accuracy:.2f}")

Training features after encoding: (877, 31)
Validation features after encoding: (292, 31)

Question 4: Accuracy on validation set: 0.74


In [12]:

original_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Original accuracy: {original_accuracy:.4f}")
features_to_test = ['industry', 'employment_status', 'lead_score']
accuracy_differences = {}

for feature in features_to_test:
    if feature in categorical_cols:

        cols_to_remove = [col for col in X_train_encoded.columns if col.startswith(feature + '_')]
        X_train_reduced = X_train_encoded.drop(cols_to_remove, axis=1)
        X_val_reduced = X_val_encoded.drop(cols_to_remove, axis=1)
    else:

        X_train_reduced = X_train_encoded.drop(feature, axis=1)
        X_val_reduced = X_val_encoded.drop(feature, axis=1)
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    y_val_pred_reduced = model_reduced.predict(X_val_reduced)
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)

    difference = original_accuracy - accuracy_reduced
    accuracy_differences[feature] = difference
    
    print(f"Without {feature}: Accuracy = {accuracy_reduced:.4f}, Difference = {difference:.4f}")

smallest_diff_feature = min(accuracy_differences.items(), key=lambda x: abs(x[1]))
print(f"\nQuestion 5: Feature with smallest difference: {smallest_diff_feature[0]} (difference: {smallest_diff_feature[1]:.4f})")

Original accuracy: 0.7432
Without industry: Accuracy = 0.7432, Difference = 0.0000
Without employment_status: Accuracy = 0.7466, Difference = -0.0034
Without lead_score: Accuracy = 0.7432, Difference = 0.0000

Question 5: Feature with smallest difference: industry (difference: 0.0000)


In [13]:
C_values = [0.01, 0.1, 1, 10, 100]
best_accuracy = 0
best_C = None

print("Regularized Logistic Regression Results:")
for C in C_values:
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train_encoded, y_train)
    
    y_val_pred_reg = model_reg.predict(X_val_encoded)
    accuracy_reg = accuracy_score(y_val, y_val_pred_reg)
    
    print(f"C = {C}: Accuracy = {accuracy_reg:.3f}")
    
    if accuracy_reg > best_accuracy:
        best_accuracy = accuracy_reg
        best_C = C

print(f"\nQuestion 6: Best C value: {best_C} (accuracy: {best_accuracy:.3f})")


Regularized Logistic Regression Results:
C = 0.01: Accuracy = 0.743
C = 0.1: Accuracy = 0.743
C = 1: Accuracy = 0.743
C = 10: Accuracy = 0.743
C = 100: Accuracy = 0.743

Question 6: Best C value: 0.01 (accuracy: 0.743)
