In [1]:
import pandas as pd
import numpy as np

# Remote CSV holding the lead-scoring data; it is fetched over HTTPS on every run.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumn info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the report.
df.info()

Dataset shape: (1462, 9)

First few rows:
    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  

Column info:
<class 'pandas.core.frame.DataFrame'>

In [2]:
# Number of missing entries in every column, shown under a one-line heading.
missing_per_column = df.isna().sum()
print("Missing values in each column:", missing_per_column, sep="\n")

Missing values in each column:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [3]:

# Partition the columns by dtype: object columns are treated as categorical,
# anything numpy considers a number as numerical.
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=[np.number]).columns

print("Categorical columns:", list(categorical_cols))
print("Numerical columns:", list(numerical_cols))

# Categorical gaps become the literal label 'NA'; columns without gaps are skipped
# so that only real replacements are logged.
for col in categorical_cols:
    if not df[col].isna().any():
        continue
    print(f"Filling missing values in categorical column '{col}' with 'NA'")
    df[col] = df[col].fillna('NA')

# Numerical gaps become 0.0, with the same skip-if-complete rule.
for col in numerical_cols:
    if not df[col].isna().any():
        continue
    print(f"Filling missing values in numerical column '{col}' with 0.0")
    df[col] = df[col].fillna(0.0)

# Grand total of remaining missing cells across the whole frame.
remaining_missing = df.isna().sum().sum()
print("\nMissing values after handling:")
print(remaining_missing)

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']
Filling missing values in categorical column 'lead_source' with 'NA'
Filling missing values in categorical column 'industry' with 'NA'
Filling missing values in categorical column 'employment_status' with 'NA'
Filling missing values in categorical column 'location' with 'NA'
Filling missing values in numerical column 'annual_income' with 0.0

Missing values after handling:
0


In [4]:
# Frequency table and most common value of the 'industry' column.
industry_series = df['industry']
industry_counts = industry_series.value_counts()
industry_mode = industry_series.mode()[0]

print("Value counts for 'industry' column:")
print(industry_counts)
print(f"\nThe most frequent observation (mode) for the column 'industry' is: {industry_mode}")

Value counts for 'industry' column:
industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

The most frequent observation (mode) for the column 'industry' is: retail


In [5]:
# ----------> Question 2

# Apply the missing-value policy ('NA' for categorical, 0.0 for numerical).
# fillna is a no-op on columns that have no gaps, so re-running is harmless.
for col in categorical_cols:
    df[col] = df[col].fillna('NA')
for col in numerical_cols:
    df[col] = df[col].fillna(0.0)

print("Numerical features in the dataset:")
print(list(numerical_cols))

correlation_matrix = df[numerical_cols].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Zero the diagonal through a boolean mask instead of writing into `.values`:
# under pandas Copy-on-Write that array is read-only and np.fill_diagonal raises.
diagonal_mask = np.eye(len(correlation_matrix), dtype=bool)
correlation_matrix_no_diag = correlation_matrix.mask(diagonal_mask, 0)

# Search the strict upper triangle once. The matrix is symmetric, so this visits
# every distinct pair exactly once and needs no float equality test or loop break.
# NOTE(review): numerical_cols is used as-is; if it still contains the target
# column, the "biggest correlation" pair may involve the target — confirm intent.
upper_rows, upper_cols = np.triu_indices(len(correlation_matrix.columns), k=1)
upper_abs = np.abs(correlation_matrix.to_numpy()[upper_rows, upper_cols])

if upper_abs.size == 0 or np.isnan(upper_abs).all():
    raise ValueError(
        "Need at least two numerical columns with a defined correlation "
        "to determine the most correlated pair"
    )

best_position = int(np.nanargmax(upper_abs))  # first maximum in row-major order
i, j = upper_rows[best_position], upper_cols[best_position]

max_corr = upper_abs[best_position]
max_corr_pair = (
    correlation_matrix.columns[i],
    correlation_matrix.columns[j],
    correlation_matrix.iloc[i, j],  # signed coefficient, not the absolute value
)

print(f"\nThe two features with the biggest correlation are: {max_corr_pair[0]} and {max_corr_pair[1]}")
print(f"Correlation coefficient: {max_corr_pair[2]:.4f}")

print("\nChecking specific pairs mentioned in question:")
pairs_to_check = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Pairs whose columns are not both numerical are skipped silently.
for pair in pairs_to_check:
    if pair[0] in numerical_cols and pair[1] in numerical_cols:
        corr_value = df[pair[0]].corr(df[pair[1]])
        print(f"{pair[0]} and {pair[1]}: {corr_value:.4f}")

Numerical features in the dataset:
['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']

Correlation Matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  

The two features wit

In [7]:
from sklearn.model_selection import train_test_split


In [8]:
# Target vector and predictor frame (everything except the target column).
y = df['converted']
X = df.drop(columns='converted')

# Stage 1: hold out 40% of the rows, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# Stage 2: halve the hold-out into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

n_total = len(df)
print("Data split completed:")
print(f"Training set: {len(X_train)} samples ({len(X_train)/n_total*100:.1f}%)")
print(f"Validation set: {len(X_val)} samples ({len(X_val)/n_total*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/n_total*100:.1f}%)")

# Sanity check: the target must not be present in any predictor frame.
print(f"\nTarget column 'converted' in X_train: {'converted' in X_train.columns}")
print(f"Target column 'converted' in X_val: {'converted' in X_val.columns}")
print(f"Target column 'converted' in X_test: {'converted' in X_test.columns}")

Data split completed:
Training set: 877 samples (60.0%)
Validation set: 292 samples (20.0%)
Test set: 293 samples (20.0%)

Target column 'converted' in X_train: False
Target column 'converted' in X_val: False
Target column 'converted' in X_test: False


In [9]:
# Class proportions of the target in the full data and in each split,
# printed under a heading per set.
print("Target distribution in each split:")
for heading, target in (
    ("\nFull dataset:", y),
    ("\nTraining set:", y_train),
    ("\nValidation set:", y_val),
    ("\nTest set:", y_test),
):
    print(heading)
    print(target.value_counts(normalize=True))

Target distribution in each split:

Full dataset:
converted
1    0.619015
0    0.380985
Name: proportion, dtype: float64

Training set:
converted
1    0.619156
0    0.380844
Name: proportion, dtype: float64

Validation set:
converted
1    0.619863
0    0.380137
Name: proportion, dtype: float64

Test set:
converted
1    0.617747
0    0.382253
Name: proportion, dtype: float64


In [10]:
from sklearn.metrics import mutual_info_score

categorical_vars = ['industry', 'location', 'lead_source', 'employment_status']

# Mutual information between each categorical variable and the target,
# computed on the training split only.
mi_scores_raw = {
    var: mutual_info_score(X_train[var], y_train) for var in categorical_vars
}

# Rounded copies are kept for display (and for any code that reads mi_scores).
mi_scores = {var: round(mi, 2) for var, mi in mi_scores_raw.items()}

print("Mutual Information Scores (rounded to 2 decimals):")
for var, score in mi_scores.items():
    print(f"{var}: {score}")

# Rank on the unrounded scores: rounding first can make different scores tie
# (e.g. two variables at 0.01), and the winner would then depend on dict order.
max_mi_var = max(mi_scores_raw, key=mi_scores_raw.get)
max_mi_score = mi_scores[max_mi_var]

print(f"\nThe variable with the biggest mutual information score is: {max_mi_var} (score: {max_mi_score})")

Mutual Information Scores (rounded to 2 decimals):
industry: 0.01
location: 0.0
lead_source: 0.03
employment_status: 0.01

The variable with the biggest mutual information score is: lead_source (score: 0.03)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

# Column groups are re-derived from the training predictors, so the target
# column is no longer part of either list.
categorical_cols = list(X_train.select_dtypes(include=['object']).columns)
numerical_cols = list(X_train.select_dtypes(include=[np.number]).columns)

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

# Numerical columns pass through untouched; categorical columns are one-hot
# encoded with the first level of each variable dropped.
one_hot = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', one_hot, categorical_cols),
    ]
)

# Fit on the training split only, then reuse the fitted transformer elsewhere.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

print(f"Training set shape after preprocessing: {X_train_processed.shape}")
print(f"Validation set shape after preprocessing: {X_val_processed.shape}")

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
Training set shape after preprocessing: (877, 27)
Validation set shape after preprocessing: (292, 27)


In [14]:
# Baseline classifier: liblinear logistic regression with C=1.0, fitted on the
# preprocessed training matrix and scored on the validation split.
baseline_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**baseline_params)
model.fit(X_train_processed, y_train)

y_val_pred = model.predict(X_val_processed)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_accuracy_rounded = round(val_accuracy, 2)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Accuracy (rounded to 2 decimals): {val_accuracy_rounded}")

Validation Accuracy: 0.6849
Validation Accuracy (rounded to 2 decimals): 0.68


In [18]:
# Reference score of the all-features model, taken from the current
# validation predictions.
original_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Original accuracy with all features: {original_accuracy:.6f}")

# Column labels of the processed matrix: the passthrough numerical columns
# first, followed by the names the fitted one-hot encoder generated.
ohe = preprocessor.named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)
feature_names = [*numerical_cols, *ohe_feature_names]

print(f"Total number of features: {len(feature_names)}")

Original accuracy with all features: 0.684932
Total number of features: 27


In [20]:
# Repeats the preceding cell's computation; every name is rebuilt from scratch,
# so running it again leaves the same values in place.
ohe = preprocessor.named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)

# Numerical names first, then the one-hot names, matching the transformer order.
feature_names = list(numerical_cols)
feature_names += list(ohe_feature_names)

original_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Original accuracy with all features: {original_accuracy:.6f}")
print(f"Total number of features: {len(feature_names)}")

Original accuracy with all features: 0.684932
Total number of features: 27


In [22]:
def train_without_feature(feature_to_exclude, original_feature_names):
    """Retrain the logistic regression without one original feature.

    Reads the module-level ``X_train_processed``, ``X_val_processed``,
    ``y_train``, ``y_val`` and ``categorical_cols``.

    Parameters
    ----------
    feature_to_exclude : str
        Name of an original (pre-encoding) column. For a categorical column,
        every processed column named ``"<feature>_<level>"`` is dropped; for
        any other name, only the processed column with exactly that name.
    original_feature_names : sequence of str
        Names of the processed matrix columns, in column order.

    Returns
    -------
    float
        Validation accuracy of a model trained on the remaining columns.

    Raises
    ------
    ValueError
        If the feature matches no processed column. Without this check the
        model would be trained on all columns and the result would look like
        "removing this feature changes nothing".
    """
    # Exact membership, not a substring test: a partial name such as "lead"
    # must not be classified as categorical just because "lead_source" is.
    is_categorical = feature_to_exclude in categorical_cols

    if is_categorical:
        # Match on the "<feature>_" prefix so unrelated columns that merely
        # contain the feature name somewhere in their own name are kept.
        prefix = f"{feature_to_exclude}_"
        cols_to_exclude = [f for f in original_feature_names if f.startswith(prefix)]
    else:
        cols_to_exclude = [feature_to_exclude]

    excluded = set(cols_to_exclude)
    indices_to_keep = [
        i for i, f in enumerate(original_feature_names) if f not in excluded
    ]

    if len(indices_to_keep) == len(original_feature_names):
        raise ValueError(
            f"Feature '{feature_to_exclude}' does not match any processed column"
        )

    if is_categorical:
        print(f"Excluding categorical feature '{feature_to_exclude}': {cols_to_exclude}")
    else:
        print(f"Excluding numerical feature '{feature_to_exclude}'")

    X_train_subset = X_train_processed[:, indices_to_keep]
    X_val_subset = X_val_processed[:, indices_to_keep]

    # Same hyper-parameters as the all-features baseline so scores are comparable.
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_subset, y_train)

    y_val_pred_subset = model.predict(X_val_subset)
    accuracy = accuracy_score(y_val, y_val_pred_subset)

    return accuracy

In [23]:
# Leave-one-feature-out experiment: for every listed feature, retrain without it
# and record how far validation accuracy moves from the all-features score.
features_to_test = ['industry', 'employment_status', 'lead_score']
accuracy_differences = {}

for feature in features_to_test:
    print(f"\n=== Testing without feature: {feature} ===")

    accuracy_without_feature = train_without_feature(feature, feature_names)
    # Positive difference: accuracy dropped when the feature was removed.
    difference = original_accuracy - accuracy_without_feature

    accuracy_differences[feature] = dict(
        accuracy_without=accuracy_without_feature,
        difference=difference,
    )

    print(f"Accuracy without {feature}: {accuracy_without_feature:.6f}")
    print(f"Difference from original: {difference:.6f}")


=== Testing without feature: industry ===
Excluding categorical feature 'industry': ['industry_education', 'industry_finance', 'industry_healthcare', 'industry_manufacturing', 'industry_other', 'industry_retail', 'industry_technology']
Accuracy without industry: 0.684932
Difference from original: 0.000000

=== Testing without feature: employment_status ===
Excluding categorical feature 'employment_status': ['employment_status_employed', 'employment_status_self_employed', 'employment_status_student', 'employment_status_unemployed']
Accuracy without employment_status: 0.681507
Difference from original: 0.003425

=== Testing without feature: lead_score ===
Excluding numerical feature 'lead_score'
Accuracy without lead_score: 0.678082
Difference from original: 0.006849


In [24]:
# Summary table of the elimination experiment.
banner = "=" * 60
print("\n" + banner)
print("FEATURE ELIMINATION RESULTS")
print(banner)
print(f"Original accuracy: {original_accuracy:.6f}\n")

for feature in features_to_test:
    entry = accuracy_differences[feature]
    acc_without, diff = entry['accuracy_without'], entry['difference']
    print(f"{feature:20} | Accuracy without: {acc_without:.6f} | Difference: {diff:+.6f}")

# The "least useful" feature is the one whose removal moves accuracy the least,
# regardless of direction; the first such feature wins on ties.
feature_with_smallest_diff = min(
    accuracy_differences,
    key=lambda name: abs(accuracy_differences[name]['difference']),
)

smallest_diff = accuracy_differences[feature_with_smallest_diff]['difference']
smallest_abs_diff = abs(smallest_diff)

print(f"\nThe feature with the smallest difference is: '{feature_with_smallest_diff}'")
print(f"Difference: {smallest_diff:.6f} (absolute: {smallest_abs_diff:.6f})")


FEATURE ELIMINATION RESULTS
Original accuracy: 0.684932

industry             | Accuracy without: 0.684932 | Difference: +0.000000
employment_status    | Accuracy without: 0.681507 | Difference: +0.003425
lead_score           | Accuracy without: 0.678082 | Difference: +0.006849

The feature with the smallest difference is: 'industry'
Difference: 0.000000 (absolute: 0.000000)


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Sweep of the inverse regularisation strength C; everything else matches the
# baseline model. One entry per C is stored in `results`.
C_values = [0.01, 0.1, 1, 10, 100]
results = {}

print("Training regularized logistic regression models with different C values:")
print("=" * 60)

# NOTE(review): the loop reuses the names `model`, `y_val_pred` and `val_accuracy`,
# so after this cell they describe the last C in the list, not the C=1.0 baseline.
for C in C_values:
    model = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    ).fit(X_train_processed, y_train)

    y_val_pred = model.predict(X_val_processed)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracy_rounded = round(val_accuracy, 3)

    results[C] = dict(accuracy=val_accuracy, accuracy_rounded=val_accuracy_rounded)

    print(f"C = {C:5} | Validation Accuracy: {val_accuracy:.6f} | Rounded: {val_accuracy_rounded:.3f}")

Training regularized logistic regression models with different C values:
C =  0.01 | Validation Accuracy: 0.688356 | Rounded: 0.688
C =   0.1 | Validation Accuracy: 0.681507 | Rounded: 0.682
C =     1 | Validation Accuracy: 0.684932 | Rounded: 0.685
C =    10 | Validation Accuracy: 0.684932 | Rounded: 0.685
C =   100 | Validation Accuracy: 0.684932 | Rounded: 0.685


In [27]:
# Pick the C with the highest unrounded validation accuracy. The comparison is
# strict, so among equal scores the C that was recorded first is kept.
best_C = None
best_accuracy = -1

for C, result in results.items():
    if result['accuracy'] > best_accuracy:
        best_C, best_accuracy = C, result['accuracy']

print(f"\nBest C value: {best_C}")
print(f"Best validation accuracy: {best_accuracy:.6f}")
print(f"Best validation accuracy (rounded to 3 decimals): {round(best_accuracy, 3)}")


Best C value: 0.01
Best validation accuracy: 0.688356
Best validation accuracy (rounded to 3 decimals): 0.688


In [28]:

# Bucket the C values by their 3-decimal validation accuracy.
accuracy_groups = {}

for C, result in results.items():
    rounded_acc = result['accuracy_rounded']
    accuracy_groups.setdefault(rounded_acc, []).append(C)

print("\nC values grouped by rounded accuracy:")
# Best accuracy first; inside a bucket the C values are listed in ascending order.
for acc in sorted(accuracy_groups, reverse=True):
    C_list = accuracy_groups[acc]
    C_list_sorted = sorted(C_list)
    print(f"Accuracy {acc}: C values {C_list_sorted}")

    if len(C_list_sorted) > 1:
        print(f"  -> Multiple C values achieve this accuracy. Smallest C: {C_list_sorted[0]}")


C values grouped by rounded accuracy:
Accuracy 0.688: C values [0.01]
Accuracy 0.685: C values [1, 10, 100]
  -> Multiple C values achieve this accuracy. Smallest C: 1
Accuracy 0.682: C values [0.1]


In [29]:
# Final choice: the highest rounded accuracy, and among the C values that
# reach it, the smallest one.
best_rounded_accuracy = max(accuracy_groups)
best_C_candidates = accuracy_groups[best_rounded_accuracy]
best_C_final = min(best_C_candidates)

print("\nFINAL RESULT:")
print(f"Best rounded accuracy: {best_rounded_accuracy}")
print(f"C values achieving this accuracy: {best_C_candidates}")
print(f"Selected C (smallest among best): {best_C_final}")


FINAL RESULT:
Best rounded accuracy: 0.688
C values achieving this accuracy: [0.01]
Selected C (smallest among best): 0.01
