# 0: Importing Prerequisites

In [28]:
import pandas as pd
import numpy as np

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mutual_info_score

# 0.1 Loading Dataset

In [32]:
# The lead-scoring CSV is read directly from its hosted location,
# so this cell needs network access.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

print("Loading Dataset...")
df = pd.read_csv(DATA_URL)
print("Dataset for homework 3 loaded successfully!")

Loading Dataset...
Dataset for homework 3 loaded successfully!


Identifying the categorical and numerical columns

In [33]:
# (rows, columns) of the freshly loaded frame.
n_rows_cols = df.shape
print(f"Dataset shape: {n_rows_cols}")

Dataset shape: (1462, 9)


In [36]:
# Classify columns by dtype family instead of comparing against the literal
# 'object' dtype. Text columns are not guaranteed to be reported as 'object'
# (pandas can store them with a dedicated string dtype), and in that case an
# `== 'object'` test would put every text column in the numerical list.
# Only true numeric dtypes count as numerical; everything else
# (text, bool, datetime, ...) is treated as categorical.
numerical_columns = list(df.select_dtypes(include='number').columns)
# Built from df.columns so the original column order is preserved.
categorical_columns = [col for col in df.columns if col not in numerical_columns]
print(f"Categorical columns: {categorical_columns}")
print(f"Numerical columns: {numerical_columns}")

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']


In [41]:
# Per-column count of missing entries, before any imputation.
missing_per_column = df.isnull().sum()
print(missing_per_column)

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


Fill missing values

In [42]:
# Missing categories become the literal label 'NA'; missing numbers become 0.0.
# Note that 'NA' then behaves like any other category in later counts.
filled_categorical = df[categorical_columns].fillna('NA')
filled_numerical = df[numerical_columns].fillna(0.0)

df[categorical_columns] = filled_categorical
df[numerical_columns] = filled_numerical

In [43]:
# Re-count the gaps to confirm the fill left none behind.
print("Missing values after handling:")
remaining_missing = df.isnull().sum()
print(remaining_missing)

Missing values after handling:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


# Q1: Mode for industry

In [44]:
# mode() returns every value tied for most frequent; the first entry is used.
# Missing industries were filled with 'NA' earlier, so 'NA' competes here
# like any other category.
industry_mode = df['industry'].mode().iloc[0]
for label in ("Most frequent observation (mode) for 'industry'", "Answer Q1"):
    print(f"{label}: {industry_mode}")

Most frequent observation (mode) for 'industry': retail
Answer Q1: retail


# Q2: Biggest Correlation

In [46]:
# Pairwise correlations between the numeric columns (target included).
numerical_features = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
    'converted',
]
correlation_matrix = df.loc[:, numerical_features].corr()

print("Correlation Matrix:")
print(correlation_matrix)

Correlation Matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  


In [49]:
# Name each column once, then correlate the four candidate pairs from Q2.
courses_viewed = df['number_of_courses_viewed']
interactions = df['interaction_count']
lead_scores = df['lead_score']
income = df['annual_income']

corr_1 = interactions.corr(lead_scores)
corr_2 = courses_viewed.corr(lead_scores)
corr_3 = courses_viewed.corr(interactions)
corr_4 = income.corr(interactions)

In [50]:
# Show the signed correlation of every candidate pair, six decimals each.
print("Correlations for given pairs:")
pair_results = (
    ("interaction_count and lead_score", corr_1),
    ("number_of_courses_viewed and lead_score", corr_2),
    ("number_of_courses_viewed and interaction_count", corr_3),
    ("annual_income and interaction_count", corr_4),
)
for pair_label, pair_corr in pair_results:
    print(f"{pair_label}: {pair_corr:.6f}")

Correlations for given pairs:
interaction_count and lead_score: 0.009888
number_of_courses_viewed and lead_score: -0.004879
number_of_courses_viewed and interaction_count: -0.023565
annual_income and interaction_count: 0.027036


In [56]:
# Rank the candidate pairs by correlation strength, ignoring sign.
signed_by_pair = [
    ('interaction_count and lead_score', corr_1),
    ('number_of_courses_viewed and lead_score', corr_2),
    ('number_of_courses_viewed and interaction_count', corr_3),
    ('annual_income and interaction_count', corr_4),
]
correlations = {pair: abs(value) for pair, value in signed_by_pair}

# On a tie, max() keeps the pair that was inserted first.
max_corr_pair = max(correlations, key=lambda pair: correlations[pair])
strongest_value = correlations[max_corr_pair]
print(f"Pair with biggest correlation (absolute value): {max_corr_pair}")
print(f"Correlation value: {strongest_value:.6f}")
print(f"Answer Q2: {max_corr_pair}")

Pair with biggest correlation (absolute value): annual_income and interaction_count
Correlation value: 0.027036
Answer Q2: annual_income and interaction_count


# Data Split: 60% train, 20% validation, 20% test

In [61]:
# Stage 1: hold out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Stage 2: take 25% of the remaining 80% for validation (20% of the original),
# which leaves 60% of the original for training.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Report each split's size and its share of the full dataset.
total_rows = len(df)
print(f"Full dataset: {total_rows} ({100:.1f}%)")
for split_name, split_df in (
    ("Train set", df_train),
    ("Validation set", df_val),
    ("Test set", df_test),
):
    print(f"{split_name}: {len(split_df)} ({len(split_df)/total_rows*100:.1f}%)")

Full dataset: 1462 (100.0%)
Train set: 876 (59.9%)
Validation set: 293 (20.0%)
Test set: 293 (20.0%)


In [62]:
# Separate target variable
y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values

# Remove target from feature dataframes
df_train = df_train.drop('converted', axis=1)
df_val = df_val.drop('converted', axis=1)
df_test = df_test.drop('converted', axis=1)

print("Target variable separated and removed from feature sets.")

Target variable separated and removed from feature sets.


# Q3: Biggest mutual information score

In [69]:
# Mutual information between each categorical feature and the target,
# computed on the training split only.
categorical_vars = ['industry', 'location', 'lead_source', 'employment_status']

# mi_scores holds the 2-decimal values that are reported;
# mi_scores_raw holds the unrounded values used for ranking.
mi_scores = {}
mi_scores_raw = {}
for cat_var in categorical_vars:
    mi_score = mutual_info_score(df_train[cat_var], y_train)
    mi_scores_raw[cat_var] = mi_score
    mi_scores[cat_var] = round(mi_score, 2)
    print(f"Mutual Information Score for '{cat_var}': {mi_scores[cat_var]}")

# Rank on the unrounded scores: after rounding, distinct scores can collapse
# to the same value and the winner would then depend on list order alone.
max_mi_var = max(mi_scores_raw, key=mi_scores_raw.get)
print(f"Variable with biggest MI score: {max_mi_var}")
print(f"Answer Q3: {max_mi_var}")

Mutual Information Score for 'industry': 0.01
Mutual Information Score for 'location': 0.0
Mutual Information Score for 'lead_source': 0.04
Mutual Information Score for 'employment_status': 0.01
Variable with biggest MI score: lead_source
Answer Q3: lead_source


# Q4: Logistic Regression Accuracy

In [78]:
# Turn each row into a feature dict so DictVectorizer can one-hot encode it.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

# The vectorizer learns its feature set from the training rows only;
# the validation rows are mapped onto that same feature set.
dv = DictVectorizer(sparse=False)
X_train, X_val = dv.fit_transform(train_dict), dv.transform(val_dict)

for split_label, matrix in (("Training", X_train), ("Validation", X_val)):
    print(f"{split_label} features shape: {matrix.shape}")

# Fit the baseline classifier (fit() returns the estimator itself).
model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score hard class predictions on the validation split.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy_rounded = round(accuracy, 2)

print(f"Validation Accuracy: {accuracy:.6f}")
print(f"Validation Accuracy (rounded to 2 decimals): {accuracy_rounded}")
print(f"Answer Q4: {accuracy_rounded}")

Training features shape: (876, 31)
Validation features shape: (293, 31)
Validation Accuracy: 0.699659
Validation Accuracy (rounded to 2 decimals): 0.7
Answer Q4: 0.7


# Q5: Feature elimination

In [101]:
# Reference point: validation accuracy of the all-features model from Q4.
baseline_accuracy = accuracy
print(f"Baseline accuracy: {baseline_accuracy:.6f}")

# Candidate features, each removed one at a time.
features_to_test = ['industry', 'employment_status', 'lead_score']

# feature -> (baseline accuracy - accuracy without that feature)
accuracy_differences = {}

for feature in features_to_test:
    # Leave the candidate out of both splits.
    df_train_temp = df_train.drop(columns=feature)
    df_val_temp = df_val.drop(columns=feature)

    # Re-encode from scratch so the vectorizer never sees the dropped feature.
    train_dict_temp = df_train_temp.to_dict(orient='records')
    val_dict_temp = df_val_temp.to_dict(orient='records')

    dv_temp = DictVectorizer(sparse=False)
    X_train_temp = dv_temp.fit_transform(train_dict_temp)
    X_val_temp = dv_temp.transform(val_dict_temp)

    # Same model settings as the baseline; only the feature set differs.
    model_temp = LogisticRegression(
        solver='liblinear', C=1.0, max_iter=1000, random_state=42
    ).fit(X_train_temp, y_train)

    y_pred_temp = model_temp.predict(X_val_temp)
    accuracy_temp = accuracy_score(y_val, y_pred_temp)

    # Positive: removing the feature hurt accuracy; negative: it helped.
    difference = baseline_accuracy - accuracy_temp
    accuracy_differences[feature] = difference

    print(f"Feature: '{feature}'")
    print(f"  Accuracy without feature: {accuracy_temp:.6f}")
    print(f"  Difference: {difference:.6f}")

# Pick the feature whose removal moves accuracy the least in either direction.
# On a tie, min() keeps the feature that was tested first.
min_diff_feature = min(
    accuracy_differences,
    key=lambda name: abs(accuracy_differences[name]),
)
print(f"Feature with smallest difference: '{min_diff_feature}'")
print("This is the least useful feature.")
print(f"Answer Q5: '{min_diff_feature}'")

Baseline accuracy: 0.699659
Feature: 'industry'
  Accuracy without feature: 0.699659
  Difference: 0.000000
Feature: 'employment_status'
  Accuracy without feature: 0.696246
  Difference: 0.003413
Feature: 'lead_score'
  Accuracy without feature: 0.706485
  Difference: -0.006826
Feature with smallest difference: 'industry'
This is the least useful feature.
Answer Q5: 'industry'


# Q6: Parameter Tuning

In [117]:
# Candidate regularization values for the C parameter.
C_values = [0.01, 0.1, 1, 10, 100]

# Rebuild the full-feature matrices exactly as in Q4.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train, X_val = dv.fit_transform(train_dict), dv.transform(val_dict)

# C -> validation accuracy rounded to 3 decimals
c_accuracies = {}

print("Testing different C values:")
for C in C_values:
    # Only C changes between runs; all other settings are held fixed.
    model_c = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    ).fit(X_train, y_train)

    y_pred_c = model_c.predict(X_val)
    accuracy_c = accuracy_score(y_val, y_pred_c)
    accuracy_c_rounded = round(accuracy_c, 3)

    c_accuracies[C] = accuracy_c_rounded

    print(f"C = {C}")
    print(f"  Accuracy: {accuracy_c:.6f}")
    print(f"  Accuracy (rounded to 3 decimals): {accuracy_c_rounded}")

# Best rounded accuracy wins; among ties the smallest C is chosen.
max_accuracy = max(c_accuracies.values())
best_c_values = [c for c in c_accuracies if c_accuracies[c] == max_accuracy]
best_c = min(best_c_values)

print(f"Best accuracy: {max_accuracy}")
print(f"Best C value: {best_c}")
print(f"Answer Q6: {best_c}")


Testing different C values:
C = 0.01
  Accuracy: 0.699659
  Accuracy (rounded to 3 decimals): 0.7
C = 0.1
  Accuracy: 0.699659
  Accuracy (rounded to 3 decimals): 0.7
C = 1
  Accuracy: 0.699659
  Accuracy (rounded to 3 decimals): 0.7
C = 10
  Accuracy: 0.699659
  Accuracy (rounded to 3 decimals): 0.7
C = 100
  Accuracy: 0.699659
  Accuracy (rounded to 3 decimals): 0.7
Best accuracy: 0.7
Best C value: 0.01
Answer Q6: 0.01
