In [60]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [61]:
# Load the lead-scoring data from a CSV file in the working directory
# and preview the first rows.
df = pd.read_csv('course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [62]:
# Display the frame; the rendered output shows only the first and last rows.
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [63]:
# Normalise the column labels: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Text columns are the ones stored with the `object` dtype.
is_text_column = df.dtypes == 'object'
categorical_columns = list(df.dtypes[is_text_column].index)

# Apply the same normalisation to the values of every text column.
for column in categorical_columns:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(' ', '_')

In [64]:
# Count missing values in each column.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [65]:
# Inspect the dtype of each column.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [66]:
import pandas as pd

def fill_missing(
    df: pd.DataFrame,
    numerical_columns=("annual_income",),
    categorical_columns=("lead_source", "industry", "employment_status", "location"),
    numerical_fill=0,
    categorical_fill="NA",
) -> pd.DataFrame:
    """
    Return a copy of `df` with missing values replaced by fixed placeholders.

    Parameters
    ----------
    df : input frame; it is never modified, the function works on a copy.
    numerical_columns : column name or iterable of names whose missing values
        are replaced with `numerical_fill`. Defaults to ``("annual_income",)``.
    categorical_columns : column name or iterable of names whose missing values
        are replaced with `categorical_fill`. Defaults to the four text
        columns of the lead-scoring data.
    numerical_fill : replacement for missing numerical values (default 0).
    categorical_fill : replacement for missing text values (default "NA").

    Returns
    -------
    A new DataFrame with the requested columns filled. Columns that are not
    listed are returned untouched, and no dtype cast is performed.

    Raises
    ------
    KeyError if a listed column is not present in `df`.
    """
    df = df.copy()

    # Accept a single column name as well as an iterable of names.
    if isinstance(numerical_columns, str):
        numerical_columns = [numerical_columns]
    if isinstance(categorical_columns, str):
        categorical_columns = [categorical_columns]

    # Numerical -> numerical_fill
    for column in numerical_columns:
        df[column] = df[column].fillna(numerical_fill)

    # Categorical/text -> categorical_fill
    for column in categorical_columns:
        df[column] = df[column].fillna(categorical_fill)

    return df

In [67]:
# Build a filled copy of the data; `df` itself is not modified because
# fill_missing works on a copy.
df_withoutnas=fill_missing(df)

In [68]:
# Re-count missing values per column on the filled frame.
df_withoutnas.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [69]:
# Frequency of each `industry` value in the filled frame.
df_withoutnas.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [70]:
import pandas as pd
import numpy as np

def correlation_matrix(df: pd.DataFrame, method: str = "pearson", columns=None) -> pd.DataFrame:
    """
    Return the pairwise correlation matrix of selected columns.

    Parameters
    ----------
    df : frame holding the columns to correlate.
    method : 'pearson' (linear), 'spearman' (rank), or 'kendall'; passed
        straight to ``DataFrame.corr``.
    columns : column name or iterable of names to include. When omitted, the
        four numeric lead-scoring features are used: interaction_count,
        lead_score, number_of_courses_viewed and annual_income.

    Returns
    -------
    Square DataFrame of correlations. Columns holding a single distinct value
    (missing values count as a value) are left out of the result.

    Raises
    ------
    KeyError if a requested column is not present in `df`.
    """
    if columns is None:
        columns = ["interaction_count", "lead_score", "number_of_courses_viewed", "annual_income"]
    elif isinstance(columns, str):
        # Accept a single column name as well as an iterable of names.
        columns = [columns]

    num = df[list(columns)]

    # Drop constant columns (all same value) to avoid all-NaN rows/cols in corr
    nunique = num.nunique(dropna=False)
    num = num.loc[:, nunique.gt(1)]

    # Compute correlation
    return num.corr(method=method)


# Pearson correlations between the numeric features of the filled frame,
# printed as plain text.
corr = correlation_matrix(df_withoutnas, method="pearson")
print(corr)


                          interaction_count  lead_score  \
interaction_count                  1.000000    0.009888   
lead_score                         0.009888    1.000000   
number_of_courses_viewed          -0.023565   -0.004879   
annual_income                      0.027036    0.015610   

                          number_of_courses_viewed  annual_income  
interaction_count                        -0.023565       0.027036  
lead_score                               -0.004879       0.015610  
number_of_courses_viewed                  1.000000       0.009770  
annual_income                             0.009770       1.000000  


In [71]:
# Display the filled frame; the rendered output shows only the first and last rows.
df_withoutnas

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [72]:
from sklearn.model_selection import train_test_split

# Hold out roughly 20% of the rows for testing, then take 25% of the remainder
# (about 20% of all rows) for validation. Both splits use a fixed seed.
df_full_train, df_test = train_test_split(df_withoutnas, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Give every split a fresh 0..n-1 index.
df_train, df_val, df_test = [
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
]

# Target vectors as plain arrays, one per split.
y_train, y_val, y_test = [
    part.converted.values for part in (df_train, df_val, df_test)
]



In [73]:
from sklearn.metrics import mutual_info_score
# Mutual information between `industry` and the target, on the training split.
mutual_info_score(df_train.industry, df_train.converted)

np.float64(0.011574521435657112)

In [74]:
# Mutual information between `location` and the target, on the training split.
mutual_info_score(df_train.location, df_train.converted)

np.float64(0.004464157884038034)

In [75]:
# Mutual information between `lead_source` and the target, on the training split.
mutual_info_score(df_train.lead_source, df_train.converted)

np.float64(0.03539624379726594)

In [76]:
# Mutual information between `employment_status` and the target, on the training split.
mutual_info_score(df_train.employment_status, df_train.converted)

np.float64(0.012937677269442782)

In [77]:
# Remove the target column from the feature frames; the targets were already
# extracted into y_train / y_val / y_test.
# errors='ignore' keeps this cell safe to re-run once the column is gone.
df_train = df_train.drop(columns=['converted'], errors='ignore')
df_val = df_val.drop(columns=['converted'], errors='ignore')
df_test = df_test.drop(columns=['converted'], errors='ignore')

In [78]:
# Inspect the dtype of each column in the filled frame.
df_withoutnas.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [79]:
# Numeric feature columns used as model inputs.
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [80]:
# Text-valued feature columns used as model inputs.
categorical = ['lead_source', 'industry', 'employment_status', 'location']

In [81]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Turn each row into a feature dict and vectorise it. The vectoriser is fitted
# on the training rows only and then reused for the validation rows.
feature_columns = categorical + numerical
dv = DictVectorizer(sparse=False)

train_dict = df_train[feature_columns].to_dict(orient='records')
val_dict = df_val[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# The liblinear solver is requested explicitly, with a fixed seed.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Second column of predict_proba: probability of converted == 1.
y_pred = model.predict_proba(X_val)[:, 1]

# Per-row comparison of the thresholded prediction with the actual label.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': (y_pred >= 0.5),
    'actual': y_val,
})
df_pred['correct'] = df_pred.prediction == df_pred.actual

# Validation accuracy: share of rows predicted correctly.
df_pred.correct.mean()

np.float64(0.6996587030716723)

In [82]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# --- config: same as Q4 ---
# Full feature list, keyword arguments for every LogisticRegression built below,
# and the probability cut-off at or above which a row is predicted positive.
FEATURES = categorical + numerical
LR_KWARGS = {
    'solver': 'liblinear',
    'C': 1.0,
    'max_iter': 1000,
    'random_state': 42,
}
THRESH = 0.5

def train_eval_accuracy(features):
    """Train a logistic regression on `features` and return validation accuracy.

    Reads the notebook-level df_train, df_val, y_train, y_val, LR_KWARGS and
    THRESH. The vectoriser is fitted on the training rows only.

    Parameters
    ----------
    features : list of column names to use as model inputs.

    Returns
    -------
    Fraction of validation rows whose thresholded prediction equals y_val.
    """
    vectorizer = DictVectorizer(sparse=False)

    train_records = df_train[features].to_dict(orient='records')
    val_records = df_val[features].to_dict(orient='records')
    train_matrix = vectorizer.fit_transform(train_records)
    val_matrix = vectorizer.transform(val_records)

    classifier = LogisticRegression(**LR_KWARGS)
    classifier.fit(train_matrix, y_train)

    # Second column of predict_proba, thresholded at THRESH.
    positive_proba = classifier.predict_proba(val_matrix)[:, 1]
    is_positive = (positive_proba >= THRESH)
    return (is_positive == y_val).mean()

# Reference point: validation accuracy with every feature included.
baseline_acc = train_eval_accuracy(FEATURES)
print(f"Baseline accuracy (all features): {baseline_acc:.5f}")

# Leave-one-feature-out: retrain without each candidate and record how far
# accuracy falls relative to the baseline (positive drop => feature is useful).
rows = []
for dropped in ("industry", "employment_status", "lead_score"):
    remaining = [name for name in FEATURES if name != dropped]
    print(remaining)
    acc_without = train_eval_accuracy(remaining)
    rows.append({
        "feature": dropped,
        "acc_without_feature": acc_without,
        "accuracy_drop": baseline_acc - acc_without,
    })

# Smallest (most negative) drop first.
results = (
    pd.DataFrame(rows)
    .sort_values("accuracy_drop", ascending=True)
    .reset_index(drop=True)
)

print("\nLeast useful features first (smallest accuracy drop; negative means removal helped):")
print(results.to_string(index=False))

# The first row after sorting is the single least useful feature.
least_useful = results.iloc[0]
print(
    f"\nLeast useful feature: {least_useful['feature']} "
    f"(acc_without={least_useful['acc_without_feature']:.5f}, "
    f"drop={least_useful['accuracy_drop']:.5f})"
)


Baseline accuracy (all features): 0.69966
['lead_source', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
['lead_source', 'industry', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
['lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count']

Least useful features first (smallest accuracy drop; negative means removal helped):
          feature  acc_without_feature  accuracy_drop
       lead_score             0.706485      -0.006826
         industry             0.699659       0.000000
employment_status             0.696246       0.003413

Least useful feature: lead_score (acc_without=0.70648, drop=-0.00683)


In [83]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression  # imported once, not on every loop iteration

# Vectorise the features; the vectoriser is fitted on the training rows only
# and reused for the validation rows.
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Refit the model for each value of C and report validation accuracy.
# Everything except C is held fixed, including the explicit liblinear solver.
for c_value in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c_value, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Second column of predict_proba: probability of converted == 1.
    y_pred = model.predict_proba(X_val)[:, 1]

    df_pred = pd.DataFrame({
        'probability': y_pred,
        'prediction': (y_pred >= 0.5),
        'actual': y_val,
    })
    df_pred['correct'] = df_pred.prediction == df_pred.actual

    print(c_value)
    print(round(df_pred.correct.mean(), 5))

0.01
0.69966
0.1
0.69966
1
0.69966
10
0.69966
100
0.69966
