In [1]:
import pandas as pd
import numpy as np

# Lead-scoring dataset, read directly from GitHub (requires network access).
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

In [31]:
# Number of rows in the dataset.
len(df)

1462

In [2]:
# Count of missing values in each column.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
# Inspect the dtype of every column.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [119]:
# Feature groups used by the later cells; the `converted` column is in neither list.
categorical = ['lead_source', 'industry', 'employment_status', 'location']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [69]:
# Impute missing values: 0.0 for numeric columns, the literal 'NA' for everything else.
# is_numeric_dtype is used rather than `dtype == 'object'` so that text columns stored
# with a dedicated string dtype are not sent to the numeric branch and filled with 0.0.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(0.0)
    else:
        df[col] = df[col].fillna('NA')


### Question 1
What is the most frequent observation (mode) for the column industry?<br>

NA<br>
technology<br>
healthcare<br>
retail<br>

In [70]:
# mode() returns a Series; [0] picks its first entry.
df['industry'].mode()[0]

'retail'

#### Answer : retail

### Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.<br>

What are the two features that have the biggest correlation?<br>

interaction_count and lead_score<br>
number_of_courses_viewed and lead_score<br>
number_of_courses_viewed and interaction_count<br>
annual_income and interaction_count<br>

Only consider the pairs above when answering this question.<br>

In [71]:
# Pairwise correlation matrix of the numerical features.
# Built from `df` (missing values were filled in place above); `df_filled` is not
# defined anywhere in this notebook and raised NameError on a fresh kernel.
coeff = df[numerical].corr()

In [72]:
# The four candidate feature pairs listed in the question.
pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

In [73]:
# Look up each candidate pair in the correlation matrix and report it to 3 decimals.
for first, second in pairs:
    value = coeff.loc[first, second]
    print(f"Correlation between {first} and {second}: {value:.3f}")

Correlation between interaction_count and lead_score: 0.010
Correlation between number_of_courses_viewed and lead_score: -0.005
Correlation between number_of_courses_viewed and interaction_count: -0.024
Correlation between annual_income and interaction_count: 0.027


#### Answer : `annual_income` and `interaction_count`

### Split the data
Split your data in train/val/test sets with 60%/20%/20% distribution.<br>
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.<br>
Make sure that the target value y is not in your dataframe.<br>

In [74]:
from sklearn.model_selection import train_test_split

In [75]:
# Hold out 20% of the rows as the test set; the seed makes the split reproducible.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [76]:
# 25% of the remaining 80% is 20% of the full data, giving the 60/20/20 split.
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [77]:
# Target vectors for all three splits. The test target is captured too, so the
# held-out set can be scored later without re-running the split.
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [78]:
# Remove the target column from the train and validation feature frames, in place.
for frame in (df_train, df_val):
    del frame['converted']

### Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.<br>
Round the scores to 2 decimals using round(score, 2).<br><br>
Which of these variables has the biggest mutual information score?

industry<br>
location<br>
lead_source<br>
employment_status

In [79]:
from sklearn.feature_selection import mutual_info_classif

cat_cols = df_train[categorical]

def calc_mi(series):
    """Return the mutual information between one feature column and ``y_train``.

    The column is integer-encoded with ``pd.factorize`` before scoring, because
    ``mutual_info_classif`` needs numeric input and would reject raw string
    categories. Relabelling the categories does not change the score of a
    discrete feature.
    """
    codes = pd.factorize(series)[0]
    return mutual_info_classif(
        codes.reshape(-1, 1), y_train, discrete_features=True, random_state=42
    )[0]

# Score every categorical feature on the training split only, rounded to 2 decimals.
mi_scores = {col: round(calc_mi(df_train[col]), 2) for col in categorical}

# Highest score first.
mi_sorted = sorted(mi_scores.items(), key=lambda x: x[1], reverse=True)

print("Mutual Information Scores:")
for col, score in mi_sorted:
    print(f"{col}: {score}")


Mutual Information Scores:
lead_source: 0.04
industry: 0.01
employment_status: 0.01
location: 0.0


#### Answer : lead_source

### Question 4
Now let's train a logistic regression.<br>
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.<br>
Fit the model on the training dataset.<br>
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:<br>
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)<br>
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.<br>
What accuracy did you get?<br>

0.64<br>
0.74<br>
0.84<br>
0.94

In [80]:
from sklearn.feature_extraction import DictVectorizer

In [91]:
# One dict per row ({column: value}), the input format DictVectorizer consumes.
train_dict, val_dict = (
    frame.to_dict(orient='records') for frame in (df_train, df_val)
)

In [82]:
# Look at the first training record to check the dict layout.
train_dict[0]

{'lead_source': 'paid_ads',
 'industry': 'retail',
 'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'employment_status': 'student',
 'location': 'middle_east',
 'interaction_count': 5,
 'lead_score': 0.03}

In [93]:
# Fit the vectorizer on the training records only; validation data is transformed with it later.
dv = DictVectorizer()
dv.fit(train_dict)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,True
,sort,True


In [94]:
# Encode both splits with the vectorizer fitted on the training records.
X_train, X_val = map(dv.transform, (train_dict, val_dict))

In [95]:
# Column names of the encoded matrix (categoricals appear as `column=value`).
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [107]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [108]:
# Parameters are fixed by the assignment so results are reproducible across scikit-learn versions.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [109]:
# Rebuilds val_dict and X_val exactly as the earlier cells did; the values are unchanged.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [112]:
# Column 1 of predict_proba is the probability of the positive class.
y_prob = model.predict_proba(X_val)[:, 1]
# Hard 0/1 labels at a 0.5 threshold.
y_pred = np.where(y_prob >= 0.5, 1, 0)
accuracy = round(accuracy_score(y_val, y_pred), 2)

In [113]:
# Report the validation accuracy (already rounded to 2 decimals).
print(f"Validation accuracy: {accuracy}")

Validation accuracy: 0.7
ANSWER Q4: 0.7


#### Answer : 0.74 (the computed accuracy is 0.70; 0.74 is the closest of the listed options)

### Question 5
Let's find the least useful feature using the feature elimination technique.<br>
Train a model using the same features and parameters as in Q4 (without rounding).<br>
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.<br>
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.<br>
Which of the following features has the smallest difference?<br>

'industry'<br>
'employment_status'<br>
'lead_score'<br>

In [123]:
# Unrounded accuracy of the full-feature model on the validation set (the Q5 baseline).
original_accuracy = accuracy_score(y_val, model.predict(X_val))
print(f"Original model accuracy: {original_accuracy:.4f}")

# All input columns: categorical first, then numerical.
features = categorical + numerical

# Filled by the next cell: feature name -> accuracy difference.
differences = {}

Original model accuracy: 0.6997


In [124]:
# Drop each candidate feature in turn, retrain with the Q4 settings, and record how far
# the validation accuracy moves from the full-feature baseline.
for feature in ['industry', 'employment_status', 'lead_score']:
    reduced_features = [name for name in features if name != feature]

    # A fresh vectorizer per run, so the encoding only covers the remaining columns.
    dv_red = DictVectorizer()
    X_train_red = dv_red.fit_transform(df_train[reduced_features].to_dict(orient='records'))
    X_val_red = dv_red.transform(df_val[reduced_features].to_dict(orient='records'))

    model_red = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_red.fit(X_train_red, y_train)

    red_predictions = model_red.predict(X_val_red)
    red_accuracy = accuracy_score(y_val, red_predictions)

    # Absolute change relative to the baseline.
    difference = abs(original_accuracy - red_accuracy)
    differences[feature] = difference
    print(f"Without '{feature}': accuracy = {red_accuracy:.4f}, difference = {difference:.4f}")

Without 'industry': accuracy = 0.6997, difference = 0.0000
Without 'employment_status': accuracy = 0.6962, difference = 0.0034
Without 'lead_score': accuracy = 0.7065, difference = 0.0068


#### Answer : industry

### Question 6
Now let's train a regularized logistic regression.<br>
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].<br>
Train models using all the features as in Q4.<br>
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.<br>
Which of these C leads to the best accuracy on the validation set?<br>

0.01<br>
0.1<br>
1<br>
10<br>
100<br>

In [125]:
# Trackers for the C search below: best validation accuracy seen so far and the C that produced it.
best_accuracy = 0
best_C = None

In [126]:
# Reset the trackers here so this cell gives the same answer when re-run on its own.
best_accuracy = 0
best_C = None

# C values are visited in ascending order, so the strict ">" below keeps the
# smallest C whenever several values tie on accuracy.
for C in [0.01, 0.1, 1, 10, 100]:
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train, y_train)

    # Compare at 3-decimal precision, as the question specifies.
    accuracy_reg = round(accuracy_score(y_val, model_reg.predict(X_val)), 3)
    print(f"C = {C}: accuracy = {accuracy_reg:.3f}")

    if accuracy_reg > best_accuracy:
        best_accuracy = accuracy_reg
        best_C = C

C = 0.01: accuracy = 0.700
C = 0.1: accuracy = 0.700
C = 1: accuracy = 0.700
C = 10: accuracy = 0.700
C = 100: accuracy = 0.700


In [128]:
# The C value selected by the search loop.
best_C

0.01

#### Answer : 0.01