# <div style="font-family: 'Playfair Display', serif; font-weight:bold; letter-spacing: 0px; color:#FAF0EF; font-size:150%; text-align:center; padding:10px; background:#BEBF50; border-radius: 10px; box-shadow: 10px 10px 5px #ddad0fff;">🌟Homework 3 : CLASSIFICATION <br></div>


## ⚡ Overview

This dataset, **`course_lead_scoring.csv`**, contains information about **potential students (leads)** who interacted with an **online course platform**.

The goal is to **predict whether a lead will actually enroll in a course** (i.e., become a paying student).  
This process is called **lead scoring** in marketing and sales.

### 💫 Columns Description

| Column Name           | Description |
|------------------------|-------------|
| `source`               | Where the lead came from (ads, Google search, referral, etc.) |
| `country`              | The lead’s country of origin |
| `age`                  | The person’s age |
| `converted`            | Target variable — `1` if the person enrolled, `0` otherwise |
| `total_pages_visited`  | Number of pages visited on the website |
| `time_on_site`         | Time spent on the website (in minutes) |
| `ads_clicked`          | Number of ads the person clicked |
| `email_opened`         | Whether the person opened marketing emails |


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

In [5]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [21]:
df.shape

(1462, 9)

In [22]:
df.describe()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1462.0,1462.0,1462.0,1462.0
mean,2.031464,52472.172367,2.976744,0.506108,0.619015
std,1.449717,24254.34703,1.681564,0.288465,0.485795
min,0.0,0.0,0.0,0.0,0.0
25%,1.0,44097.25,2.0,0.2625,0.0
50%,2.0,57449.5,3.0,0.51,1.0
75%,3.0,68241.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1462 non-null   object 
 1   industry                  1462 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1462 non-null   float64
 4   employment_status         1462 non-null   object 
 5   location                  1462 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [15]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Lead Scoring Dataset Report")
profile.to_file("report.html")

100%|██████████| 9/9 [00:00<00:00, 23.78it/s]01<00:00, 11.35it/s, Describe variable: converted] 
Summarize dataset: 100%|██████████| 34/34 [00:14<00:00,  2.42it/s, Completed]                                                 
Generate report structure: 100%|██████████| 1/1 [00:17<00:00, 17.91s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.34it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 33.07it/s]


In [13]:
import os
print(os.getcwd())


d:\personal-portfolio-master\Machine_Learning_Zoomcamp-\03-CLASSIFICATION


In [17]:
profile.to_file("report.html")



Export report to file: 100%|██████████| 1/1 [00:00<00:00, 61.11it/s]


In [18]:
import webbrowser
webbrowser.open("report.html")


True

<a id="6"></a>
# <div style="font-family: 'Playfair Display', serif; font-weight:bold; letter-spacing: 1px; color:#FAF0EF; font-size:150%; text-align:left; padding:10px; background:#ddad0fff; border-radius: 10px;">📒 Question 1
</div>

In [26]:
for c in df.columns:
    if df[c].dtype == 'object':
        df[c] = df[c].fillna('NA')
    else:
        df[c] = df[c].fillna(0.0)

In [28]:
print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values after handling:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [29]:
df['industry'].mode()[0]

'retail'

In [30]:

# Check missing values
print(df.isnull().sum())

# Fill missing values
cat_cols = df.select_dtypes(include='object').columns
num_cols = df.select_dtypes(include=np.number).columns

df[cat_cols] = df[cat_cols].fillna('NA')
df[num_cols] = df[num_cols].fillna(0.0)

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


### 🎯Replace textual "NA" values with NaN

In [35]:
import pandas as pd

# Remplacer les valeurs textuelles "NA" par NaN
df.replace(["NA", "N/A", "na", "NaN", " "], pd.NA, inplace=True)

# Parcourir toutes les colonnes
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        # Colonne numérique → moyenne
        df[col].fillna(df[col].mean(), inplace=True)
    else:
        # Colonne catégorielle → valeur la plus fréquente
        df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [36]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,retail,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,self_employed,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,retail,5,71016.0,self_employed,north_america,0,0.25,1


In [37]:
df['industry'].value_counts()

industry
retail           337
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
Name: count, dtype: int64

🧾 'retail'

<a id="6"></a>
# <div style="font-family: 'Playfair Display', serif; font-weight:bold; letter-spacing: 1px; color:#FAF0EF; font-size:150%; text-align:left; padding:10px; background:#ddad0fff; border-radius: 10px;">📒 Question 2
</div>

## ✨ Correlation matrix

In [None]:
 
numerical_cols = ['annual_income', 'lead_score', 'interaction_count', 'number_of_courses_viewed']
numerical_df = df[numerical_cols]
corr_matrix = numerical_df.corr()

print("\nQuestion 2 - Correlation Matrix:")
print(corr_matrix)

# Find the pair with highest correlation from the given options
pairs_to_check = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

max_corr = 0
max_pair = None
for col1, col2 in pairs_to_check:
    corr_value = corr_matrix.loc[col1, col2]
    print(f"Correlation between {col1} and {col2}: {corr_value:.4f}")
    if abs(corr_value) > abs(max_corr):
        max_corr = corr_value
        max_pair = (col1, col2)

print(f"\nHighest correlation pair: {max_pair} with value {max_corr:.4f}")


Question 2 - Correlation Matrix:
                          annual_income  lead_score  interaction_count  \
annual_income                  1.000000    0.015610           0.027036   
lead_score                     0.015610    1.000000           0.009888   
interaction_count              0.027036    0.009888           1.000000   
number_of_courses_viewed       0.009770   -0.004879          -0.023565   

                          number_of_courses_viewed  
annual_income                             0.009770  
lead_score                               -0.004879  
interaction_count                        -0.023565  
number_of_courses_viewed                  1.000000  
Correlation between interaction_count and lead_score: 0.0099
Correlation between number_of_courses_viewed and lead_score: -0.0049
Correlation between number_of_courses_viewed and interaction_count: -0.0236
Correlation between annual_income and interaction_count: 0.0270

Highest correlation pair: ('annual_income', 'interaction_co

In [39]:
df.corr(numeric_only=True)


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


<a id="6"></a>
# <div style="font-family: 'Playfair Display', serif; font-weight:bold; letter-spacing: 1px; color:#FAF0EF; font-size:150%; text-align:left; padding:10px; background:#ddad0fff; border-radius: 10px;">📒 Question 3
</div>

Mutual information with target

In [40]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

y_train = df_train['converted']
y_val = df_val['converted']
y_test = df_test['converted']

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [41]:
from sklearn.metrics import mutual_info_score

categorical = ['industry', 'location', 'lead_source', 'employment_status']

def mutual_info_chisq(y, x):
    return mutual_info_score(y, x)

mi_scores = []
for col in categorical:
    mi = mutual_info_chisq(y_train, df_train[col])
    mi_scores.append((col, round(mi, 2)))

mi_scores.sort(key=lambda x: x[1], reverse=True)

In [42]:
print("\nMutual Information Scores:")
for col, score in mi_scores:
    print(f"{col}: {score}")


Mutual Information Scores:
lead_source: 0.03
industry: 0.01
employment_status: 0.01
location: 0.0


<a id="6"></a>
# <div style="font-family: 'Playfair Display', serif; font-weight:bold; letter-spacing: 1px; color:#FAF0EF; font-size:150%; text-align:left; padding:10px; background:#ddad0fff; border-radius: 10px;">📒 Question 4
</div>

 Logistic regression accuracy

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# One-hot encoding
df_train_encoded = pd.get_dummies(df_train, columns=categorical)
df_val_encoded = pd.get_dummies(df_val, columns=categorical)

# Align columns (ensure train and val have same dummy columns)
df_val_encoded = df_val_encoded.reindex(columns=df_train_encoded.columns, fill_value=0)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(df_train_encoded, y_train)

y_pred = model.predict(df_val_encoded)
accuracy = accuracy_score(y_val, y_pred)
print(round(accuracy, 2))

0.7


<a id="6"></a>
# <div style="font-family: 'Playfair Display', serif; font-weight:bold; letter-spacing: 1px; color:#FAF0EF; font-size:150%; text-align:left; padding:10px; background:#ddad0fff; border-radius: 10px;">📒 Question 5
</div>

Feature elimination

In [44]:
original_accuracy = accuracy  # from Q4

features = list(df_train_encoded.columns)
differences = []

for feature in features:
    # Remove one feature
    X_train_reduced = df_train_encoded.drop(columns=[feature])
    X_val_reduced = df_val_encoded.drop(columns=[feature])
    
    model.fit(X_train_reduced, y_train)
    y_pred_reduced = model.predict(X_val_reduced)
    acc_reduced = accuracy_score(y_val, y_pred_reduced)
    
    diff = original_accuracy - acc_reduced
    differences.append((feature, diff))

# Sort by absolute difference
differences.sort(key=lambda x: abs(x[1]))

In [19]:
differences

[('industry_NA', 0.0, 0.0),
 ('industry_education', 0.0, 0.0),
 ('industry_finance', 0.0, 0.0),
 ('industry_healthcare', 0.0, 0.0),
 ('industry_manufacturing', 0.0, 0.0),
 ('industry_other', 0.0, 0.0),
 ('industry_retail', 0.0, 0.0),
 ('industry_technology', 0.0, 0.0),
 ('location_NA', 0.0, 0.0),
 ('location_africa', 0.0, 0.0),
 ('location_asia', 0.0, 0.0),
 ('location_australia', 0.0, 0.0),
 ('location_europe', 0.0, 0.0),
 ('location_middle_east', 0.0, 0.0),
 ('location_north_america', 0.0, 0.0),
 ('location_south_america', 0.0, 0.0),
 ('lead_source_NA', 0.0, 0.0),
 ('lead_source_events', 0.0, 0.0),
 ('lead_source_organic_search', 0.0, 0.0),
 ('lead_source_social_media', 0.0, 0.0),
 ('employment_status_NA', 0.0, 0.0),
 ('employment_status_employed', 0.0, 0.0),
 ('employment_status_self_employed', 0.0, 0.0),
 ('employment_status_student', 0.0, 0.0),
 ('employment_status_unemployed', 0.0, 0.0),
 ('lead_source_referral', 0.0034129692832763903, 0.0034129692832763903),
 ('lead_score', -0.0

<a id="6"></a>
# <div style="font-family: 'Playfair Display', serif; font-weight:bold; letter-spacing: 1px; color:#FAF0EF; font-size:150%; text-align:left; padding:10px; background:#ddad0fff; border-radius: 10px;"> 📒Question 6
</div>

Regularized logistic regression

In [45]:
C_values = [0.01, 0.1, 1, 10, 100]
best_accuracy = 0
best_C = None

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(df_train_encoded, y_train)
    y_pred = model.predict(df_val_encoded)
    acc = accuracy_score(y_val, y_pred)
    print(f"C={C}, Accuracy={round(acc, 3)}")
    
    if acc > best_accuracy:
        best_accuracy = acc
        best_C = C

print(f"Best C: {best_C}")

C=0.01, Accuracy=0.696
C=0.1, Accuracy=0.696
C=1, Accuracy=0.696
C=10, Accuracy=0.696
C=100, Accuracy=0.696
Best C: 0.01


<a id="6"></a>
# <div style="font-family: 'Playfair Display', serif; font-weight:bold; letter-spacing: 1px; color:#FAF0EF; font-size:150%; text-align:left; padding:10px; background:#ddad0fff; border-radius: 10px;">🔥 end by Amira mohammedi
</div>