In [1]:
import pandas as pd
import numpy as np

In [2]:
# Getting data from a URL
import pandas as pd

url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

# Optionally inspect
print(df.head())

# Save to Excel
df.to_excel("course_lead_scoring.xlsx", index=False)


    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  


In [3]:
df=pd.read_excel("course_lead_scoring.xlsx")

In [16]:
import pandas as pd

# Read the Excel file
df = pd.read_excel("course_lead_scoring.xlsx")

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Handle missing values
for col in df.columns:
    if df[col].dtype == 'object':  # Categorical features
        df[col] = df[col].fillna('NA')
    else:  # Numerical features
        df[col] = df[col].fillna(0.0)

# Verify that all missing values are handled
print("\nMissing values after filling:")
print(df.isnull().sum())


Missing values per column:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

Missing values after filling:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [17]:
df['industry'].mode()[0]

'retail'

In [18]:
# Compute the correlation matrix
corr_matrix = df.corr(numeric_only=True)

# Find the pair with the largest correlation (excluding self-correlations)
corr_unstacked = corr_matrix.unstack().sort_values(ascending=False)

# Drop self-correlations (where correlation = 1)
corr_unstacked = corr_unstacked[corr_unstacked < 1]

# Get the top correlation pair
biggest_corr = corr_unstacked.idxmax()
corr_value = corr_unstacked.max()

print(f"The biggest correlation is between {biggest_corr[0]} and {biggest_corr[1]} with a value of {corr_value:.2f}")


The biggest correlation is between converted and number_of_courses_viewed with a value of 0.44


In [19]:
import pandas as pd


# Compute correlation matrix
corr_matrix = df.corr(numeric_only=True)

# Unstack (flatten) the matrix to get all pairs
corr_pairs = corr_matrix.unstack()

# Remove duplicate pairs and self-correlations (where correlation = 1)
corr_pairs = corr_pairs.drop_duplicates().sort_values(ascending=False)

# Display all correlation scores
print(corr_pairs)


number_of_courses_viewed  number_of_courses_viewed    1.000000
                          converted                   0.435914
interaction_count         converted                   0.374573
lead_score                converted                   0.193673
annual_income             converted                   0.053131
                          interaction_count           0.027036
                          lead_score                  0.015610
interaction_count         lead_score                  0.009888
number_of_courses_viewed  annual_income               0.009770
                          lead_score                 -0.004879
                          interaction_count          -0.023565
dtype: float64


In [20]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [22]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

# Select categorical features
categorical_cols = ['industry', 'location', 'lead_source', 'employment_status']

# Drop rows with missing values in these columns or in target
data = df[categorical_cols + ['converted']].dropna()

# Convert categorical columns to numeric codes
for col in categorical_cols:
    data[col] = data[col].astype('category').cat.codes

# Compute MI
X = data[categorical_cols]
y = data['converted']
mi_scores = mutual_info_classif(X, y, discrete_features=True, random_state=1)

# Create a table of MI results
mi_df = pd.DataFrame({'Feature': categorical_cols, 'MI Score': mi_scores}).sort_values(by='MI Score', ascending=False)
print(mi_df)


             Feature  MI Score
2        lead_source  0.026574
3  employment_status  0.011070
0           industry  0.007267
1           location  0.001427


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Select features (you can use the most relevant ones)
features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
X = df[features]
y = df['converted']

# Handle missing values
X = X.fillna(X.median())

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Train a model
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

# Predict and calculate accuracy
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {acc:.2f}")


Model Accuracy: 0.77


In [24]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# Select numeric features only for simplicity
X = df[['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']].fillna(0)
y = df['converted']

# Define pipeline: scaling + logistic regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

# Parameter grid
param_grid = {
    'logreg__C': [0.01, 0.1, 1, 10, 100]
}

# Grid search
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid.fit(X, y)

# Best parameter and score
print("Best C:", grid.best_params_['logreg__C'])
print("Best cross-validation accuracy:", grid.best_score_)


Best C: 10
Best cross-validation accuracy: 0.8187152274533638


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Use only one feature, for example
X = df[['number_of_courses_viewed']].fillna(0)
y = df['converted']

# Simple train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Test each C
for C in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print(f"C={C}: Accuracy={acc:.2f}")


C=0.01: Accuracy=0.71
C=0.1: Accuracy=0.71
C=1: Accuracy=0.71
C=10: Accuracy=0.71
C=100: Accuracy=0.71
