# Ejercicio Clasificación

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = "https://raw.githubusercontent.com/bigdatadatafan/datasets-clase/refs/heads/main/course_lead_scoring.csv"

In [3]:
df = pd.read_csv(data, index_col=False)
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [4]:
n = df.select_dtypes(include=[np.number])
char = df.select_dtypes(exclude=[np.number])

df[n.columns] = n.fillna(0)
df[char.columns] = char.fillna("NA")
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


# Apartado 1

In [5]:
df.industry.mode()

0    retail
Name: industry, dtype: object

# Apartado 2

In [6]:
n.corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.031551,-0.023565,-0.004879,0.435914
annual_income,0.031551,1.0,0.048618,0.005334,0.078256
interaction_count,-0.023565,0.048618,1.0,0.009888,0.374573
lead_score,-0.004879,0.005334,0.009888,1.0,0.193673
converted,0.435914,0.078256,0.374573,0.193673,1.0


# Apartado 3

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

X = df.drop(columns=["converted"])
y = df.converted

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

m_industry = mutual_info_score(y_train, X_train.industry)
m_location = mutual_info_score(y_train, X_train.location)
m_employment_status = mutual_info_score(y_train, X_train.employment_status)
m_lead_source = mutual_info_score(y_train, X_train.lead_source)

In [8]:
m_industry

0.01505912124652415

In [9]:
m_location

0.0034567021901497005

In [10]:
m_employment_status

0.018339335284652453

In [11]:
m_lead_source

0.02819239367832531

# Apartado 4

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer

encoder = LabelEncoder()

dv = DictVectorizer(sparse=False)

X_train_tmp = dv.fit_transform(X_train.to_dict(orient="records"))
X_val_tmp = dv.fit_transform(X_val.to_dict(orient="records"))

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model.fit(X_train_tmp, y_train)

y_pred = model.predict(X_val_tmp)


original_score = accuracy_score(y_val, y_pred)
original_score


0.6962457337883959

# Apartado 5

In [16]:
X_train_tmp = dv.fit_transform(X_train.drop(columns=["industry"]).to_dict(orient="records"))
X_val_tmp = dv.fit_transform(X_val.drop(columns=["industry"]).to_dict(orient="records"))

industry_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
industry_model.fit(X_train_tmp,y_train)

industry_y_pred = industry_model.predict(X_val_tmp)
industry_score = accuracy_score(y_val, y_pred)
industry_score

0.6962457337883959

In [21]:
X_train_tmp = dv.fit_transform(X_train.drop(columns=["employment_status"]).to_dict(orient="records"))
X_val_tmp = dv.fit_transform(X_val.drop(columns=["employment_status"]).to_dict(orient="records"))

emp_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
emp_model.fit(X_train_tmp, y_train)

emp_y_pred = emp_model.predict(X_val_tmp)
emp_score = accuracy_score(y_val, y_pred)
emp_score

0.6962457337883959

In [22]:
X_train_tmp = dv.fit_transform(X_train.drop(columns=["lead_score"]).to_dict(orient="records"))
X_val_tmp = dv.fit_transform(X_val.drop(columns=["lead_score"]).to_dict(orient="records"))

ld_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
ld_model.fit(X_train_tmp,y_train)

ld_y_pred = ld_model.predict(X_val_tmp)
ld_score = accuracy_score(y_val, y_pred)
ld_score

0.6962457337883959

# Apartado 6

In [25]:
C = [0.01, 0.1, 1, 10, 100]
results = []
X_train_tmp = dv.fit_transform(X_train.to_dict(orient="records"))
X_val_tmp = dv.fit_transform(X_val.to_dict(orient="records"))
for c in C:
    c_model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    c_model.fit(X_train_tmp,y_train)
    c_pred = c_model.predict(X_val_tmp)
    c_score = accuracy_score(y_val,c_pred)
    results.append(c_score)
results

[0.7064846416382252,
 0.6962457337883959,
 0.6962457337883959,
 0.6962457337883959,
 0.6962457337883959]