# Explore here

In [1]:
!pip install scikit-surprise --upgrade

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the raw Adult Census Income CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/raw/adult-census-income.csv")

In [4]:
# Preview the first rows of the data as loaded.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
# (rows, columns) of the data as loaded.
df.shape

(32561, 15)

In [6]:
# Turn the '?' placeholder into a missing value so those rows can be dropped afterwards.
df = df.replace(to_replace='?', value=np.nan)

In [7]:
# Remove every row that contains at least one missing value.
df = df.dropna(axis='index', how='any')

In [8]:
# (rows, columns) after dropping the rows with missing values.
df.shape

(30162, 15)

In [9]:
# Preview the first rows after dropping the rows with missing values.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K


In [10]:
# Summary statistics of the numeric columns, transposed so each column is shown as a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,30162.0,38.437902,13.134665,17.0,28.0,37.0,47.0,90.0
fnlwgt,30162.0,189793.83393,105652.971529,13769.0,117627.25,178425.0,237628.5,1484705.0
education.num,30162.0,10.121312,2.549995,1.0,9.0,10.0,13.0,16.0
capital.gain,30162.0,1092.007858,7406.346497,0.0,0.0,0.0,0.0,99999.0
capital.loss,30162.0,88.372489,404.29837,0.0,0.0,0.0,0.0,4356.0
hours.per.week,30162.0,40.931238,11.979984,1.0,40.0,40.0,45.0,99.0


In [11]:
# Column dtypes and non-null counts after dropping the rows with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30162 entries, 1 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30162 non-null  int64 
 1   workclass       30162 non-null  object
 2   fnlwgt          30162 non-null  int64 
 3   education       30162 non-null  object
 4   education.num   30162 non-null  int64 
 5   marital.status  30162 non-null  object
 6   occupation      30162 non-null  object
 7   relationship    30162 non-null  object
 8   race            30162 non-null  object
 9   sex             30162 non-null  object
 10  capital.gain    30162 non-null  int64 
 11  capital.loss    30162 non-null  int64 
 12  hours.per.week  30162 non-null  int64 
 13  native.country  30162 non-null  object
 14  income          30162 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [12]:
# education is removed because education.num already carries the same information.
# relationship is removed because marital.status already holds the marital-status data.
redundant_columns = ['education', 'relationship']
df = df.drop(columns=redundant_columns)

In [13]:
# workclass is removed too: per the original author it overlaps with occupation,
# which is more specific and better balanced.
df = df.drop(labels='workclass', axis='columns')

In [14]:
# One-hot encode sex and income, the two columns with only two values.
# drop_first=True keeps a single indicator column for each of them.
binary_columns = ['sex', 'income']
df = pd.get_dummies(data=df, columns=binary_columns, drop_first=True)

In [15]:
# Cast the boolean indicator columns produced by the one-hot encoding to integers.
df = df.astype({'sex_Male': int, 'income_>50K': int})

In [16]:
# Integer encodings for race, marital.status and occupation.
# Each category's code is its position in the list, so the order of the
# entries below must not change.
race_map = {
    label: code
    for code, label in enumerate([
        'White',
        'Black',
        'Asian-Pac-Islander',
        'Amer-Indian-Eskimo',
        'Other',
    ])
}
marital_status_map = {
    label: code
    for code, label in enumerate([
        'Married-civ-spouse',
        'Never-married',
        'Divorced',
        'Separated',
        'Widowed',
        'Married-spouse-absent',
        'Married-AF-spouse',
    ])
}
occupation_map = {
    label: code
    for code, label in enumerate([
        'Prof-specialty',
        'Craft-repair',
        'Exec-managerial',
        'Adm-clerical',
        'Sales',
        'Other-service',
        'Machine-op-inspct',
        'Transport-moving',
        'Handlers-cleaners',
        'Farming-fishing',
        'Tech-support',
        'Protective-serv',
        'Priv-house-serv',
        'Armed-Forces',
    ])
}

In [17]:
# Replace the three categorical columns with their integer codes.
# Series.map returns NaN for any value missing from the dict (for example when
# this cell is re-run on already-encoded data), so fail loudly instead of
# silently passing NaN on to the model.
for column, mapping in {
    'race': race_map,
    'marital.status': marital_status_map,
    'occupation': occupation_map,
}.items():
    encoded = df[column].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown_values = sorted(df.loc[unmapped, column].astype(str).unique())
        raise ValueError(f"Unmapped values in '{column}': {unknown_values}")
    df[column] = encoded

In [18]:
# (rows, columns) after dropping columns and encoding the categorical ones.
df.shape

(30162, 12)

In [19]:
# Remove every row whose native.country is not the United States.
non_us_rows = df.index[df['native.country'] != 'United-States']
df = df.drop(index=non_us_rows)

In [20]:
# Now remove the native.country column itself.
df = df.drop(labels='native.country', axis='columns')

In [21]:
# Preview the first rows of the fully encoded data.
df.head()

Unnamed: 0,age,fnlwgt,education.num,marital.status,occupation,race,capital.gain,capital.loss,hours.per.week,sex_Male,income_>50K
1,82,132870,9,4,2,0,0,4356,18,0,0
3,54,140359,4,2,6,0,0,3900,40,0,0
4,41,264663,10,3,0,0,0,3900,40,0,0
5,34,216864,9,2,5,0,0,3770,45,0,0
6,38,150601,6,3,3,0,0,3770,40,1,0


In [22]:
# Column dtypes and non-null counts of the encoded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27504 entries, 1 to 32560
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             27504 non-null  int64
 1   fnlwgt          27504 non-null  int64
 2   education.num   27504 non-null  int64
 3   marital.status  27504 non-null  int64
 4   occupation      27504 non-null  int64
 5   race            27504 non-null  int64
 6   capital.gain    27504 non-null  int64
 7   capital.loss    27504 non-null  int64
 8   hours.per.week  27504 non-null  int64
 9   sex_Male        27504 non-null  int64
 10  income_>50K     27504 non-null  int64
dtypes: int64(11)
memory usage: 2.5 MB


In [23]:
# (rows, columns) of the data that goes into the model.
df.shape

(27504, 11)

In [None]:
# The original author frames this as a content-based recommender: a Random
# Forest is trained to estimate the probability of earning more than 50K.
X = df.drop(columns='income_>50K')
y = df['income_>50K']
# NOTE(review): X_test / y_test are not used in any visible cell, so the model
# is never evaluated on held-out data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Seed the forest as well (the split is already seeded). Without a seed the
# fitted trees differ between runs, so the probabilities printed in the
# following cells are not reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
# Simulated user profile; the keys follow the feature columns of the encoded data.
# Categorical codes are looked up in the encoding maps instead of being typed
# by hand, so the profile stays correct if the maps are ever renumbered.
usuario = {
    'age': 25,
    'fnlwgt': 100000,
    'education.num': 9,
    'marital.status': marital_status_map['Never-married'],
    'occupation': occupation_map['Other-service'],
    'race': race_map['White'],
    'capital.gain': 0,
    'capital.loss': 0,
    'hours.per.week': 20,
    'sex_Male': 1
}

In [34]:
# Score the simulated user. The second column of predict_proba is reported
# as the probability of earning more than 50K.
usuario_df = pd.DataFrame(data=[usuario])
probabilidades = model.predict_proba(usuario_df)
probabilidad = probabilidades[0, 1]
print(f"Probabilidad que este usuario gane >50K: {probabilidad:.2f}")

Probabilidad que este usuario gane >50K: 0.00


In [None]:
# User 2: the base profile with a different age, education, weekly hours,
# occupation and marital status.
# The overrides are applied to a copy so the shared `usuario` dict is not
# mutated and re-running earlier cells keeps giving the same result.
usuario2 = pd.DataFrame([{
    **usuario,
    'age': 55,
    'education.num': 13,
    'hours.per.week': 40,
    'occupation': occupation_map['Prof-specialty'],
    'marital.status': marital_status_map['Married-spouse-absent'],
}])
probabilidad2 = model.predict_proba(usuario2)[0][1]
print(f"Probabilidad de ingresar >50k con usuario 2: {probabilidad2:.2f}")

Probabilidad de ingresar >50k con usuario 2: 0.42


In [43]:
# User 3: the base profile with a different age, education, weekly hours,
# occupation, marital status and capital gain.
# The overrides are applied to a copy so the shared `usuario` dict is not
# mutated and re-running earlier cells keeps giving the same result.
usuario3 = pd.DataFrame([{
    **usuario,
    'age': 46,
    'education.num': 16,
    'hours.per.week': 50,
    'occupation': occupation_map['Prof-specialty'],
    'marital.status': marital_status_map['Married-civ-spouse'],
    # Original author's note: without this feature the user gets the same probability.
    'capital.gain': 10000,
}])
# Variable name corrected from the misspelled `probabiidad3`.
probabilidad3 = model.predict_proba(usuario3)[0][1]
print(f"Probabilidad de ingresar >50k con usuario 3: {probabilidad3:.2f}")

Probabilidad de ingresar >50k con usuario 3: 0.94


Con estos tres perfiles de usuario que se han hecho para el sistema de recomendación basado en contenido, vemos cómo la trayectoria educativa y laboral aumenta la probabilidad de ingresar más de 50k. Columnas como education.num, occupation, hours.per.week y sex_Male son esenciales para determinar la probabilidad.