In [1]:
import pandas as pd
import numpy as np

# **Estimativas de probabilidades**


## Leitura do conjunto de dados

In [2]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data")

In [3]:
df.columns = ["age","workclass","finalweight","education",
                 "education-num","martial-status","occupation",
                 "relationship","race","sex","capital-gain",
                 "capital-loss","hours-per-week ","native-country",
                 "income"]

for col in df.columns:
    if df[col].dtype == object:
        df[col] = df[col].str.strip()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32560 non-null  int64 
 1   workclass        32560 non-null  object
 2   finalweight      32560 non-null  int64 
 3   education        32560 non-null  object
 4   education-num    32560 non-null  int64 
 5   martial-status   32560 non-null  object
 6   occupation       32560 non-null  object
 7   relationship     32560 non-null  object
 8   race             32560 non-null  object
 9   sex              32560 non-null  object
 10  capital-gain     32560 non-null  int64 
 11  capital-loss     32560 non-null  int64 
 12  hours-per-week   32560 non-null  int64 
 13  native-country   32560 non-null  object
 14  income           32560 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
df.head()

Unnamed: 0,age,workclass,finalweight,education,education-num,martial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


###  Probabilidade de uma pessoa ter mais de 80 anos (ou seja, Age > 80).

$$
\Pr(\text{Age > 80})
$$

In [6]:
res = df[df.age > 80]
idadeMaior80 = len(res)
tamanho_espaco_amostral = len(df)
print(f"Probabilidade de uma pessoa ter mais de 80 anos: {idadeMaior80/tamanho_espaco_amostral}")

Probabilidade de uma pessoa ter mais de 80 anos: 0.0030405405405405407


###  Probabilidade de uma pessoa ter mais de 80 anos (ou seja, Age > 80) dado que tem Salary > 50000.

$$
\Pr(\text{Age > 80} \mid \text{Salary > 50000})
$$

In [7]:
res = df[df.income == '>50K']
salarioMaior50k = len(res)
idadeMaior80 = len(res[res['age'] > 80])
print(f"Probabilidade de uma pessoa ter mais de 80 anos dado que tem Salary > 50000 é de {idadeMaior80/salarioMaior50k}")

Probabilidade de uma pessoa ter mais de 80 anos dado que tem Salary > 50000 é de 0.0017854865450835354


###  Probabilidade de uma pessoa ter mais de 80 anos e salário maior que 50000.

$$
\Pr(\text{Age > 80}, \text{Salary > 50000})
$$

In [8]:
res = df[(df['income'] == '>50K') &
               (df['age'] > 80)]
tamanho_espaco_amostral = len(df)
idadeMaior80 = len(res)
print(f"Probabilidade de uma pessoa ter mais de 80 anos e salario > 50000: {idadeMaior80/tamanho_espaco_amostral}")

Probabilidade de uma pessoa ter mais de 80 anos e salario > 50000: 0.00042997542997542996


$$
\Pr(\text{Workclass = State-gov}, \text{Occupation = Adm-clerical}, \text{Sex = Male})
$$

In [9]:
res = df[(df['workclass'] == 'State-gov') &
               (df['occupation'] == 'Adm-clerical') &
               (df['sex'] == "Male")]

resultado = len(res)
print(f"Probabilidade: {resultado/tamanho_espaco_amostral}")

Probabilidade: 0.002794840294840295


### Pr(Workclass = Self-emp-inc, Occupation = Exec-managerial, Sex = Male)

In [10]:
res = df[(df['workclass'] == 'Self-emp-not-inc') &
               (df['occupation'] == 'Exec-managerial') &
               (df['sex'] == "Male")]
resultado = len(res)
print(f"Probabilidade: {resultado/tamanho_espaco_amostral}")

Probabilidade: 0.010288697788697789


## Divisão do conjunto de dados (treino/teste)

In [11]:
df.income.unique()

array(['<=50K', '>50K'], dtype=object)

In [12]:
df.shape

(32560, 15)

In [13]:
df_temp = df[df.income == "<=50K"]
df_temp.shape

(24719, 15)

In [14]:
df_temp.shape[0]/df.shape[0]

0.7591830466830467

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = df.drop('income', axis=1)
y = df.income

le = LabelEncoder()
le.fit(['<=50K', '>50K'])
print(list(le.classes_))
le.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0,
                                                    stratify=y)

['<=50K', '>50K']


In [16]:
X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape

((32560, 14), (32560,), (26048, 14), (26048,), (6512, 14), (6512,))

In [17]:
y[:20]

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
5     <=50K
6      >50K
7      >50K
8      >50K
9      >50K
10     >50K
11    <=50K
12    <=50K
13     >50K
14    <=50K
15    <=50K
16    <=50K
17    <=50K
18     >50K
19     >50K
Name: income, dtype: object

In [23]:
X_train[:5]

Unnamed: 0,age,workclass,finalweight,education,education-num,martial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
29101,36,Private,109133,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States
10606,25,Private,204219,HS-grad,9,Never-married,Adm-clerical,Unmarried,White,Female,0,0,40,Mexico
9414,26,Self-emp-not-inc,318644,Prof-school,15,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States
1747,27,Private,219371,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,35,United-States
10677,42,Self-emp-not-inc,444134,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,15,United-States


In [19]:
y[:5]

0    <=50K
1    <=50K
2    <=50K
3    <=50K
4    <=50K
Name: income, dtype: object

In [20]:
y_train[:5]

29101    <=50K
10606    <=50K
9414     <=50K
1747     <=50K
10677    <=50K
Name: income, dtype: object

In [21]:
list(le.inverse_transform([0, 0, 1]))

['<=50K', '<=50K', '>50K']

# Referências

- [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)

- [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
