| Column                   | Description                                                                      |
|------------------------- |--------------------------------------------------------------------------------- |
| `student_id`             | A unique ID for each student.                                                    |
| `city`                   | A code for the city the student lives in.                                        |
| `city_development_index` | A scaled development index for the city.                                         |
| `gender`                 | The student's gender.                                                            |
| `relevant_experience`    | An indicator of whether the student has relevant work experience.                |
| `enrolled_university`    | The type of university course enrolled in (if any).                              |
| `education_level`        | The student's education level.                                                   |
| `major_discipline`       | The educational discipline of the student.                                       |
| `experience`             | The student's total work experience (in years).                                  |
| `company_size`           | The number of employees at the student's current employer.                       |
| `company_type`           | The type of company employing the student.                                       |
| `last_new_job`           | The number of years between the student's current and previous jobs.             |
| `training_hours`         | The number of hours of training completed.                                       |
| `job_change`             | An indicator of whether the student is looking for a new job (`1`) or not (`0`). |

In [39]:
import pandas as pd
import numpy as np

# Load the training data from the CSV file and preview its first rows.
ds_jobs = pd.read_csv("customer_train.csv")
ds_jobs.head()

Unnamed: 0,student_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,job_change
0,8949,city_103,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevant experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevant experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevant experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


## Transformando tipos de dados do dataset


In [40]:
# Work on a copy so the dtype conversions below leave the raw `ds_jobs` frame untouched.
ds_jobs_transformed = ds_jobs.copy()

In [41]:
# Baseline: dtypes and non-null counts of the raw frame before any conversion.
ds_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   student_id              19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevant_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  job_change              19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

Transforma em categorias ordenadas

In [42]:
# Ordered categorical for enrollment status: no enrollment < part time < full time.
# Values outside the listed categories (including missing ones) become NaN.
enrollment_dtype = pd.CategoricalDtype(
    categories=["no_enrollment", "Part time course", "Full time course"],
    ordered=True,
)
ds_jobs_transformed["enrolled_university"] = ds_jobs_transformed["enrolled_university"].astype(enrollment_dtype)
ds_jobs_transformed["enrolled_university"].cat.categories

Index(['no_enrollment', 'Part time course', 'Full time course'], dtype='object')

In [43]:
# Ordered categorical for education level, from lowest to highest.
# Values outside the listed categories (including missing ones) become NaN.
education_dtype = pd.CategoricalDtype(
    categories=["Primary School", "High School", "Graduate", "Masters", "Phd"],
    ordered=True,
)
ds_jobs_transformed["education_level"] = ds_jobs_transformed["education_level"].astype(education_dtype)
ds_jobs_transformed["education_level"].cat.categories

Index(['Primary School', 'High School', 'Graduate', 'Masters', 'Phd'], dtype='object')

In [44]:
# Ordered categorical for years of experience: "<1", then "1" through "20", then ">20".
# The labels are strings, so the explicit category order is what makes comparisons meaningful.
experience_labels = ["<1"] + [str(years) for years in range(1, 21)] + [">20"]
experience_dtype = pd.CategoricalDtype(categories=experience_labels, ordered=True)
ds_jobs_transformed["experience"] = ds_jobs_transformed["experience"].astype(experience_dtype)
ds_jobs_transformed["experience"].cat.categories

Index(['<1', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '>20'],
      dtype='object')

In [45]:
# Ordered categorical for employer size, from the smallest bracket to the largest.
# Values outside the listed categories (including missing ones) become NaN.
company_size_dtype = pd.CategoricalDtype(
    categories=[
        "<10",
        "10-49",
        "50-99",
        "100-499",
        "500-999",
        "1000-4999",
        "5000-9999",
        "10000+",
    ],
    ordered=True,
)
ds_jobs_transformed["company_size"] = ds_jobs_transformed["company_size"].astype(company_size_dtype)
ds_jobs_transformed["company_size"].cat.categories

Index(['<10', '10-49', '50-99', '100-499', '500-999', '1000-4999', '5000-9999',
       '10000+'],
      dtype='object')

In [46]:
# Ordered categorical for the gap since the previous job: "never" < "1" < ... < "4" < ">4".
# Values outside the listed categories (including missing ones) become NaN.
last_new_job_dtype = pd.CategoricalDtype(
    categories=["never", "1", "2", "3", "4", ">4"],
    ordered=True,
)
ds_jobs_transformed["last_new_job"] = ds_jobs_transformed["last_new_job"].astype(last_new_job_dtype)
ds_jobs_transformed["last_new_job"].cat.categories

Index(['never', '1', '2', '3', '4', '>4'], dtype='object')

Transforma em categorias sem ordem

In [47]:
# Check which columns are still stored as plain `object` and summarise each one;
# these are the candidates for conversion to unordered categories.
object_columns = [
    name for name, dtype in ds_jobs_transformed.dtypes.items() if dtype == object
]
for name in object_columns:
    print(ds_jobs_transformed[name].describe())

count        19158
unique         123
top       city_103
freq          4355
Name: city, dtype: object
count     14650
unique        3
top        Male
freq      13221
Name: gender, dtype: object
count                       19158
unique                          2
top       Has relevant experience
freq                        13792
Name: relevant_experience, dtype: object
count     16345
unique        6
top        STEM
freq      14492
Name: major_discipline, dtype: object
count       13018
unique          6
top       Pvt Ltd
freq         9817
Name: company_type, dtype: object


In [48]:
# Convert every remaining `object` column into an unordered categorical.
# `dtypes` is read once up front; each column is visited a single time.
for name, dtype in ds_jobs_transformed.dtypes.items():
    if dtype != object:
        continue
    ds_jobs_transformed[name] = ds_jobs_transformed[name].astype("category")

Transforma em Int32

In [49]:
# Downcast the two integer columns from int64 to int32 and confirm the resulting dtypes.
int_columns = ["student_id", "training_hours"]
ds_jobs_transformed[int_columns] = ds_jobs_transformed[int_columns].astype("int32")
ds_jobs_transformed[int_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   student_id      19158 non-null  int32
 1   training_hours  19158 non-null  int32
dtypes: int32(2)
memory usage: 149.8 KB


Transforma em booleano

In [50]:
# Plain cast: the column holds 1.0 / 0.0 flags, which become True / False.
job_change_flags = ds_jobs_transformed["job_change"].astype("bool")
ds_jobs_transformed["job_change"] = job_change_flags
ds_jobs_transformed["job_change"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 19158 entries, 0 to 19157
Series name: job_change
Non-Null Count  Dtype
--------------  -----
19158 non-null  bool 
dtypes: bool(1)
memory usage: 18.8 KB


In [51]:
# Convert the two text labels of `relevant_experience` into a real boolean column.
mapeamento = {"No relevant experience": False, "Has relevant experience": True}
# Skip the conversion when the column is already boolean, so re-running this cell is a no-op.
if ds_jobs_transformed["relevant_experience"].dtype != bool:
    # The column is categorical at this point (all object columns were converted above).
    # `Series.replace` on a categorical column is deprecated in recent pandas, so go
    # through plain objects and `.map` instead.
    flags = ds_jobs_transformed["relevant_experience"].astype(object).map(mapeamento)
    # `.map` yields NaN for any value missing from the mapping; `astype(bool)` would
    # silently turn those into True, so fail loudly instead.
    if flags.isna().any():
        raise ValueError("relevant_experience contains values missing from `mapeamento`")
    ds_jobs_transformed["relevant_experience"] = flags.astype(bool)
ds_jobs_transformed["relevant_experience"].describe()

count     19158
unique        2
top        True
freq      13792
Name: relevant_experience, dtype: object

Transforma em Float16

In [52]:
# Store the development index as a 16-bit float to save memory.
# NOTE(review): float16 rounds the values (0.92 is later shown as 0.919922) — confirm
# this precision is acceptable for downstream use.
index_as_float16 = ds_jobs_transformed["city_development_index"].astype("float16")
ds_jobs_transformed["city_development_index"] = index_as_float16
ds_jobs_transformed["city_development_index"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 19158 entries, 0 to 19157
Series name: city_development_index
Non-Null Count  Dtype  
--------------  -----  
19158 non-null  float16
dtypes: float16(1)
memory usage: 37.5 KB


In [53]:
# Review the dtypes of every column after all the conversions above.
ds_jobs_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   student_id              19158 non-null  int32   
 1   city                    19158 non-null  category
 2   city_development_index  19158 non-null  float16 
 3   gender                  14650 non-null  category
 4   relevant_experience     19158 non-null  bool    
 5   enrolled_university     18772 non-null  category
 6   education_level         18698 non-null  category
 7   major_discipline        16345 non-null  category
 8   experience              19093 non-null  category
 9   company_size            13220 non-null  category
 10  company_type            13018 non-null  category
 11  last_new_job            18735 non-null  category
 12  training_hours          19158 non-null  int32   
 13  job_change              19158 non-null  bool    
dtypes: bool(2), category(9

Atingida uma redução no uso de memória de 2.0+ MB para 400.2 KB

## Filtrando o dataset


Aplicando filtros: estudantes com 10+ anos de experiência e empresas com 1000+ funcionários

In [54]:
# Keep only students with at least 10 years of experience who work at companies
# with 1000 or more employees. Both comparisons rely on the ordered categories
# defined earlier; rows with a missing value in either column compare as False
# and are dropped.
has_10_plus_years = ds_jobs_transformed["experience"] >= "10"
at_1000_plus_company = ds_jobs_transformed["company_size"] >= "1000-4999"
ds_jobs_transformed = ds_jobs_transformed.loc[has_10_plus_years & at_1000_plus_company]
ds_jobs_transformed[["company_type", "experience"]].value_counts()

company_type   experience
Pvt Ltd        >20           692
               10            200
               15            145
               11            127
               14            120
Public Sector  >20           104
Pvt Ltd        16            101
               12             87
               13             82
               17             78
               19             73
               18             61
               20             41
NGO            >20            39
Public Sector  10             25
               14             16
               13             15
               16             14
               12             13
               11             11
Other          >20            10
Public Sector  15              9
NGO            16              8
               10              8
Public Sector  17              7
               19              7
NGO            14              6
               15              6
               13              5
Public Sector  20

In [55]:
# Number of rows and columns left after filtering.
ds_jobs_transformed.shape

(2201, 14)

In [56]:
# Dtypes and non-null counts of the filtered frame.
ds_jobs_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2201 entries, 9 to 19143
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   student_id              2201 non-null   int32   
 1   city                    2201 non-null   category
 2   city_development_index  2201 non-null   float16 
 3   gender                  1821 non-null   category
 4   relevant_experience     2201 non-null   bool    
 5   enrolled_university     2185 non-null   category
 6   education_level         2184 non-null   category
 7   major_discipline        2097 non-null   category
 8   experience              2201 non-null   category
 9   company_size            2201 non-null   category
 10  company_type            2144 non-null   category
 11  last_new_job            2184 non-null   category
 12  training_hours          2201 non-null   int32   
 13  job_change              2201 non-null   bool    
dtypes: bool(2), category(9), flo

In [57]:
# Display the filtered frame (pandas abbreviates the middle rows of a long frame).
ds_jobs_transformed

Unnamed: 0,student_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,job_change
9,699,city_103,0.919922,,True,no_enrollment,Graduate,STEM,17,10000+,Pvt Ltd,>4,123,False
12,25619,city_61,0.913086,Male,True,no_enrollment,Graduate,STEM,>20,1000-4999,Pvt Ltd,3,23,False
31,22293,city_103,0.919922,Male,True,Part time course,Graduate,STEM,19,5000-9999,Pvt Ltd,>4,141,False
34,26494,city_16,0.910156,Male,True,no_enrollment,Graduate,Business Degree,12,5000-9999,Pvt Ltd,3,145,False
40,2547,city_114,0.925781,Female,True,Full time course,Masters,STEM,16,1000-4999,Public Sector,2,14,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19097,25447,city_103,0.919922,Male,True,no_enrollment,Graduate,STEM,>20,1000-4999,Pvt Ltd,>4,57,False
19101,6803,city_16,0.910156,Male,True,no_enrollment,High School,,10,10000+,Pvt Ltd,1,89,False
19103,32932,city_10,0.895020,Male,True,Part time course,Masters,Other,>20,1000-4999,Pvt Ltd,>4,18,False
19128,3365,city_16,0.910156,,True,no_enrollment,Graduate,Humanities,>20,1000-4999,Pvt Ltd,>4,23,False
