# Fusion des datasets annuels et cleaning

Années à fusionner :
- 2017
- 2018
- 2019
- 2020

## Import des librairies

In [1]:
import pandas as pd
import numpy as np
import re
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer

## Lecture et concaténation des datasets annuels

In [2]:
df1 = pd.read_csv("./Data/dataset-2017.csv")
df2 = pd.read_csv("./Data/dataset-2018.csv")
df3 = pd.read_csv("./Data/dataset-2019.csv")
df4 = pd.read_csv("./Data/dataset-2020.csv")

In [3]:
df = pd.concat([df1,df2,df3,df4])

In [4]:
df.head(20)

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,site,facturation,entrée_clt,sortie_clt,secteur,forme
0,AAGS2017010396900,AGS,2017.0,5782,2.0,JOP,['029'],['AUTRES TRAVAUX EN SOCIAL'],43.0,5231.25,PITHIVIERS,4550.0,0,1,Fabrication de charpentes et d'autres menuiseries,SAS
1,AAGS2017010397000,AGS,2017.0,6565,1.0,FRL,['001'],['RENSEIGNEMENTS CLIENT'],1.0,68.0,GIEN,750.0,0,0,Activités d'architecture,EURL
2,AAGS2017010534300,AGS,2017.0,6924,2.0,JOP,"['029', '024', '001']","['AUTRES TRAVAUX EN SOCIAL', 'FORMALITES DE LI...",31.25,3868.0,PITHIVIERS,4804.2,0,0,Fabrication de meubles de bureau et de magasin,SAS
3,ACAB2017009616200,CAB,2017.0,9230,12.0,NON FACTURABLE,"['081', '096', '091', '079', '078', '082', '07...","['COMPTABILITE GENERALE CABINET', 'DOCUMENTATI...",2049.5,222874.75,NON FACTURABLE,0.0,0,0,Divers,Particulier
4,ACAB2017009616300,CAB,2017.0,9000,98.0,NON FACTURABLE,"['083', '082', '079', '096', '075', '010', '09...","['NON FACTURABLE SOCIAL', 'NON FACTURABLE COMP...",2279.64,151143.13,NON FACTURABLE,0.0,0,0,Divers,Particulier
5,ACAB2017009616400,CAB,2017.0,9010,16.0,NON FACTURABLE,"['086', '088', '082', '076', '072', '098', '089']","['VERIFICATION FINALE TRAVAUX', 'ARCHIVAGE DOS...",509.8,61462.5,NON FACTURABLE,0.0,0,0,Divers,Particulier
6,ACAB2017009616500,CAB,2017.0,9020,58.0,NON FACTURABLE,"['092', '083', '089', '096', '082', '079', '08...","['FORMATION PROFESSIONNELLE INTERNE', 'NON FAC...",4958.35,354287.62,NON FACTURABLE,0.0,0,0,Divers,Divers
7,ACAB2017009616600,CAB,2017.0,9050,18.0,NON FACTURABLE,"['084', '077', '061', '079', '076', '089', '08...","['NON FACTURABLE JURIDIQUE', 'SECRETARIAT CABI...",726.25,62863.25,NON FACTURABLE,0.0,0,0,Divers,Particulier
8,ACAB2017009616700,CAB,2017.0,9060,3.0,NON FACTURABLE,['085'],['NON FACTURABLE COMMISSARIAT'],20.75,1377.5,NON FACTURABLE,0.0,0,0,Divers,Particulier
9,ACAB2017009616800,CAB,2017.0,9070,101.0,NON FACTURABLE,"['088', '089', '077', '082', '079', '075', '08...","['ARCHIVAGE DOSSIERS CLASSEMENT', 'ASSISTANCE ...",1516.4,91647.65,NON FACTURABLE,0.0,0,0,Divers,Particulier


In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35583 entries, 0 to 35582
Columns: 197 entries, code_mission to snc
dtypes: float64(4), int64(183), object(10)
memory usage: 53.5+ MB


### Le dataset final se compose de 35 583 lignes et 197 colonnes

## Vérification des valeurs nulles

In [6]:
df.isna().sum()

code_mission          0
type_mission          0
exercice              0
code_client           0
total_assistant       0
manager               0
code_article          0
libelle_article       0
temps                 0
valorisation_temps    0
site                  0
facturation           0
entrée_clt            0
sortie_clt            0
secteur               0
forme                 0
dtype: int64

In [7]:
df.isnull().sum()

code_mission          0
type_mission          0
exercice              0
code_client           0
total_assistant       0
manager               0
code_article          0
libelle_article       0
temps                 0
valorisation_temps    0
site                  0
facturation           0
entrée_clt            0
sortie_clt            0
secteur               0
forme                 0
dtype: int64

## Données catégorielles

In [8]:
df.dtypes

code_mission           object
type_mission           object
exercice              float64
code_client            object
total_assistant       float64
manager                object
code_article           object
libelle_article        object
temps                 float64
valorisation_temps    float64
site                   object
facturation           float64
entrée_clt              int64
sortie_clt              int64
secteur                object
forme                  object
dtype: object

Colonnes catégorielles :
- 'code_mission'
- 'type_mission'
- 'code_client'
- 'manager'
- 'code_article'
- 'libelle_article'
- 'site'
- 'secteur'
- 'forme'

In [9]:
#changer 'exercice' en catégorielle

In [10]:
#Le code mission est bien un identifiant unique. Il y a 35 740 missions réalisées depuis 2017 à 2020.
df.code_mission.value_counts().sort_values(ascending=True)

ASOP2019012303200    1
AGES2018011613000    1
APOL2018011475000    1
ASOP2017010291600    1
ACOM2017012086800    1
                    ..
AJUR2018008934500    1
ACOM2018010563200    1
ASOP2018011439200    1
ASOP2019012196200    1
ACOM2019009954200    1
Name: code_mission, Length: 35739, dtype: int64

In [11]:
df.type_mission.value_counts()

COM    14580
JUR     7186
SOP     4924
GEJ     2947
GS0     1282
SAP      902
GES      888
GE0      843
GS3      500
GS1      467
POL      327
GEP      293
CAC      219
CAB      141
GE1       53
GJ0       49
GS2       29
GS4       26
TB        25
GJ1       22
AGS       10
GE2        7
GEC        3
CAT        3
GE3        3
GJ2        3
GJ3        2
GJ5        1
GJ4        1
GJ6        1
CAP        1
GJ7        1
Name: type_mission, dtype: int64

- Service Commissaire Aux Comptes : 
    CAC
    CAP
    CAT
- Service Juridique :
    JUR
    GEJ
- Service Paie :
    GEP
    POL
    SOP
- Service social :
    AGS
    GES
    GS0
    GS1
    GS2
    GS3
    GS4

In [12]:
df.code_client.value_counts()

77254    29
4794     28
7149     27
3295     26
7402     26
         ..
A3204     1
A2324     1
A3535     1
A2516     1
A2558     1
Name: code_client, Length: 6403, dtype: int64

In [13]:
df.manager.value_counts()

SLE                      5502
DAC                      4504
GSO                      4103
VAL                      3479
FRL                      3436
EMG                      3370
NAB                      2979
JOP                      2792
THB                      1706
CBA                      1299
STL                      1212
ALD                       377
JOD                       292
CIT                       259
_OLP NE PLUS UTILISER     182
CHB                       119
NON FACTURABLE             80
_FAY NE PLUS UTILISER      42
DIVERS                      5
CAV                         1
Name: manager, dtype: int64

### 19 managers associés dans ce dataset

In [14]:
#Il y a 11 394 listes d'activité sur l'ensemble des missions
df.libelle_article.value_counts().head()

['SECRETARIAT JURIDIQUE ANNUEL']                        3286
['DIVERS JURIDIQUE']                                    1442
['CONTRATS DE TRAVAIL']                                 1262
['AUTRES DECLARATIONS FISCALES']                         931
['DIVERS JURIDIQUE', 'SECRETARIAT JURIDIQUE ANNUEL']     865
Name: libelle_article, dtype: int64

In [15]:
#Quand le site = CLIENT PARTI, cela signifie que le client a quitté le cabinet avant 2017
df[df['site'] == "CLIENT PARTI"]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,site,facturation,entrée_clt,sortie_clt,secteur,forme
193,ACOM2017008083600,COM,2017.0,79402,1.0,GSO,['016'],['AUTRES DECLARATIONS FISCALES'],0.25,16.50,CLIENT PARTI,7160.0,0,0,Services funéraires,SARL
791,ACOM2017008292000,COM,2017.0,5711,1.0,GSO,['060'],['SECRETARIAT JURIDIQUE ANNUEL'],0.50,34.00,CLIENT PARTI,0.0,0,0,Location de terrains et d'autres biens immobil...,SCI
841,ACOM2017008299400,COM,2017.0,5985,1.0,DAC,['060'],['SECRETARIAT JURIDIQUE ANNUEL'],0.25,18.50,CLIENT PARTI,0.0,0,0,Divers,SCI
930,ACOM2017008310700,COM,2017.0,2395,1.0,FRL,['021'],['DECLARATIONS MENSUELLES / TRIMESTRIELLES'],0.50,28.00,CLIENT PARTI,0.0,0,0,Autres commerces de détail sur éventaires et m...,IND
1104,ACOM2017008333700,COM,2017.0,6834,1.0,FRL,['020'],['FICHE DE PAIE'],0.50,28.00,CLIENT PARTI,0.0,0,0,Divers,IND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5828,AJUR2019010696700,JUR,2019.0,7340,1.0,EMG,['060'],['SECRETARIAT JURIDIQUE ANNUEL'],0.50,31.50,CLIENT PARTI,0.0,0,0,Coiffure,SARL
7356,AJUR2019011794600,JUR,2019.0,6448,3.0,SLE,"['069', '060', 69]","['DIVERS JURIDIQUE', 'SECRETARIAT JURIDIQUE AN...",3.25,203.25,CLIENT PARTI,180.0,0,0,Location de terrains et d'autres biens immobil...,SCI
8250,ASOP2019012251400,SOP,2019.0,7340,1.0,EMG,"['F0217', 'F02171', 'F02081', '020', '029']","['FORFAIT PRELEVEMENT A LA SOURCE ENTREPRISE',...",4.00,226.00,CLIENT PARTI,170.0,0,0,Coiffure,SARL
4399,AGEJ2020013450900,GEJ,2020.0,6448,1.0,SLE,['069'],['DIVERS JURIDIQUE'],3.00,229.53,CLIENT PARTI,75.0,0,0,Location de terrains et d'autres biens immobil...,SCI


In [16]:
df.site.value_counts()

GIEN                    6249
OLIVET                  6050
PITHIVIERS              4244
ORLEANS                 3181
VILLEMANDEUR            2766
BRIARE                  2401
MEUNG SUR LOIRE         2284
FONTAINEBLEAU           2187
LA FERTE ST AUBIN       2119
YERRES                  1619
ANGERVILLE              1059
EVRY                     814
CLIENT PARTI             226
CAC                      224
VILLENEUVE LA GUYARD     127
NON FACTURABLE            86
DOSSIERS INTERNES         58
MORSANG SUR ORGE          38
MALESHERBES                5
DIVERS                     2
Name: site, dtype: int64

In [17]:
#Le nombre de missions augmente chaque année
df.exercice.value_counts()

2020.0    9926
2018.0    9019
2019.0    8875
2017.0    7919
Name: exercice, dtype: int64

## Convertir la colonne "exercice" en catégorielle

In [18]:
df.exercice = df.exercice.astype(int).astype(str)

In [19]:
df.dtypes

code_mission           object
type_mission           object
exercice               object
code_client            object
total_assistant       float64
manager                object
code_article           object
libelle_article        object
temps                 float64
valorisation_temps    float64
site                   object
facturation           float64
entrée_clt              int64
sortie_clt              int64
secteur                object
forme                  object
dtype: object

In [20]:
df.head()

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,site,facturation,entrée_clt,sortie_clt,secteur,forme
0,AAGS2017010396900,AGS,2017,5782,2.0,JOP,['029'],['AUTRES TRAVAUX EN SOCIAL'],43.0,5231.25,PITHIVIERS,4550.0,0,1,Fabrication de charpentes et d'autres menuiseries,SAS
1,AAGS2017010397000,AGS,2017,6565,1.0,FRL,['001'],['RENSEIGNEMENTS CLIENT'],1.0,68.0,GIEN,750.0,0,0,Activités d'architecture,EURL
2,AAGS2017010534300,AGS,2017,6924,2.0,JOP,"['029', '024', '001']","['AUTRES TRAVAUX EN SOCIAL', 'FORMALITES DE LI...",31.25,3868.0,PITHIVIERS,4804.2,0,0,Fabrication de meubles de bureau et de magasin,SAS
3,ACAB2017009616200,CAB,2017,9230,12.0,NON FACTURABLE,"['081', '096', '091', '079', '078', '082', '07...","['COMPTABILITE GENERALE CABINET', 'DOCUMENTATI...",2049.5,222874.75,NON FACTURABLE,0.0,0,0,Divers,Particulier
4,ACAB2017009616300,CAB,2017,9000,98.0,NON FACTURABLE,"['083', '082', '079', '096', '075', '010', '09...","['NON FACTURABLE SOCIAL', 'NON FACTURABLE COMP...",2279.64,151143.13,NON FACTURABLE,0.0,0,0,Divers,Particulier


In [21]:
#Supprimer les activités liées au cabinet comptable, inutile au ML

In [22]:
list_index_cab = df[df['type_mission'] == "CAB"].index
len(list_index_cab)

141

In [23]:
df.drop(list_index_cab, axis=0, inplace=True)

## Bag of Words sur les types de missions

In [24]:
## Création d'un "bag of words" pour analyser les données des types de missions

In [25]:
text_data = np.array(df['type_mission'].values)
text_data

array(['AGS', 'AGS', 'COM', ..., 'TB', 'TB', 'TB'], dtype=object)

In [26]:
# Create the bag of words feature matrix
count = CountVectorizer()
bow = count.fit_transform(text_data)

# Show feature matrix
bow.toarray()

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [27]:
feature_names = count.get_feature_names()

In [28]:
df_types = pd.DataFrame(bow.toarray(), columns=feature_names)
df_types

Unnamed: 0,ags,cac,cap,cat,com,ge0,ge1,ge2,ge3,gec,...,gs0,gs1,gs2,gs3,gs4,jur,pol,sap,sop,tb
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35578,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35581,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [29]:
#df.head()

In [30]:
df = df.reset_index().drop('index', axis=1)

In [31]:
df = pd.concat([df, df_types], axis=1)

In [32]:
df.head()

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,gs0,gs1,gs2,gs3,gs4,jur,pol,sap,sop,tb
0,AAGS2017010396900,AGS,2017,5782,2.0,JOP,['029'],['AUTRES TRAVAUX EN SOCIAL'],43.0,5231.25,...,0,0,0,0,0,0,0,0,0,0
1,AAGS2017010397000,AGS,2017,6565,1.0,FRL,['001'],['RENSEIGNEMENTS CLIENT'],1.0,68.0,...,0,0,0,0,0,0,0,0,0,0
2,ACOM2017007712400,COM,2017,6635,4.0,GSO,"['010', '012', '001', '016', '011', '019', '013']","['SAISIE ET CENTRALISATION', 'DECLARATIONS T.V...",22.75,1372.5,...,0,0,0,0,0,0,0,0,0,0
3,ACOM2017007718600,COM,2017,6718,1.0,DAC,"['011', '013']","['CONTROLE ET REVISION', 'COMPTES ANNUELS ET ...",7.5,555.0,...,0,0,0,0,0,0,0,0,0,0
4,ACOM2017007719900,COM,2017,6720,3.0,VAL,"['016', '001', '011', '013', '019']","['AUTRES DECLARATIONS FISCALES', 'RENSEIGNEMEN...",17.75,1168.75,...,0,0,0,0,0,0,0,0,0,0


## Bag of Words sur les activités

In [33]:
## Création d'un "bag of words" pour analyser les données des libellés activités

In [34]:
def clean_act(s):
    s_lower = s.lower()
    s_sans_chiffre = re.sub("[0-9]","", s_lower)
    s_sans_par = re.sub(r'\([^)]*\)', '', s_sans_chiffre)
    s_sanscarac = re.sub('[^a-zA-Z\s,]', '', s_sans_par)
    return s_sanscarac

In [35]:
df["libelle_article"] = df.libelle_article.apply(clean_act)

In [36]:
df.head()

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,gs0,gs1,gs2,gs3,gs4,jur,pol,sap,sop,tb
0,AAGS2017010396900,AGS,2017,5782,2.0,JOP,['029'],autres travaux en social,43.0,5231.25,...,0,0,0,0,0,0,0,0,0,0
1,AAGS2017010397000,AGS,2017,6565,1.0,FRL,['001'],renseignements client,1.0,68.0,...,0,0,0,0,0,0,0,0,0,0
2,ACOM2017007712400,COM,2017,6635,4.0,GSO,"['010', '012', '001', '016', '011', '019', '013']","saisie et centralisation, declarations tva, re...",22.75,1372.5,...,0,0,0,0,0,0,0,0,0,0
3,ACOM2017007718600,COM,2017,6718,1.0,DAC,"['011', '013']","controle et revision, comptes annuels et decl...",7.5,555.0,...,0,0,0,0,0,0,0,0,0,0
4,ACOM2017007719900,COM,2017,6720,3.0,VAL,"['016', '001', '011', '013', '019']","autres declarations fiscales, renseignements c...",17.75,1168.75,...,0,0,0,0,0,0,0,0,0,0


In [37]:
text_act = np.array(df.libelle_article.values)
text_act

array(['autres travaux en social', 'renseignements client',
       'saisie et centralisation, declarations tva, renseignements client, autres declarations fiscales, controle et revision, presentation des comptes annuels, comptes annuels  et declarations fiscales',
       ..., 'tableau de bord', 'tableau de bord', 'tableau de bord'],
      dtype=object)

In [38]:
# Create the bag of words feature matrix
count = CountVectorizer(analyzer='word', tokenizer=lambda x: x.split(', '))

bow_act = count.fit_transform(text_act)

# Show feature matrix
bow_act.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [39]:
feature_names = count.get_feature_names()

In [40]:
df_act = pd.DataFrame(bow_act.toarray(), columns=feature_names)
df_act

Unnamed: 0,accompagnement dans le suivi de vos travaux,accompagnement fonction daf,actualisation mensuelle aide a lembauche apprenti,aide a lembauche apprenti,animation et formatioin,assistance au controle fiscal,assistance au controle urssaf,assistance commissaire aux comptes,assistance informatique client,assistance telephonique a lheure,...,secretariat juridique annuel sceaearl,secretariat juridique annuel scmsc,secrtariat juridique annuel sci sc portefeuille,situation intermediaire,tableau de bord,traitement activite partielle sur bulletin de paie,transmission de lentreprise,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35578,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
35579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
35580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
35581,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [41]:
df_act.columns

Index(['accompagnement dans le suivi de vos travaux',
       'accompagnement fonction daf',
       'actualisation mensuelle aide a lembauche  apprenti',
       'aide a lembauche  apprenti', 'animation et formatioin',
       'assistance au controle fiscal', 'assistance au controle urssaf',
       'assistance commissaire aux comptes', 'assistance informatique client',
       'assistance telephonique a lheure',
       ...
       'secretariat juridique annuel sceaearl',
       'secretariat juridique annuel scmsc',
       'secrtariat juridique annuel sci  sc portefeuille',
       'situation intermediaire', 'tableau de bord',
       'traitement activite partielle sur bulletin de paie',
       'transmission de lentreprise', 'travaux de fin de mission',
       'travaux exceptionnels covd', 'verification finale travaux'],
      dtype='object', length=173)

In [42]:
df = pd.concat([df, df_act], axis=1)

In [43]:
df.head()

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,secretariat juridique annuel sceaearl,secretariat juridique annuel scmsc,secrtariat juridique annuel sci sc portefeuille,situation intermediaire,tableau de bord,traitement activite partielle sur bulletin de paie,transmission de lentreprise,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux
0,AAGS2017010396900,AGS,2017,5782,2.0,JOP,['029'],autres travaux en social,43.0,5231.25,...,0,0,0,0,0,0,0,0,0,0
1,AAGS2017010397000,AGS,2017,6565,1.0,FRL,['001'],renseignements client,1.0,68.0,...,0,0,0,0,0,0,0,0,0,0
2,ACOM2017007712400,COM,2017,6635,4.0,GSO,"['010', '012', '001', '016', '011', '019', '013']","saisie et centralisation, declarations tva, re...",22.75,1372.5,...,0,0,0,0,0,0,0,0,0,0
3,ACOM2017007718600,COM,2017,6718,1.0,DAC,"['011', '013']","controle et revision, comptes annuels et decl...",7.5,555.0,...,0,0,0,0,0,0,0,0,0,0
4,ACOM2017007719900,COM,2017,6720,3.0,VAL,"['016', '001', '011', '013', '019']","autres declarations fiscales, renseignements c...",17.75,1168.75,...,0,0,0,0,0,0,0,0,0,0


In [44]:
df.describe()

Unnamed: 0,total_assistant,temps,valorisation_temps,facturation,entrée_clt,sortie_clt,ags,cac,cap,cat,...,secretariat juridique annuel sceaearl,secretariat juridique annuel scmsc,secrtariat juridique annuel sci sc portefeuille,situation intermediaire,tableau de bord,traitement activite partielle sur bulletin de paie,transmission de lentreprise,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux
count,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0,...,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0,35583.0
mean,3.544642,45.219825,1656.745699,1595.180793,0.089874,0.077397,0.000225,0.006098,2.8e-05,8.4e-05,...,0.000393,0.000365,0.015822,0.014895,0.008909,0.020544,0.006127,0.005733,0.073659,2.8e-05
std,2.079425,570.99293,2666.712615,3496.031262,0.286006,0.267223,0.014993,0.077855,0.005301,0.009182,...,0.019832,0.019111,0.124789,0.121133,0.09545,0.141852,0.078033,0.075501,0.261218,0.005301
min,1.0,0.0,-1.5,-600.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,3.5,261.5,360.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,10.0,736.5,670.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,33.25,2036.2,1905.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,25.0,36328.5,91573.75,383660.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0


In [45]:
#df.columns[46:96]

In [46]:
#df.columns[96:146]

In [47]:
#df.columns[146:196]

In [48]:
#df.columns[196:]

In [49]:
#52 lignes comprennent de la sous-traitance
df[df['saisie primo bpo'] == 1]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,secretariat juridique annuel sceaearl,secretariat juridique annuel scmsc,secrtariat juridique annuel sci sc portefeuille,situation intermediaire,tableau de bord,traitement activite partielle sur bulletin de paie,transmission de lentreprise,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux
40,ACOM2017008059000,COM,2017,40082,4.0,DAC,"['011', '012', '013', '010', '016', '210', '21...","controle et revision, declarations tva, compte...",4969.25,5067.46,...,0,0,0,0,0,0,0,0,0,0
279,ACOM2017008223100,COM,2017,6129,3.0,DAC,"['210', '212', '211', '010', '012', '013', '01...","saisie primo bpo, saisie lignes importes, hber...",7875.0,4536.99,...,0,0,0,0,0,0,0,0,0,0
312,ACOM2017008227400,COM,2017,40032,5.0,DAC,"['012', '010', '210', '211', '016', '011', '00...","declarations tva, saisie et centralisation, sa...",8155.75,5370.65,...,0,0,0,0,0,0,0,0,0,0
316,ACOM2017008227900,COM,2017,40066,7.0,JOP,"['012', '016', '001', '010', '210', '211', '01...","declarations tva, autres declarations fiscales...",7531.5,6106.62,...,0,0,0,0,0,0,0,0,0,0
465,ACOM2017008247900,COM,2017,5954,3.0,DAC,"['210', '211', '001', '011', '010']","saisie primo bpo, hbergement pices comptables,...",250.75,284.47,...,0,0,0,0,0,0,0,0,0,0
467,ACOM2017008248100,COM,2017,5956,7.0,DAC,"['011', '010', '012', '210', '211', '212', '04...","controle et revision, saisie et centralisation...",3865.0,2301.46,...,0,0,0,0,0,0,0,0,0,0
473,ACOM2017008249100,COM,2017,6139,5.0,DAC,"['012', '010', '210', '212', '211', '011', '01...","declarations tva, saisie et centralisation, sa...",4631.0,3208.19,...,0,0,0,0,0,0,0,0,0,0
1196,ACOM2017008367900,COM,2017,6944,3.0,DAC,"['010', '212', '211', '210', '012', '011', '013']","saisie et centralisation, saisie lignes import...",12942.75,9254.02,...,0,0,0,0,0,0,0,0,0,0
1227,ACOM2017008488000,COM,2017,6988,4.0,DAC,"['012', '010', '210', '211', '011', '019', '013']","declarations tva, saisie et centralisation, sa...",397.75,1850.36,...,0,0,0,0,0,0,0,0,0,0
1332,ACOM2017008594000,COM,2017,7135,7.0,DAC,"['012', '043', '211', '210', '001', '212', '01...","declarations tva, tableau de bord, hbergement ...",12177.75,6982.14,...,0,0,0,0,1,0,0,0,0,0


### Création de familles d'activités pour réduire le nombre de colonnes

In [50]:
#Création famille assistance téléphonique
df['Assistance téléphonique-hotline'] =  df['assistance telephonique a lheure'] + df['assistance tlphonique  lheure'] + df['assistance tlphonique annuelle '] + df['forfait assistance telephonique'] + df['hotline utilisation logiciel grh']


In [51]:
#Remplacer les autres valeurs par 1 ou 0
df['Assistance téléphonique-hotline'] = df['Assistance téléphonique-hotline'].map({0:0, 1:1, 2:1, 3:1})

In [52]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['assistance telephonique a lheure', 'assistance tlphonique  lheure', 'assistance tlphonique annuelle ', 'forfait assistance telephonique', 'hotline utilisation logiciel grh'], axis=1, inplace=True)

In [53]:
#Création famille Autres travaux exceptionnels
df['Autres travaux exceptionnels'] =  df['autres travaux'] + df['autres travaux exceptionnels']

In [54]:
df[(df['Autres travaux exceptionnels'] != 1) & (df['Autres travaux exceptionnels'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,secrtariat juridique annuel sci sc portefeuille,situation intermediaire,tableau de bord,traitement activite partielle sur bulletin de paie,transmission de lentreprise,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux,Assistance téléphonique-hotline,Autres travaux exceptionnels


In [55]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['autres travaux', 'autres travaux exceptionnels'], axis=1, inplace=True)

In [56]:
#Création famille gestion paie
df['Gestion bulletins de paie'] =  df['bulletin de paie'] + df['bulletin de paie apprenti'] + df['bulletin de paie cddmoisextrasaisonnier'] + df['bulletin de paie clarifie'] + df['bulletin de paie refait'] + df['bulletin de paie simulation'] + df['bulletin de paie suite prudhomme'] + df['fiche de paie'] + df['remontee des bulletins de paie'] + df['remontee des bulletins de salaire existants avec modifications'] + df['remontee des bulletins de salaire existants sans modifications']

In [57]:
#df[(df['Gestion bulletins de paie'] != 1) & (df['Gestion bulletins de paie'] != 0)]

In [58]:
#Remplacer les autres valeurs par 1 ou 0
df['Gestion bulletins de paie'] = df['Gestion bulletins de paie'].map({0:0, 1:1, 2:1, 3:1, 4:1, 5:1, 6:1})

In [59]:
df[(df['Gestion bulletins de paie'] != 1) & (df['Gestion bulletins de paie'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,situation intermediaire,tableau de bord,traitement activite partielle sur bulletin de paie,transmission de lentreprise,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux,Assistance téléphonique-hotline,Autres travaux exceptionnels,Gestion bulletins de paie


In [60]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['bulletin de paie','bulletin de paie apprenti', 'bulletin de paie cddmoisextrasaisonnier', 'bulletin de paie clarifie', 'bulletin de paie refait', 'bulletin de paie simulation', 'bulletin de paie suite prudhomme', 'fiche de paie', 'remontee des bulletins de paie', 'remontee des bulletins de salaire existants avec modifications', 'remontee des bulletins de salaire existants sans modifications'], axis=1, inplace=True)

In [61]:
#Création famille coffre fort numérique
df['Coffre fort numérique'] =  df['coffre fort numerique digiiposte par salarie'] + df['coffre fort numerique digiposte  forfait mise en route']

In [62]:
#Remplacer les autres valeurs par 1 ou 0
df['Coffre fort numérique'] = df['Coffre fort numérique'].map({0:0, 1:1, 2:1})

In [63]:
df[(df['Gestion bulletins de paie'] != 1) & (df['Gestion bulletins de paie'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,tableau de bord,traitement activite partielle sur bulletin de paie,transmission de lentreprise,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux,Assistance téléphonique-hotline,Autres travaux exceptionnels,Gestion bulletins de paie,Coffre fort numérique


In [64]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['coffre fort numerique digiiposte par salarie', 'coffre fort numerique digiposte  forfait mise en route'], axis=1, inplace=True)

In [65]:
#Création famille assistance téléphonique
df['contrat de travail et/ou avenant'] =  df['contrat de travail'] + df['contrat de travail specifique'] + df['contrats de travail'] + df['avenant au contrat de travail']

In [66]:
df[(df['contrat de travail et/ou avenant'] != 1) & (df['contrat de travail et/ou avenant'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,traitement activite partielle sur bulletin de paie,transmission de lentreprise,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux,Assistance téléphonique-hotline,Autres travaux exceptionnels,Gestion bulletins de paie,Coffre fort numérique,contrat de travail et/ou avenant
4489,AGES2017010134700,GES,2017,7406,5.0,THB,"['025', '024', 'F025', 'F0251', '029', 'F0242']","contrats de travail, formalites de licenciemen...",54.25,3881.00,...,0,0,0,0,0,0,0,0,0,3
4490,AGES2017010135300,GES,2017,7402,6.0,THB,"['025', '029', 'F025', 'F0251', '027', '024', ...","contrats de travail, autres travaux en social,...",121.50,9030.75,...,0,0,0,0,0,0,0,0,0,3
4497,AGES2017010139300,GES,2017,7407,6.0,THB,"['025', '029', 'F025', 'F0243', 'F0242', '024'...","contrats de travail, autres travaux en social,...",45.00,3331.75,...,0,0,0,0,0,0,0,0,0,3
4508,AGES2017010226000,GES,2017,7408,3.0,THB,"['025', 'F025']","contrats de travail, contrat de travail",2.75,239.50,...,0,0,0,0,0,0,0,0,0,2
4509,AGES2017010226100,GES,2017,7985,4.0,EMG,"['025', '029', 'F0251', '024', 'F025']","contrats de travail, autres travaux en social,...",11.25,809.75,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21958,AGS02019012616400,GS0,2019,7407,4.0,THB,"['025', 'F0251']","contrats de travail, avenant au contrat de tra...",4.25,255.25,...,0,0,0,0,0,0,0,0,0,2
21974,AGS02019012630400,GS0,2019,7192,5.0,EMG,"['025', 'F025', 'F02504']","contrats de travail, contrat de travail, cerfa...",17.25,1319.75,...,0,0,0,0,0,0,0,0,0,2
22018,AGS02019012730300,GS0,2019,7225,3.0,EMG,"['025', 'F025', '024']","contrats de travail, contrat de travail, forma...",5.75,452.50,...,0,0,0,0,0,0,0,0,0,2
22024,AGS02019012738000,GS0,2019,A0883,5.0,EMG,"['025', '001', 'F025']","contrats de travail, renseignements client, co...",12.00,1027.63,...,0,0,0,0,0,0,0,0,0,2


In [67]:
#Remplacer les autres valeurs par 1 ou 0
df['contrat de travail et/ou avenant'] = df['contrat de travail et/ou avenant'].map({0:0, 1:1, 2:1, 3:1})

In [68]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['contrat de travail', 'contrat de travail specifique', 'contrats de travail', 'avenant au contrat de travail'], axis=1, inplace=True)

In [69]:
#Création famille contrôle des charges
df['Contrôle des charges'] =  df['contrle des charges'] + df['controle charges sociales'] + df['controle de charges']

In [70]:
df[(df['Contrôle des charges'] != 1) & (df['Contrôle des charges'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,transmission de lentreprise,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux,Assistance téléphonique-hotline,Autres travaux exceptionnels,Gestion bulletins de paie,Coffre fort numérique,contrat de travail et/ou avenant,Contrôle des charges
24396,APOL2019012180700,POL,2019,7315,9.0,NAB,"['F0304', '0304', '0310', '0302', '029', 'F031...","assistance telephonique a lheure, assistance t...",578.25,8811.31,...,0,0,0,0,1,0,0,0,0,2
24436,APOL2019012870300,POL,2019,A1345,3.0,EMG,"['F0300', '0310', '0317', 'F0317', '029']","mise a disposition du logiciel de paie, mise ...",10.5,1240.0,...,0,0,0,0,0,0,0,0,0,2
24437,APOL2019012870400,POL,2019,A1346,3.0,EMG,"['F0300', '0303', '0317', '0310', '0316', '029...","mise a disposition du logiciel de paie, format...",139.25,1665.0,...,0,0,0,0,1,0,0,0,0,2


In [71]:
#Remplacer les autres valeurs par 1 ou 0
df['Contrôle des charges'] = df['Contrôle des charges'].map({0:0, 1:1, 2:1})

In [72]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['contrle des charges', 'controle charges sociales', 'controle de charges'], axis=1, inplace=True)

In [73]:
#Création famille déclaration des bénéficiaires effectifs
df['Déclaration des bénéficiaires effectifs'] =  df['declaration des beneficiaires effectifs'] + df['declaration des beneficiaires effectifs suivants']

In [74]:
df[(df['Déclaration des bénéficiaires effectifs'] != 1) & (df['Déclaration des bénéficiaires effectifs'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,travaux de fin de mission,travaux exceptionnels covd,verification finale travaux,Assistance téléphonique-hotline,Autres travaux exceptionnels,Gestion bulletins de paie,Coffre fort numérique,contrat de travail et/ou avenant,Contrôle des charges,Déclaration des bénéficiaires effectifs
4050,AGEJ2017010544700,GEJ,2017,7551,4.0,NAB,"['069', 'F06453', 'F06454', 'F06455']","divers juridique, ouverture et paraphe registr...",7.50,430.25,...,0,0,0,0,0,0,0,0,0,2
4075,AGEJ2017010577900,GEJ,2017,60248,6.0,_OLP NE PLUS UTILISER,"['065', '069', 'F06454', 'F06455']","formalites de dissolution et de liquidation, ...",10.00,667.25,...,0,0,0,0,0,0,0,0,0,2
4179,AGEJ2017010678000,GEJ,2017,7251,3.0,NAB,"['069', 'F06454', 'F06455']","divers juridique, declaration des beneficiaire...",4.75,257.00,...,0,0,0,0,0,0,0,0,0,2
4258,AGEJ2017010804200,GEJ,2017,A0315,2.0,JOP,"['069', 'F06454', 'F06455']","divers juridique, declaration des beneficiaire...",3.50,189.50,...,0,0,0,0,0,0,0,0,0,2
4260,AGEJ2017010804700,GEJ,2017,4895,2.0,JOP,"['069', 'F06453', 'F06454', 'F06455']","divers juridique, ouverture et paraphe registr...",4.25,215.00,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12347,AGEJ2018011725200,GEJ,2018,7052,3.0,VAL,"['F06454', 'F06455', '069']","declaration des beneficiaires effectifs, decla...",2.75,136.75,...,0,0,0,0,0,0,0,0,0,2
12348,AGEJ2018011725300,GEJ,2018,7159,8.0,DAC,"['F06454', 'F06455', '069', '065']","declaration des beneficiaires effectifs, decla...",18.25,1254.75,...,0,0,0,0,0,0,0,0,0,2
12353,AGEJ2018011725800,GEJ,2018,A0280,2.0,THB,"['F06454', 'F06455', '069']","declaration des beneficiaires effectifs, decla...",2.25,110.75,...,0,0,0,0,0,0,0,0,0,2
12354,AGEJ2018011725900,GEJ,2018,A0309,2.0,EMG,"['F06454', 'F06455', '069']","declaration des beneficiaires effectifs, decla...",2.25,110.75,...,0,0,0,0,0,0,0,0,0,2


In [75]:
#Remplacer les autres valeurs par 1 ou 0
df['Déclaration des bénéficiaires effectifs'] = df['Déclaration des bénéficiaires effectifs'].map({0:0, 1:1, 2:1})

In [76]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['declaration des beneficiaires effectifs', 'declaration des beneficiaires effectifs suivants'], axis=1, inplace=True)

In [77]:
#Création formalites de rupture de contrat de travail
df['Formalités de rupture de contrat de travail'] =  df['calcul cout rupture de contrat'] + df['formalites de rupture conventionnelle'] + df['formalites de licenciement'] + df['etablissement attestation pole emploi'] + df['mouvement de personnel sortie'] + df['mouvement de personnel sortie apprenti'] + df['mouvement de personnel sortie cddmoisextrasaisonnier']

In [78]:
df[(df['Formalités de rupture de contrat de travail'] != 1) & (df['Formalités de rupture de contrat de travail'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,travaux exceptionnels covd,verification finale travaux,Assistance téléphonique-hotline,Autres travaux exceptionnels,Gestion bulletins de paie,Coffre fort numérique,contrat de travail et/ou avenant,Contrôle des charges,Déclaration des bénéficiaires effectifs,Formalités de rupture de contrat de travail
4497,AGES2017010139300,GES,2017,7407,6.0,THB,"['025', '029', 'F025', 'F0243', 'F0242', '024'...","contrats de travail, autres travaux en social,...",45.00,3331.75,...,0,0,0,0,0,0,1,0,0,2
4553,AGES2017010392400,GES,2017,7232,3.0,EMG,"['024', '029', '025', 'F0241', 'F028', 'F0253']","formalites de licenciement, autres travaux en ...",10.50,1245.25,...,0,0,0,0,0,0,1,0,0,2
6624,ASOP2017010108000,SOP,2017,A0520,11.0,DAC,"['030', '029', '020', 'F0200', 'F0211', 'F0206...","cration migration silae, autres travaux en so...",314.50,10370.75,...,0,0,0,0,1,0,0,0,0,2
6626,ASOP2017010111600,SOP,2017,A0015,6.0,NAB,"['F0200', 'F0214', '020', '021', 'F0206', '029...","bulletin de paie, actualisation mensuelle aide...",69.50,3374.75,...,0,0,0,0,1,0,1,0,0,2
6628,ASOP2017010117300,SOP,2017,A0532,4.0,JOP,"['030', 'F0200', '029', 'F0211', '020', '021',...","cration migration silae, bulletin de paie, au...",84.00,3673.50,...,0,0,0,0,1,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35465,ASOP2020013189600,SOP,2020,2547,5.0,GSO,"['001', '029', '030', 'F0207.1', 'F0205', 'F02...","renseignements client, autres travaux en socia...",33.50,2018.03,...,1,0,0,0,1,0,0,0,0,2
35473,ASOP2020013221800,SOP,2020,A0647,12.0,THB,"['F0201', 'F0205', 'F02082', 'F0200', '029', '...","bulletin de paie simulation, mouvement de pers...",282.75,11400.42,...,1,0,0,0,1,0,0,0,0,2
35491,ASOP2020013282500,SOP,2020,3979,8.0,CBA,"['029', '04950', 'F02083', 'F0228', 'F0200', '...","autres travaux en social, travaux exceptionnel...",348.50,12351.66,...,1,0,0,0,1,0,0,0,0,2
35494,ASOP2020013332200,SOP,2020,A2142,11.0,JOP,"['029', '030', 'F0207', 'F0205', 'F02081', '02...","autres travaux en social, cration migration s...",98.00,5256.61,...,0,0,1,0,1,0,0,0,0,2


In [79]:
#Remplacer les autres valeurs par 1 ou 0
df['Formalités de rupture de contrat de travail'] = df['Formalités de rupture de contrat de travail'].map({0:0, 1:1, 2:1, 3:1, 4:1})

In [80]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['calcul cout rupture de contrat', 'formalites de rupture conventionnelle', 'formalites de licenciement', 'etablissement attestation pole emploi', 'mouvement de personnel sortie', 'mouvement de personnel sortie apprenti', 'mouvement de personnel sortie cddmoisextrasaisonnier'], axis=1, inplace=True)

In [81]:
#Création famille aide à l'embauche apprentissage
df["Aide à l'embauche apprentissage"] =  df['actualisation mensuelle aide a lembauche  apprenti'] + df['aide a lembauche  apprenti']

In [82]:
#df[(df["Aide à l'embauche apprentissage"] != 1) & (df["Aide à l'embauche apprentissage"] != 0)]

In [83]:
#Remplacer les autres valeurs par 1 ou 0
df["Aide à l'embauche apprentissage"] = df["Aide à l'embauche apprentissage"].map({0:0, 1:1, 2:1})

In [84]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['actualisation mensuelle aide a lembauche  apprenti', 'aide a lembauche  apprenti'], axis=1, inplace=True)

In [85]:
#Création famille formation client
df['Formation client'] =  df['animation et formatioin'] + df['formation technique a lutilisation du logiciel'] + df['formation technique utilisation logiciel'] + df['formation techniques de paie appliques sur silae'] + df['formation tl pas']

In [86]:
df[(df['Formation client'] != 1) & (df['Formation client'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,Assistance téléphonique-hotline,Autres travaux exceptionnels,Gestion bulletins de paie,Coffre fort numérique,contrat de travail et/ou avenant,Contrôle des charges,Déclaration des bénéficiaires effectifs,Formalités de rupture de contrat de travail,Aide à l'embauche apprentissage,Formation client
6580,APOL2017010389800,POL,2017,7645,8.0,DAC,"['0301', '0304', 'F0304', 'F0300', '0303', '03...","assistance tlphonique annuelle , assistance tl...",1697.99,9131.63,...,1,0,0,0,0,0,0,0,0,2
6599,APOL2017010665600,POL,2017,A0735,3.0,JOP,"['0302', 'F0318', 'F03112', 'F0312', 'F0313', ...","cration de dossier, formation technique a luti...",87.25,2274.5,...,0,0,1,0,0,0,0,0,0,2
6600,APOL2017010673100,POL,2017,60070,5.0,JOP,"['F0318', '0303', '0304', 'F0304', 'F0300', '0...",formation technique a lutilisation du logiciel...,123.75,3322.0,...,1,0,0,0,0,0,0,0,0,2
6601,APOL2017010673600,POL,2017,A0745,5.0,VAL,"['0302', 'F0300', '0304', 'F0304', '0303', 'F0...","cration de dossier, mise a disposition du logi...",30.56,1308.13,...,1,0,0,0,0,0,0,0,0,2
6602,APOL2017010766700,POL,2017,4799,3.0,GSO,"['0303', 'F0318', 'F03113', 'F0312', 'F0313', ...",formation techniques de paie appliques sur sil...,90.25,2086.25,...,1,0,1,0,0,0,0,0,0,2
6603,APOL2017010774300,POL,2017,7315,3.0,NAB,"['F0300', 'F0318', '0303', '0304', 'F0304']","mise a disposition du logiciel de paie, format...",190.5,1863.75,...,1,0,0,0,0,0,0,0,0,2
15591,APOL2018011433200,POL,2018,60349,5.0,NAB,"['0302', '0304', 'F0318', '0303', 'F0304', 'F0...","cration de dossier, assistance tlphonique lhe...",319.25,4671.5,...,1,0,1,0,0,0,0,0,0,2
15594,APOL2018011446400,POL,2018,11871,3.0,CIT,"['0302', 'F0318', '0303', 'F0300', 'F0216', '0...","cration de dossier, formation technique a luti...",42.25,1319.0,...,1,0,1,0,0,0,0,0,0,2
15605,APOL2018011474500,POL,2018,3262,6.0,CIT,"['F0304', 'F0216', 'F0300', '0304', '029', '03...","assistance telephonique a lheure, bulletin de ...",157.75,3058.75,...,1,0,1,0,0,0,0,0,0,2
15611,APOL2018011475100,POL,2018,3979,6.0,CBA,"['F0304', 'F0300', 'F0216', '0302', '0304', '0...","assistance telephonique a lheure, mise a dispo...",341.5,6267.25,...,1,0,1,0,0,0,0,0,0,3


In [87]:
#Remplacer les autres valeurs par 1 ou 0
df["Formation client"] = df['Formation client'].map({0:0, 1:1, 2:1, 3:1})

In [88]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['animation et formatioin', 'formation technique a lutilisation du logiciel', 'formation technique utilisation logiciel', 'formation techniques de paie appliques sur silae', 'formation tl pas'], axis=1, inplace=True)

In [89]:
#Création famille secrétariat juridique
df['Secrétariat juridique'] =  df['secretariat juridique annuel'] + df['secretariat juridique annuel eurl'] + df['secretariat juridique annuel sa'] + df['secretariat juridique annuel sas'] + df['secretariat juridique annuel sasu'] + df['secretariat juridique annuel sceaearl'] + df['secretariat juridique annuel scmsc'] + df['secrtariat juridique annuel sci  sc portefeuille'] + df['divers juridique'] + df['juridique'] + df['secretariat juridique annuel sarl  snc']

In [90]:
df[(df['Secrétariat juridique'] != 1) & (df['Secrétariat juridique'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,Autres travaux exceptionnels,Gestion bulletins de paie,Coffre fort numérique,contrat de travail et/ou avenant,Contrôle des charges,Déclaration des bénéficiaires effectifs,Formalités de rupture de contrat de travail,Aide à l'embauche apprentissage,Formation client,Secrétariat juridique
740,ACOM2017008290300,COM,2017,5605,5.0,VAL,"['069', '001', '060', '016', '011', '013']","divers juridique, renseignements client, secre...",8.75,514.25,...,0,0,0,0,0,0,0,0,0,2
1100,ACOM2017008337600,COM,2017,40138,3.0,DAC,"['016', '012', '001', '060', '010', '049', '06...","autres declarations fiscales, declarations tva...",20.25,1626.00,...,1,0,0,0,0,0,0,0,0,2
1148,ACOM2017008343400,COM,2017,60248,9.0,_OLP NE PLUS UTILISER,"['069', '060', '001', '016', '012', '013', '011']","divers juridique, secretariat juridique annuel...",15.00,1068.75,...,0,0,0,0,0,0,0,0,0,2
1903,ACOM2017009195400,COM,2017,4951,10.0,_OLP NE PLUS UTILISER,"['011', '010', '001', '012', '016', '018', '06...","controle et revision, saisie et centralisation...",169.25,8891.00,...,0,0,0,0,0,0,0,0,0,2
3041,ACOM2017010154200,COM,2017,7829,12.0,THB,"['001', '010', '041', '060', '049', '012', '01...","renseignements client, saisie et centralisatio...",167.00,10277.25,...,1,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33301,AJUR2020013246600,JUR,2020,A0852,3.0,FRL,"['069', '060']","divers juridique, secretariat juridique annuel",3.50,237.50,...,0,0,0,0,0,0,0,0,0,2
33304,AJUR2020013276400,JUR,2020,A2103,3.0,EMG,"['060', '069']","secretariat juridique annuel, divers juridique",4.75,324.64,...,0,0,0,0,0,0,0,0,0,2
33312,AJUR2020013318700,JUR,2020,A1248,4.0,SLE,"['060', '069']","secretariat juridique annuel, divers juridique",4.75,301.89,...,0,0,0,0,0,0,0,0,0,2
33331,AJUR2020013561800,JUR,2020,A1041,1.0,EMG,"['060', '069']","secretariat juridique annuel, divers juridique",6.00,489.00,...,0,0,0,0,0,0,0,0,0,2


In [91]:
#Remplacer les autres valeurs par 1 ou 0
df["Secrétariat juridique"] = df['Secrétariat juridique'].map({0:0, 1:1, 2:1, 3:1})

In [92]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['secretariat juridique annuel', 'secretariat juridique annuel eurl', 'secretariat juridique annuel sa', 'secretariat juridique annuel sas', 'secretariat juridique annuel sasu', 'secretariat juridique annuel sceaearl','secretariat juridique annuel scmsc','secrtariat juridique annuel sci  sc portefeuille', 'divers juridique', 'juridique'], axis=1, inplace=True)

In [93]:
#Création famille Audit social
df['Audit social'] =  df['audit bulletins de paie etou charges sociales'] + df['audit social']

In [94]:
df[(df['Audit social'] != 1) & (df['Audit social'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,Gestion bulletins de paie,Coffre fort numérique,contrat de travail et/ou avenant,Contrôle des charges,Déclaration des bénéficiaires effectifs,Formalités de rupture de contrat de travail,Aide à l'embauche apprentissage,Formation client,Secrétariat juridique,Audit social


In [95]:
#Création famille Formalités embauche
df['Formalités embauche'] =  df['formalites dembauche'] + df['cerfa apprentissage'] + df['cerfa contrat de professionalisation'] + df['mouvement de personnel entree'] + df['mouvement de personnel entree apprenti'] + df['mouvement de personnel entree cddmoisextrasaisonnier'] + df['contrat de travail et/ou avenant']

In [96]:
df[(df['Formalités embauche'] != 1) & (df['Formalités embauche'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,Coffre fort numérique,contrat de travail et/ou avenant,Contrôle des charges,Déclaration des bénéficiaires effectifs,Formalités de rupture de contrat de travail,Aide à l'embauche apprentissage,Formation client,Secrétariat juridique,Audit social,Formalités embauche
4451,AGES2017010108900,GES,2017,5649,5.0,SLE,"['025', '023']","contrats de travail, formalites dembauche",12.50,662.25,...,0,1,0,0,0,0,0,0,0,2
4453,AGES2017010109100,GES,2017,6474,3.0,STL,"['025', '023']","contrats de travail, formalites dembauche",2.75,138.75,...,0,1,0,0,0,0,0,0,0,2
4456,AGES2017010112900,GES,2017,4951,4.0,_OLP NE PLUS UTILISER,"['023', '025', '029', '024']","formalites dembauche, contrats de travail, aut...",30.00,1820.75,...,0,1,0,0,1,0,0,0,0,2
4457,AGES2017010113000,GES,2017,79474,6.0,GSO,"['023', '025', '045', '029', 'F0220']","formalites dembauche, contrats de travail, ass...",16.25,1181.50,...,0,1,0,0,0,0,0,0,0,2
4458,AGES2017010113200,GES,2017,7642,5.0,JOP,"['025', '023', '024']","contrats de travail, formalites dembauche, for...",26.75,1685.50,...,0,1,0,0,1,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35473,ASOP2020013221800,SOP,2020,A0647,12.0,THB,"['F0201', 'F0205', 'F02082', 'F0200', '029', '...","bulletin de paie simulation, mouvement de pers...",282.75,11400.42,...,0,0,0,0,1,0,0,0,0,2
35482,ASOP2020013228000,SOP,2020,4154,2.0,JOP,"['030', '04950', '029', 'F0207', 'F0205', 'F02...","cration migration silae, travaux exceptionnel...",64.75,3274.02,...,0,0,0,0,1,0,0,0,0,2
35486,ASOP2020013247700,SOP,2020,A2088,1.0,VAL,"['F0200', 'F02081', 'F02052', '020', '029', '0...","bulletin de paie, forfait mensuel salaries ,...",49.00,2280.51,...,0,0,0,0,1,0,0,0,0,2
35507,ASOP2020013403400,SOP,2020,A1891,7.0,JOD,"['030', '029', '020', '0291', '001', 'F0207', ...","cration migration silae, autres travaux en so...",77.50,4260.99,...,0,0,0,0,1,0,0,0,0,2


In [97]:
#Remplacer les autres valeurs par 1 ou 0
df["Formalités embauche"] = df['Formalités embauche'].map({0:0, 1:1, 2:1, 3:1, 4:1, 5:1})

In [98]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['formalites dembauche', 'cerfa apprentissage', 'cerfa contrat de professionalisation', 'mouvement de personnel entree', 'mouvement de personnel entree apprenti', 'mouvement de personnel entree cddmoisextrasaisonnier', 'contrat de travail et/ou avenant'], axis=1, inplace=True)

In [99]:
#Création famille Sous-traitance
df['Sous-traitance'] =  df['saisie lignes importes'] + df['saisie primo bpo'] + df['hbergement pices comptables'] 

In [100]:
df[(df['Sous-traitance'] != 1) & (df['Sous-traitance'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,Coffre fort numérique,Contrôle des charges,Déclaration des bénéficiaires effectifs,Formalités de rupture de contrat de travail,Aide à l'embauche apprentissage,Formation client,Secrétariat juridique,Audit social,Formalités embauche,Sous-traitance
40,ACOM2017008059000,COM,2017,40082,4.0,DAC,"['011', '012', '013', '010', '016', '210', '21...","controle et revision, declarations tva, compte...",4969.25,5067.46,...,0,0,0,0,0,0,0,0,0,2
279,ACOM2017008223100,COM,2017,6129,3.0,DAC,"['210', '212', '211', '010', '012', '013', '01...","saisie primo bpo, saisie lignes importes, hber...",7875.0,4536.99,...,0,0,0,0,0,0,0,0,0,3
312,ACOM2017008227400,COM,2017,40032,5.0,DAC,"['012', '010', '210', '211', '016', '011', '00...","declarations tva, saisie et centralisation, sa...",8155.75,5370.65,...,0,0,0,0,0,0,0,0,0,2
316,ACOM2017008227900,COM,2017,40066,7.0,JOP,"['012', '016', '001', '010', '210', '211', '01...","declarations tva, autres declarations fiscales...",7531.5,6106.62,...,0,0,0,0,0,0,0,0,0,2
465,ACOM2017008247900,COM,2017,5954,3.0,DAC,"['210', '211', '001', '011', '010']","saisie primo bpo, hbergement pices comptables,...",250.75,284.47,...,0,0,0,0,0,0,0,0,0,2
467,ACOM2017008248100,COM,2017,5956,7.0,DAC,"['011', '010', '012', '210', '211', '212', '04...","controle et revision, saisie et centralisation...",3865.0,2301.46,...,0,0,0,0,0,0,0,0,0,3
473,ACOM2017008249100,COM,2017,6139,5.0,DAC,"['012', '010', '210', '212', '211', '011', '01...","declarations tva, saisie et centralisation, sa...",4631.0,3208.19,...,0,0,0,0,0,0,0,0,0,3
1196,ACOM2017008367900,COM,2017,6944,3.0,DAC,"['010', '212', '211', '210', '012', '011', '013']","saisie et centralisation, saisie lignes import...",12942.75,9254.02,...,0,0,0,0,0,0,0,0,0,3
1227,ACOM2017008488000,COM,2017,6988,4.0,DAC,"['012', '010', '210', '211', '011', '019', '013']","declarations tva, saisie et centralisation, sa...",397.75,1850.36,...,0,0,0,0,0,0,0,0,0,2
1332,ACOM2017008594000,COM,2017,7135,7.0,DAC,"['012', '043', '211', '210', '001', '212', '01...","declarations tva, tableau de bord, hbergement ...",12177.75,6982.14,...,0,0,0,0,0,0,0,0,0,3


In [101]:
#Remplacer les autres valeurs par 1 ou 0
df['Sous-traitance'] = df['Sous-traitance'].map({0:0, 1:1, 2:1, 3:1})

In [102]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['saisie lignes importes', 'saisie primo bpo', 'hbergement pices comptables'], axis=1, inplace=True)

In [103]:
#Création famille Tableau de bord
df['Tableau de bord'] =  df['tb'] + df['tableau de bord']

In [104]:
df[(df['Tableau de bord'] != 1) & (df['Tableau de bord'] != 0)]

Unnamed: 0,code_mission,type_mission,exercice,code_client,total_assistant,manager,code_article,libelle_article,temps,valorisation_temps,...,Contrôle des charges,Déclaration des bénéficiaires effectifs,Formalités de rupture de contrat de travail,Aide à l'embauche apprentissage,Formation client,Secrétariat juridique,Audit social,Formalités embauche,Sous-traitance,Tableau de bord
16855,ATB 2018010659800,TB,2018,A0576,2.0,JOP,['043'],tableau de bord,7.25,510.25,...,0,0,0,0,0,0,0,0,0,2
16856,ATB 2018010778600,TB,2018,A0648,2.0,JOP,['043'],tableau de bord,9.5,575.5,...,0,0,0,0,0,0,0,0,0,2
16857,ATB 2018010797200,TB,2018,A0523,4.0,FRL,"['049', '011', '043']","autres travaux exceptionnels, controle et revi...",39.75,4145.5,...,0,0,0,0,0,0,0,0,0,2
16858,ATB 2018012107200,TB,2018,7302,1.0,NAB,['F07000'],tableau de bord,4.0,800.0,...,0,0,0,0,0,0,0,0,0,2
16859,ATB 2018012107700,TB,2018,7315,1.0,NAB,['F07000'],tableau de bord,4.0,800.0,...,0,0,0,0,0,0,0,0,0,2
25689,ATB 2019011968300,TB,2019,A0576,2.0,JOP,['043'],tableau de bord,13.75,746.75,...,0,0,0,0,0,0,0,0,0,2
25690,ATB 2019011983300,TB,2019,A0648,1.0,JOP,['043'],tableau de bord,2.0,106.0,...,0,0,0,0,0,0,0,0,0,2
25691,ATB 2019012100400,TB,2019,A0016,3.0,VAL,"['F07000', '043']","tableau de bord, tableau de bord",18.5,1343.5,...,0,0,0,0,0,0,0,0,0,3
25692,ATB 2019012107300,TB,2019,7302,2.0,NAB,"['F07000', '043']","tableau de bord, tableau de bord",1.5,2443.75,...,0,0,0,0,0,0,0,0,0,3
25693,ATB 2019012107800,TB,2019,7315,2.0,NAB,"['F07000', '043']","tableau de bord, tableau de bord",1.5,2443.75,...,0,0,0,0,0,0,0,0,0,3


In [105]:
#Remplacer les autres valeurs par 1 ou 0
df['Tableau de bord'] = df['Tableau de bord'].map({0:0, 1:1, 2:1, 3:1})

In [106]:
#Supprimer les colonnes agrégées dans la famille d'activité
df.drop(['tb', 'tableau de bord'], axis=1, inplace=True)

In [107]:
#Supprimer les colonnes qui correspondent aux activités non facturables
df.drop(['non facturable commissariat', 'non facturable comptabilite', 'non facturable juridique', 'non facturable social'], axis=1, inplace=True)

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35583 entries, 0 to 35582
Columns: 166 entries, code_mission to Tableau de bord
dtypes: float64(4), int64(152), object(10)
memory usage: 45.1+ MB


## Bag of words sur les formes juridiques

In [109]:
text_data_forme = np.array(df['forme'].values)
text_data_forme

array(['SAS', 'EURL', 'SARL', ..., 'SAS', 'SAS', 'EURL'], dtype=object)

In [110]:
# Create the bag of words feature matrix
count = CountVectorizer()
bow_forme = count.fit_transform(text_data_forme)

# Show feature matrix
bow_forme.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [111]:
feature_names = count.get_feature_names()

In [112]:
df_formes = pd.DataFrame(bow_forme.toarray(), columns=feature_names)
df_formes

Unnamed: 0,ass,coop,divers,earl,eirl,eurl,gaec,gfa,gfo,gfr,...,scf,sci,scm,scp,sdf,selarl,selas,sem,sep,snc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35578,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35581,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [113]:
df = pd.concat([df, df_formes], axis=1)

In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35583 entries, 0 to 35582
Columns: 197 entries, code_mission to snc
dtypes: float64(4), int64(183), object(10)
memory usage: 53.5+ MB


In [115]:
df.forme.value_counts()

SARL           9548
IND            6021
SCI            5728
EURL           4451
SAS            3613
SASU           2019
Particulier     970
ASS             615
SC              548
SELARL          444
SCM             358
EIRL            247
SNC             210
SA              200
Divers          128
EARL            125
SCEA            120
SCP              81
SEP              26
GIE              22
GFA              19
SELAS            16
GFR              15
GAEC             12
GFO              11
INDIV            11
SCF               8
SDF               8
COOP              4
SADIR             3
SEM               2
Name: forme, dtype: int64

In [116]:
df.columns[166:]

Index(['ass', 'coop', 'divers', 'earl', 'eirl', 'eurl', 'gaec', 'gfa', 'gfo',
       'gfr', 'gie', 'ind', 'indiv', 'particulier', 'sa', 'sadir', 'sarl',
       'sas', 'sasu', 'sc', 'scea', 'scf', 'sci', 'scm', 'scp', 'sdf',
       'selarl', 'selas', 'sem', 'sep', 'snc'],
      dtype='object')

## Exportation du dataset en csv pour analyse et machine learning

In [117]:
df.to_csv("./Data/dataset-ML.csv", index=False)