# Preprocessing Task
- Load dataset as DataFrame. 
- Skip the natural language and only focus on the categorical columns.
- Observating, grading, missing value and normalization. 

In [1]:
import pandas as pd
import numpy as np

# Load the data
Click [here](https://www.data.gouv.fr/fr/datasets/observatoire-edtech/) for more info about the dataset.

In [2]:
# !pip install xlrd # install extra dependency to read excel
data = pd.read_excel('./observatoireedtechbdd14102019.xlsx', sep='\t', parse_dates=True, squeeze=True)

# First look at the data
- Head;
- Name of columns;
- Type of columns
- The percentage of missing values in each column;
- The percentage of missing values in each row;
- Describe.

### Head

In [3]:
data.head(4)

Unnamed: 0,ID,Nom startup,description fr,description en,web,creation,Dirigeants,Nbr employes,employeesref,Code postal,...,filter_tech_12,filter_tech_13,users_1,users_2,users_3,users_4,users_5,users_6,users_7,users_8
0,2,1to1PROGRESS,Organisme de formation linguistique à distance...,French training organism specialized in lingui...,1to1progress.com,2011,"Laurent Zalc, Jonathan Ways",1-5,4.0,75014,...,,,,,,,,Particuliers,Individus en emploi,Individus en reconversion ou sans emploi
1,3,360Learning,"Créez et diffusez rapidement vos cours, en lig...",Quickly create and send online courses with en...,http://360learning.com,2010,"Nicolas Hernandez, Guillaume Alary, Sébastien ...",41 — 80,5.0,75008,...,,,,,Étudiants,Apprentis et étudiants en voie professionnelle,Établissement et professionnels de l'enseignement,Particuliers,,
2,4,3W Academy,formation au métier de développeur,We train web developpers,https://3wa.fr,2012,Djamchid Dalili,1-5,3.0,75018,...,,,,,Étudiants,Apprentis et étudiants en voie professionnelle,Établissement et professionnels de l'enseignement,Particuliers,,Individus en reconversion ou sans emploi
3,6,Academyk,La mission d’Academyk est de développer les co...,Academyk's mission is to develop knowledge and...,academyk.org,2015,Julien Andre,1-5,1.0,75003,...,,,,,Étudiants,Apprentis et étudiants en voie professionnelle,Établissement et professionnels de l'enseignement,Particuliers,,


### Name of columns

In [4]:
data.columns

Index(['ID', 'Nom startup', 'description fr', 'description en', 'web',
       'creation', 'Dirigeants', 'Nbr employes', 'employeesref', 'Code postal',
       'Région', 'Logo', 'A levé des fonds', 'Souhaite levé des fonds',
       'Récompensé', 'enregistré le', 'Id video youtube', 'internationale',
       'Pays', 'facebook', 'twitter', 'linkedin', 'type', 'filter_types_1',
       'filter_types_2', 'filter_types_3', 'filter_clients_1',
       'filter_clients_2', 'filter_clients_3', 'filter_clients_4',
       'filter_clients_5', 'filter_clients_6', 'filter_services_1',
       'filter_services_2', 'filter_services_3', 'filter_services_4',
       'filter_services_5', 'filter_services_6', 'filter_services_7',
       'filter_services_8', 'filter_services_9', 'filter_secteurs_1',
       'filter_secteurs_2', 'filter_secteurs_3', 'filter_tech_1',
       'filter_tech_2', 'filter_tech_3', 'filter_tech_4', 'filter_tech_5',
       'filter_tech_6', 'filter_tech_7', 'filter_tech_8', 'filter_tech_9',
 

### Type of columns

In [5]:
data.dtypes

ID                 int64
Nom startup       object
description fr    object
description en    object
web               object
                   ...  
users_4           object
users_5           object
users_6           object
users_7           object
users_8           object
Length: 65, dtype: object

### The percentage of missing values in each column

In [6]:
data.isnull().sum()/len(data)*100

ID                 0.000000
Nom startup        0.000000
description fr     0.000000
description en     0.000000
web                0.000000
                    ...    
users_4           68.649886
users_5           55.835240
users_6           57.665904
users_7           55.377574
users_8           70.938215
Length: 65, dtype: float64

### The percentage of missing values in each row

In [7]:
for i in range(len(data)) :
    print("Nan in row ", i , " : " ,  data.iloc[i].isnull().sum()/len(data.columns)*100)

Nan in row  0  :  52.307692307692314
Nan in row  1  :  44.61538461538462
Nan in row  2  :  50.76923076923077
Nan in row  3  :  52.307692307692314
Nan in row  4  :  56.92307692307692
Nan in row  5  :  50.76923076923077
Nan in row  6  :  56.92307692307692
Nan in row  7  :  41.53846153846154
Nan in row  8  :  49.23076923076923
Nan in row  9  :  49.23076923076923
Nan in row  10  :  60.0
Nan in row  11  :  49.23076923076923
Nan in row  12  :  55.38461538461539
Nan in row  13  :  53.84615384615385
Nan in row  14  :  56.92307692307692
Nan in row  15  :  58.46153846153847
Nan in row  16  :  56.92307692307692
Nan in row  17  :  58.46153846153847
Nan in row  18  :  56.92307692307692
Nan in row  19  :  49.23076923076923
Nan in row  20  :  44.61538461538462
Nan in row  21  :  56.92307692307692
Nan in row  22  :  35.38461538461539
Nan in row  23  :  52.307692307692314
Nan in row  24  :  41.53846153846154
Nan in row  25  :  55.38461538461539
Nan in row  26  :  66.15384615384615
Nan in row  27  :  49

Nan in row  291  :  56.92307692307692
Nan in row  292  :  56.92307692307692
Nan in row  293  :  52.307692307692314
Nan in row  294  :  56.92307692307692
Nan in row  295  :  56.92307692307692
Nan in row  296  :  44.61538461538462
Nan in row  297  :  60.0
Nan in row  298  :  58.46153846153847
Nan in row  299  :  43.07692307692308
Nan in row  300  :  61.53846153846154
Nan in row  301  :  38.46153846153847
Nan in row  302  :  52.307692307692314
Nan in row  303  :  46.15384615384615
Nan in row  304  :  58.46153846153847
Nan in row  305  :  53.84615384615385
Nan in row  306  :  55.38461538461539
Nan in row  307  :  58.46153846153847
Nan in row  308  :  52.307692307692314
Nan in row  309  :  50.76923076923077
Nan in row  310  :  55.38461538461539
Nan in row  311  :  49.23076923076923
Nan in row  312  :  58.46153846153847
Nan in row  313  :  52.307692307692314
Nan in row  314  :  52.307692307692314
Nan in row  315  :  43.07692307692308
Nan in row  316  :  49.23076923076923
Nan in row  317  :  

### Describe

In [8]:
data.describe(include='all')

Unnamed: 0,ID,Nom startup,description fr,description en,web,creation,Dirigeants,Nbr employes,employeesref,Code postal,...,filter_tech_12,filter_tech_13,users_1,users_2,users_3,users_4,users_5,users_6,users_7,users_8
count,437.0,437,437,437,437,437.0,434,437,301.0,437.0,...,27,37,49,182,205,137,193,185,195,127
unique,,437,437,437,437,31.0,432,16,,,...,1,1,1,1,1,1,1,1,1,1
top,,Academyk,Plateforme de e-learning pour les professionne...,School Online University is the digital soluti...,www.les-sherpas.co,2015.0,Sylvain Pont,1-5,,,...,IoT / digital school places,Autres,Élèves de maternelle,Élèves du CP à la terminale,Étudiants,Apprentis et étudiants en voie professionnelle,Établissement et professionnels de l'enseignement,Particuliers,Individus en emploi,Individus en reconversion ou sans emploi
freq,,1,1,1,1,65.0,2,290,,,...,27,37,49,182,205,137,193,185,195,127
mean,245.789474,,,,,,,,1.976744,68278.12357,...,,,,,,,,,,
std,150.152175,,,,,,,,1.33021,21514.502425,...,,,,,,,,,,
min,2.0,,,,,,,,1.0,2491.0,...,,,,,,,,,,
25%,119.0,,,,,,,,1.0,64000.0,...,,,,,,,,,,
50%,232.0,,,,,,,,1.0,75008.0,...,,,,,,,,,,
75%,377.0,,,,,,,,3.0,75019.0,...,,,,,,,,,,


# First clean of data
### Drop the columns which is not numeric nor categorical
To get rid of natural language processing

Columns to be dropped:
- Nom startup
- description fr
- description en
- web
- Dirigeants
- Logo

In [9]:
data.drop(['Nom startup', 'description fr','description en', 'web', 'Dirigeants', 'Logo', 'enregistré le', 'Pays'], axis=1, inplace=True)

data.head(4)

Unnamed: 0,ID,creation,Nbr employes,employeesref,Code postal,Région,A levé des fonds,Souhaite levé des fonds,Récompensé,Id video youtube,...,filter_tech_12,filter_tech_13,users_1,users_2,users_3,users_4,users_5,users_6,users_7,users_8
0,2,2011,1-5,4.0,75014,IDF,1,0,0,,...,,,,,,,,Particuliers,Individus en emploi,Individus en reconversion ou sans emploi
1,3,2010,41 — 80,5.0,75008,IDF,1,0,0,,...,,,,,Étudiants,Apprentis et étudiants en voie professionnelle,Établissement et professionnels de l'enseignement,Particuliers,,
2,4,2012,1-5,3.0,75018,IDF,1,0,1,,...,,,,,Étudiants,Apprentis et étudiants en voie professionnelle,Établissement et professionnels de l'enseignement,Particuliers,,Individus en reconversion ou sans emploi
3,6,2015,1-5,1.0,75003,IDF,1,0,1,,...,,,,,Étudiants,Apprentis et étudiants en voie professionnelle,Établissement et professionnels de l'enseignement,Particuliers,,


# Column preprocessing - Type 1

## Column creation
### convert unformatted data
- Categorical
- Replace '2014-01-01 00:00:00' with 2014 (missing values < 50%)

In [10]:
#orignal data
data.creation.unique()

array([2011, 2010, 2012, 2015, 2013, '2014-01-01 00:00:00', 2016, 2014,
       2017, 2004, 2009, 2000, 2018, 2001, 2002, 2005, 2006, 2007, 2003,
       2008, 1986, 1999, 1997, 1994, 1987, 1992, 1968, 1998, 2019, 1991,
       1995], dtype=object)

In [11]:
if (data.creation.isnull().sum()/len(data)) > 50:
    print('meaningless column')
else:
    data.creation.replace('2014-01-01 00:00:00', 2014, inplace=True)
    print('Done')

data.creation.unique()

Done


array([2011, 2010, 2012, 2015, 2013, 2014, 2016, 2017, 2004, 2009, 2000,
       2018, 2001, 2002, 2005, 2006, 2007, 2003, 2008, 1986, 1999, 1997,
       1994, 1987, 1992, 1968, 1998, 2019, 1991, 1995], dtype=int64)

## Column Nbr employes
### convert unformatted data
- Categorical
- Replace '1-5', '1 — 5' and '1-mai' with '[1, 5]';
- Replace '6-10', '6 — 10' and '6—10' with '[6, 10]';
- Replace '11-20', '11 — 20' and 'nov.-20' with '[11, 20]';
- Replace '21-40' and '21 — 40' with '[21, 40]';
- Replace '41-80' and '41 — 80' with '[41, 80]';
- Replace '81-100' and '81 — 100' with '[81, 100]';
- Replace '>100' with '[100, ...]';

In [12]:
#orignal data
data['Nbr employes'].unique()

array(['1-5', '41 — 80', '11-20', '1 — 5', '81-100', 'nov.-20', '6-10',
       '21-40', '41-80', '>100', '21 — 40', '6 — 10', '81 — 100', '6—10',
       '11 — 20', '1-mai'], dtype=object)

In [13]:
data['Nbr employes'].replace('1-5', '[1, 5]', inplace=True)
data['Nbr employes'].replace('1 — 5', '[1, 5]', inplace=True)
data['Nbr employes'].replace('1-mai', '[1, 5]', inplace=True)
data['Nbr employes'].replace('6-10', '[6, 10]', inplace=True)
data['Nbr employes'].replace('6 — 10', '[6, 10]', inplace=True)
data['Nbr employes'].replace('6—10', '[6, 10]', inplace=True)
data['Nbr employes'].replace('11-20', '[11, 20]', inplace=True)
data['Nbr employes'].replace('11 — 20', '[11, 20]', inplace=True)
data['Nbr employes'].replace('nov.-20', '[11, 20]', inplace=True)
data['Nbr employes'].replace('21-40', '[21, 40]', inplace=True)
data['Nbr employes'].replace('21 — 40', '[21, 40]', inplace=True)
data['Nbr employes'].replace('41-80','[41, 80]', inplace=True)
data['Nbr employes'].replace('41 — 80', '[41, 80]', inplace=True)
data['Nbr employes'].replace('81-100' , '[81, 100]', inplace=True)
data['Nbr employes'].replace('81 — 100' , '[81, 100]', inplace=True)
data['Nbr employes'].replace('>100' , '[100, ...]', inplace=True)
                             
data['Nbr employes'].unique()

array(['[1, 5]', '[41, 80]', '[11, 20]', '[81, 100]', '[6, 10]',
       '[21, 40]', '[100, ...]'], dtype=object)

## Column employeesref
### convert unformatted data
- Categorical
- Replace nan with the mean value 1.98 (missing values < 50%)

In [14]:
#orignal data
data['employeesref'].unique()

array([ 4.,  5.,  3.,  1.,  2.,  7.,  6., nan])

In [15]:
data['employeesref'].describe(include='all')

count    301.000000
mean       1.976744
std        1.330210
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max        7.000000
Name: employeesref, dtype: float64

In [16]:
if (data.creation.isnull().sum()/len(data)) > 50:
    print('meaningless column')
else:
    data['employeesref'].fillna(1.98, inplace=True)
    print('Done')
                             
data['employeesref'].unique()

Done


array([4.  , 5.  , 3.  , 1.  , 2.  , 7.  , 6.  , 1.98])

In [45]:
data['employeesref'].dtypes

dtype('float64')

# Column preprocessing - Type 2

## Column Région
- Categorical
- Replace nan with the 'Unknown' (missing values < 50%)

In [17]:
#orignal data
data['Région'].unique()

array(['IDF', 'ARA', 'PDLL', 'OCC', 'NO', nan, 'HDF', 'GE', 'PACA', 'BR',
       'BFC', 'CO', 'CVDL', 'OTH'], dtype=object)

In [18]:
if (data.creation.isnull().sum()/len(data)) > 50:
    print('meaningless column')
else:
    data['Région'].fillna('Unknown', inplace=True)
    print('Done')
                             
data['Région'].unique()

Done


array(['IDF', 'ARA', 'PDLL', 'OCC', 'NO', 'Unknown', 'HDF', 'GE', 'PACA',
       'BR', 'BFC', 'CO', 'CVDL', 'OTH'], dtype=object)

## Column internationale
- Categorical
- Replace nan with the -1 (missing values < 50%)

In [19]:
#orignal data
data['internationale'].unique()

array([nan,  0.,  1.])

In [20]:
if (data.internationale.isnull().sum()/len(data)) > 50:
    print('meaningless column')
else:
    data['internationale'].fillna(-1, inplace=True)
    print('Done')
                             
data['internationale'].unique()

Done


array([-1.,  0.,  1.])

# Column preprocessing - Type 3

## Column Id video youtube
- Categorical
- Replace nan with the -1 (missing values < 50%)
- Count category of existing values
- Replace existing values with the 1

In [21]:
# Orignal data
if (data['Id video youtube'].isnull().sum()/len(data)) > 50:
    print('meaningless column')
else:
    print('Id video youtube')
    print('Missing values are less than a half. ')
    print('Totally', str(len(data['Id video youtube'].unique())-1), 'different values.')

Id video youtube
Missing values are less than a half. 
Totally 94 different values.


In [22]:
data['Id video youtube'].loc[~data['Id video youtube'].isnull()] = 1

data['Id video youtube'].fillna(-1, inplace=True)

data['Id video youtube'].unique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


array([-1,  1], dtype=int64)

## Column facebook
- Categorical
- Replace nan with the -1 (missing values < 50%)
- Count category of existing values
- Replace existing values with the 1

In [23]:
# Orignal data
if (data['facebook'].isnull().sum()/len(data)) > 50:
    print('meaningless column')
else:
    print('facebook')
    print('Missing values are less than a half. ')
    print('Totally', str(len(data['facebook'].unique())-1), 'different values.')

facebook
Missing values are less than a half. 
Totally 197 different values.


In [24]:
data['facebook'].loc[~data['facebook'].isnull()] = 1

data['facebook'].fillna(-1, inplace=True)

data['facebook'].unique()

array([-1,  1], dtype=int64)

## Column twitter
- Categorical
- Replace nan with the -1 (missing values < 50%)
- Count category of existing values
- Replace existing values with the 1

In [25]:
# Orignal data
if (data['twitter'].isnull().sum()/len(data)) > 50:
    print('meaningless column')
else:
    print('twitter')
    print('Missing values are less than a half. ')
    print('Totally', str(len(data['twitter'].unique())-1), 'different values.')

twitter
Missing values are less than a half. 
Totally 193 different values.


In [26]:
data['twitter'].loc[~data['twitter'].isnull()] = 1

data['twitter'].fillna(-1, inplace=True)

data['twitter'].unique()

array([-1,  1], dtype=int64)

## Column linkedin
- Categorical
- Replace nan with the -1 (missing values < 50%)
- Count category of existing values
- Replace existing values with the 1

In [27]:
# Orignal data
if (data['linkedin'].isnull().sum()/len(data)) > 50:
    print('meaningless column')
else:
    print('linkedin')
    print('Missing values are less than a half. ')
    print('Totally', str(len(data['linkedin'].unique())-1), 'different values.')

linkedin
Missing values are less than a half. 
Totally 320 different values.


In [28]:
data['linkedin'].loc[~data['linkedin'].isnull()] = 1

data['linkedin'].fillna(-1, inplace=True)

data['linkedin'].unique()

array([ 1, -1], dtype=int64)

# Column preprocessing - Type 4
## Column Id video filter_types_X, filter_clients_X, filter_services_X, filter_secteurs_X, filter_tech_X & users_X
- Binary: Specific string or nan
- Replace nan with the -1
- Replace existing values (the specific string) with the 1

In [29]:
# colomn selection
filter_col = [col for col in data if col.startswith(('filter_types_', 'filter_clients_', 'filter_services_', 'filter_secteurs_', 
                                                    'filter_tech_', 'users_'))]
filter_col

['filter_types_1',
 'filter_types_2',
 'filter_types_3',
 'filter_clients_1',
 'filter_clients_2',
 'filter_clients_3',
 'filter_clients_4',
 'filter_clients_5',
 'filter_clients_6',
 'filter_services_1',
 'filter_services_2',
 'filter_services_3',
 'filter_services_4',
 'filter_services_5',
 'filter_services_6',
 'filter_services_7',
 'filter_services_8',
 'filter_services_9',
 'filter_secteurs_1',
 'filter_secteurs_2',
 'filter_secteurs_3',
 'filter_tech_1',
 'filter_tech_2',
 'filter_tech_3',
 'filter_tech_4',
 'filter_tech_5',
 'filter_tech_6',
 'filter_tech_7',
 'filter_tech_8',
 'filter_tech_9',
 'filter_tech_10',
 'filter_tech_11',
 'filter_tech_12',
 'filter_tech_13',
 'users_1',
 'users_2',
 'users_3',
 'users_4',
 'users_5',
 'users_6',
 'users_7',
 'users_8']

In [30]:
for col in filter_col:
    if len(data[col].unique()) != 2:
        print(col)
        break
    else:
        data[col].loc[~data[col].isnull()] = 1
        data[col].fillna(-1, inplace=True)

## Column Code postal, A levé des fonds, Souhaite levé des fonds, Récompensé

### Keep the original data
- Categorical

In [31]:
data['Code postal'].unique()

array([75014, 75008, 75018, 75003, 69003, 95880, 75016, 92270, 49070,
       75010, 75000, 75020, 75017, 78100, 44300, 31250, 92410, 95100,
       75005, 95000, 78300, 75009, 75019, 38600, 34170, 34000, 92300,
       94270, 92100, 14200, 75001, 86000, 75011, 95170, 45000, 75002,
       67300, 69001, 75013, 74940,  6560, 53260, 31500, 30000, 94250,
       92200, 75015, 79270, 85000, 69006, 75012, 77420, 69002, 35000,
       17000, 59110, 44800, 16000,  2491, 83520, 75006, 38000, 59000,
       77000, 92120, 93100, 35400, 59420, 21600, 94000, 84000, 69004,
       20200,  6300, 72230, 91440, 91190, 13100, 59700, 35510, 93310,
       44000, 92130, 92700, 42400, 92000, 29630, 83000, 35700, 62790,
       64210, 59100, 78000, 33360, 43000, 31240, 93300, 37000, 94300,
       17220, 78110, 59200, 91200, 92150, 16160, 74100, 33300, 92600,
       91080, 63000, 92210, 64000, 44240, 13090, 75007, 78200, 67100,
       53000, 30900, 44200, 69100, 75004, 69009,  6000, 34400, 59650,
       64400, 83400,

In [43]:
#orignal data
data['A levé des fonds'].unique()

array([1, 0], dtype=int64)

In [33]:
#orignal data
data['Souhaite levé des fonds'].unique()

array([0, 1], dtype=int64)

In [34]:
#orignal data
data['Récompensé'].unique()

array([0, 1], dtype=int64)

# Look at the clean data
- Head;
- Name of columns;
- Type of columns
- Number of columns with miss values
- Describe.

### Head

In [35]:
data.head(4)

Unnamed: 0,ID,creation,Nbr employes,employeesref,Code postal,Région,A levé des fonds,Souhaite levé des fonds,Récompensé,Id video youtube,...,filter_tech_12,filter_tech_13,users_1,users_2,users_3,users_4,users_5,users_6,users_7,users_8
0,2,2011,"[1, 5]",4.0,75014,IDF,1,0,0,-1,...,-1,-1,-1,-1,-1,-1,-1,1,1,1
1,3,2010,"[41, 80]",5.0,75008,IDF,1,0,0,-1,...,-1,-1,-1,-1,1,1,1,1,-1,-1
2,4,2012,"[1, 5]",3.0,75018,IDF,1,0,1,-1,...,-1,-1,-1,-1,1,1,1,1,-1,1
3,6,2015,"[1, 5]",1.0,75003,IDF,1,0,1,-1,...,-1,-1,-1,-1,1,1,1,1,-1,-1


### Name of columns

In [36]:
data.columns

Index(['ID', 'creation', 'Nbr employes', 'employeesref', 'Code postal',
       'Région', 'A levé des fonds', 'Souhaite levé des fonds', 'Récompensé',
       'Id video youtube', 'internationale', 'facebook', 'twitter', 'linkedin',
       'type', 'filter_types_1', 'filter_types_2', 'filter_types_3',
       'filter_clients_1', 'filter_clients_2', 'filter_clients_3',
       'filter_clients_4', 'filter_clients_5', 'filter_clients_6',
       'filter_services_1', 'filter_services_2', 'filter_services_3',
       'filter_services_4', 'filter_services_5', 'filter_services_6',
       'filter_services_7', 'filter_services_8', 'filter_services_9',
       'filter_secteurs_1', 'filter_secteurs_2', 'filter_secteurs_3',
       'filter_tech_1', 'filter_tech_2', 'filter_tech_3', 'filter_tech_4',
       'filter_tech_5', 'filter_tech_6', 'filter_tech_7', 'filter_tech_8',
       'filter_tech_9', 'filter_tech_10', 'filter_tech_11', 'filter_tech_12',
       'filter_tech_13', 'users_1', 'users_2', 'users_3', '

### Type of columns

In [37]:
data.dtypes

ID                           int64
creation                     int64
Nbr employes                object
employeesref               float64
Code postal                  int64
Région                      object
A levé des fonds             int64
Souhaite levé des fonds      int64
Récompensé                   int64
Id video youtube             int64
internationale             float64
facebook                     int64
twitter                      int64
linkedin                     int64
type                        object
filter_types_1               int64
filter_types_2               int64
filter_types_3               int64
filter_clients_1             int64
filter_clients_2             int64
filter_clients_3             int64
filter_clients_4             int64
filter_clients_5             int64
filter_clients_6             int64
filter_services_1            int64
filter_services_2            int64
filter_services_3            int64
filter_services_4            int64
filter_services_5   

### Number of columns with miss values

In [41]:
sum((data.isnull().sum()/len(data))!= 0)

0

### Describe

In [39]:
data.describe(include='all')

Unnamed: 0,ID,creation,Nbr employes,employeesref,Code postal,Région,A levé des fonds,Souhaite levé des fonds,Récompensé,Id video youtube,...,filter_tech_12,filter_tech_13,users_1,users_2,users_3,users_4,users_5,users_6,users_7,users_8
count,437.0,437.0,437,437.0,437.0,437,437.0,437.0,437.0,437.0,...,437.0,437.0,437.0,437.0,437.0,437.0,437.0,437.0,437.0,437.0
unique,,,7,,,14,,,,,...,,,,,,,,,,
top,,,"[1, 5]",,,IDF,,,,,...,,,,,,,,,,
freq,,,299,,,280,,,,,...,,,,,,,,,,
mean,245.789474,2012.789474,,1.977757,68278.12357,,0.501144,0.304348,0.530892,-0.569794,...,-0.87643,-0.830664,-0.775744,-0.167048,-0.061785,-0.372998,-0.116705,-0.153318,-0.107551,-0.418764
std,150.152175,5.658198,,1.103412,21514.502425,,0.500572,0.460658,0.499617,0.822729,...,0.482081,0.557413,0.631771,0.987079,0.999233,0.928896,0.994305,0.989309,0.995339,0.909136
min,2.0,1968.0,,1.0,2491.0,,0.0,0.0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,119.0,2011.0,,1.0,64000.0,,0.0,0.0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,232.0,2015.0,,1.98,75008.0,,1.0,0.0,1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
75%,377.0,2016.0,,2.0,75019.0,,1.0,1.0,1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
