# (03) Création des différents CSV nécessaires à l'importation des données

##### *Import des bibliothèques nécessaires pour le nettoyage et l'exportation des différents CSV nécessaires à la création de la BDD*

In [1]:
import pandas as pd

---

## 1 - Création du CSV pour la table "Product"

#### *Document numéro 1 : Datafiniti Electronics Product Data*

In [2]:
# Load the raw electronics product export (first source document).
df1 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductData.csv",
)

---

#### Nombre de lignes et colonnes

In [3]:
df1.shape

(7299, 27)

---

#### Affichage du nom des colonnes

In [4]:
df1.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'dateAdded',
       'dateUpdated', 'dimension', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'reviews.date',
       'reviews.dateSeen', 'reviews.doRecommend', 'reviews.numHelpful',
       'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title',
       'reviews.username', 'sourceURLs', 'upc', 'weight'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [5]:
# Remove every column that is not exported to the "Product" table.
df1 = df1.drop(
    columns=[
        "asins",
        "brand",
        "dimension",
        "ean",
        "keys",
        "manufacturer",
        "manufacturerNumber",
        "reviews.date",
        "reviews.dateSeen",
        "reviews.doRecommend",
        "reviews.numHelpful",
        "reviews.rating",
        "reviews.sourceURLs",
        "reviews.text",
        "reviews.title",
        "reviews.username",
        "upc",
        "weight",
    ]
)

In [6]:
df1.shape

(7299, 9)

---

#### Modification de la position des colonnes

In [7]:
# Put the remaining columns in the order wanted for the export.
df1 = df1.loc[
    :,
    [
        'id',
        'name',
        'colors',
        'primaryCategories',
        'categories',
        'dateAdded',
        'dateUpdated',
        'sourceURLs',
        'imageURLs',
    ],
]

In [8]:
df1.columns

Index(['id', 'name', 'colors', 'primaryCategories', 'categories', 'dateAdded',
       'dateUpdated', 'sourceURLs', 'imageURLs'],
      dtype='object')

---

#### *Document numéro 2 : Datafiniti Electronics Products Pricing Data*

In [9]:
# Load the raw electronics pricing export (second source document).
df2 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductsPricingData.csv",
)

---

#### Nombre de lignes et colonnes

In [10]:
df2.shape

(7249, 31)

---

#### Affichage du nom des colonnes

In [11]:
df2.columns

Index(['id', 'prices.amountMax', 'prices.amountMin', 'prices.availability',
       'prices.condition', 'prices.currency', 'prices.dateSeen',
       'prices.isSale', 'prices.merchant', 'prices.shipping',
       'prices.sourceURLs', 'asins', 'brand', 'categories', 'dateAdded',
       'dateUpdated', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'sourceURLs', 'upc',
       'weight', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29',
       'Unnamed: 30'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [12]:
# Remove the pricing, identifier and unnamed trailing columns; only the
# product-level attributes are kept from this document.
df2 = df2.drop(
    columns=[
        "prices.amountMax",
        "prices.amountMin",
        "prices.availability",
        "prices.condition",
        "prices.currency",
        "prices.dateSeen",
        "prices.isSale",
        "prices.merchant",
        "prices.shipping",
        "prices.sourceURLs",
        "asins",
        "brand",
        "ean",
        "keys",
        "manufacturer",
        "manufacturerNumber",
        "upc",
        "weight",
        "Unnamed: 26",
        "Unnamed: 27",
        "Unnamed: 28",
        "Unnamed: 29",
        "Unnamed: 30",
    ]
)

In [13]:
df2.shape

(7249, 8)

---

#### Modification de la position des colonnes

In [14]:
# Same column order as the first document (this one has no `colors` column).
df2 = df2.loc[
    :,
    [
        'id',
        'name',
        'primaryCategories',
        'categories',
        'dateAdded',
        'dateUpdated',
        'sourceURLs',
        'imageURLs',
    ],
]

In [15]:
df2.columns

Index(['id', 'name', 'primaryCategories', 'categories', 'dateAdded',
       'dateUpdated', 'sourceURLs', 'imageURLs'],
      dtype='object')

---

#### Concaténation des deux dataframes

In [16]:
# Stack both documents into one frame with a fresh 0..n-1 index.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` gives the same result on every pandas version.
# Columns missing from one side (`colors` in df2) are filled with NaN.
product = pd.concat([df1, df2], ignore_index=True, sort=False)

In [17]:
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14548 entries, 0 to 14547
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 14548 non-null  object
 1   name               14548 non-null  object
 2   colors             5280 non-null   object
 3   primaryCategories  14548 non-null  object
 4   categories         14548 non-null  object
 5   dateAdded          14548 non-null  object
 6   dateUpdated        14548 non-null  object
 7   sourceURLs         14548 non-null  object
 8   imageURLs          14548 non-null  object
dtypes: object(9)
memory usage: 1023.0+ KB


In [18]:
product.isnull().sum()

id                      0
name                    0
colors               9268
primaryCategories       0
categories              0
dateAdded               0
dateUpdated             0
sourceURLs              0
imageURLs               0
dtype: int64

---

#### Suppression des doublons

In [19]:
# One row per product id; the first occurrence of each id is the one kept.
product = product.drop_duplicates(subset="id", keep="first")

In [20]:
product.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 839 entries, 0 to 14537
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 839 non-null    object
 1   name               839 non-null    object
 2   colors             27 non-null     object
 3   primaryCategories  839 non-null    object
 4   categories         839 non-null    object
 5   dateAdded          839 non-null    object
 6   dateUpdated        839 non-null    object
 7   sourceURLs         839 non-null    object
 8   imageURLs          839 non-null    object
dtypes: object(9)
memory usage: 65.5+ KB


---

#### Remplacement des valeurs NULL dans "colors" par "Inconnue"

In [21]:
# Give missing colours an explicit label instead of exporting empty cells.
# The fill is done on the frame and reassigned: calling
# `fillna(inplace=True)` on `product["colors"]` is a chained in-place update,
# which pandas warns about and which has no effect under Copy-on-Write.
product = product.fillna({"colors": "Inconnue"})

In [22]:
product.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 839 entries, 0 to 14537
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 839 non-null    object
 1   name               839 non-null    object
 2   colors             839 non-null    object
 3   primaryCategories  839 non-null    object
 4   categories         839 non-null    object
 5   dateAdded          839 non-null    object
 6   dateUpdated        839 non-null    object
 7   sourceURLs         839 non-null    object
 8   imageURLs          839 non-null    object
dtypes: object(9)
memory usage: 65.5+ KB


---

#### Modification de la position des colonnes, des noms, puis export en CSV

In [23]:
# Fix the column layout of the exported file explicitly.
product = product.loc[
    :,
    [
        "id",
        "name",
        "colors",
        "primaryCategories",
        "categories",
        "dateAdded",
        "dateUpdated",
        "sourceURLs",
        "imageURLs",
    ],
]

In [24]:
# Expose the key as `product_id`. Reassigning instead of `inplace=True`
# avoids mutating a frame that was produced by slicing.
product = product.rename(columns={"id": "product_id"})

In [25]:
# Write the product table as comma-separated UTF-8, without the pandas index.
product.to_csv(
    path_or_buf='./CSV_Tables/product.csv',
    index=False,
    sep=',',
    encoding='utf-8',
)

---

## 2 - Création du CSV pour la table "Manufacturer"

#### *Document numéro 1 : Datafiniti Electronics Product Data*

In [26]:
# Reload the raw product export for the manufacturer table.
df3 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductData.csv",
)

---

#### Affichage du nom des colonnes

In [27]:
df3.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'dateAdded',
       'dateUpdated', 'dimension', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'reviews.date',
       'reviews.dateSeen', 'reviews.doRecommend', 'reviews.numHelpful',
       'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title',
       'reviews.username', 'sourceURLs', 'upc', 'weight'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [28]:
# Keep only `id`, `brand`, `manufacturer` and `manufacturerNumber`.
df3 = df3.drop(
    columns=[
        "asins",
        "categories",
        "colors",
        "dateAdded",
        "dateUpdated",
        "dimension",
        "ean",
        "imageURLs",
        "keys",
        "name",
        "primaryCategories",
        "reviews.date",
        "reviews.dateSeen",
        "reviews.doRecommend",
        "reviews.numHelpful",
        "reviews.rating",
        "reviews.sourceURLs",
        "reviews.text",
        "reviews.title",
        "reviews.username",
        "sourceURLs",
        "upc",
        "weight",
    ]
)

In [29]:
df3.columns

Index(['id', 'brand', 'manufacturer', 'manufacturerNumber'], dtype='object')

---

#### *Document numéro 2 : Datafiniti Electronics Products Pricing Data*

In [30]:
# Reload the raw pricing export for the manufacturer table.
df4 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductsPricingData.csv",
)

---

#### Affichage du nom des colonnes

In [31]:
df4.columns

Index(['id', 'prices.amountMax', 'prices.amountMin', 'prices.availability',
       'prices.condition', 'prices.currency', 'prices.dateSeen',
       'prices.isSale', 'prices.merchant', 'prices.shipping',
       'prices.sourceURLs', 'asins', 'brand', 'categories', 'dateAdded',
       'dateUpdated', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'sourceURLs', 'upc',
       'weight', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29',
       'Unnamed: 30'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [32]:
# Keep only `id`, `brand`, `manufacturer` and `manufacturerNumber`.
df4 = df4.drop(
    columns=[
        "prices.amountMax",
        "prices.amountMin",
        "prices.availability",
        "prices.condition",
        "prices.currency",
        "prices.dateSeen",
        "prices.isSale",
        "prices.merchant",
        "prices.shipping",
        "prices.sourceURLs",
        "asins",
        "categories",
        "dateAdded",
        "dateUpdated",
        "ean",
        "imageURLs",
        "keys",
        "name",
        "primaryCategories",
        "sourceURLs",
        "upc",
        "weight",
        "Unnamed: 26",
        "Unnamed: 27",
        "Unnamed: 28",
        "Unnamed: 29",
        "Unnamed: 30",
    ]
)

In [33]:
df4.columns

Index(['id', 'brand', 'manufacturer', 'manufacturerNumber'], dtype='object')

---

#### Concaténation des deux dataframes

In [34]:
# Stack the manufacturer columns of both documents with a fresh index.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` gives the same result on every pandas version.
manufacturer = pd.concat([df3, df4], ignore_index=True, sort=False)

In [35]:
manufacturer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14548 entries, 0 to 14547
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  14548 non-null  object
 1   brand               14548 non-null  object
 2   manufacturer        7867 non-null   object
 3   manufacturerNumber  14548 non-null  object
dtypes: object(4)
memory usage: 454.8+ KB


---

#### Suppression des doublons

In [36]:
# Remove rows that are identical on every column, keeping the first occurrence.
manufacturer = manufacturer.drop_duplicates(subset=None, keep="first")

In [37]:
manufacturer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 849 entries, 0 to 14537
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  849 non-null    object
 1   brand               849 non-null    object
 2   manufacturer        366 non-null    object
 3   manufacturerNumber  849 non-null    object
dtypes: object(4)
memory usage: 33.2+ KB


---

#### Remplacement des valeurs NULL dans "manufacturer" par "Inconnu"

In [38]:
# Give missing manufacturer names an explicit label.
# The fill is done on the frame and reassigned: calling
# `fillna(inplace=True)` on `manufacturer["manufacturer"]` is a chained
# in-place update, which pandas warns about and which has no effect under
# Copy-on-Write.
manufacturer = manufacturer.fillna({"manufacturer": "Inconnu"})

In [39]:
manufacturer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 849 entries, 0 to 14537
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  849 non-null    object
 1   brand               849 non-null    object
 2   manufacturer        849 non-null    object
 3   manufacturerNumber  849 non-null    object
dtypes: object(4)
memory usage: 33.2+ KB


---

#### Modification de la position des colonnes, des noms, puis export en CSV

In [40]:
# Export layout: manufacturer attributes first, product key last.
manufacturer = manufacturer.loc[
    :,
    [
        'manufacturerNumber',
        'manufacturer',
        'brand',
        'id',
    ],
]

In [41]:
# Expose the key as `product_id`. Reassigning instead of `inplace=True`
# avoids mutating a frame that was produced by slicing.
manufacturer = manufacturer.rename(columns={"id": "product_id"})

In [42]:
# Write the manufacturer table as comma-separated UTF-8, without the pandas index.
manufacturer.to_csv(
    path_or_buf='./CSV_Tables/manufacturer.csv',
    index=False,
    sep=',',
    encoding='utf-8',
)

---

## 3 - Création du CSV pour la table "Logistics"

#### *Document numéro 1 : Datafiniti Electronics Product Data*

In [43]:
# Reload the raw product export for the logistics table.
df5 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductData.csv",
)

---

#### Affichage du nom des colonnes

In [44]:
df5.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'dateAdded',
       'dateUpdated', 'dimension', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'reviews.date',
       'reviews.dateSeen', 'reviews.doRecommend', 'reviews.numHelpful',
       'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title',
       'reviews.username', 'sourceURLs', 'upc', 'weight'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [45]:
# Keep only the identifier / physical columns:
# id, asins, dimension, ean, keys, upc, weight.
df5 = df5.drop(
    columns=[
        "brand",
        "categories",
        "colors",
        "dateAdded",
        "dateUpdated",
        "imageURLs",
        "manufacturer",
        "manufacturerNumber",
        "name",
        "primaryCategories",
        "reviews.date",
        "reviews.dateSeen",
        "reviews.doRecommend",
        "reviews.numHelpful",
        "reviews.rating",
        "reviews.sourceURLs",
        "reviews.text",
        "reviews.title",
        "reviews.username",
        "sourceURLs",
    ]
)

In [46]:
df5.columns

Index(['id', 'asins', 'dimension', 'ean', 'keys', 'upc', 'weight'], dtype='object')

---

#### *Document numéro 2 : Datafiniti Electronics Products Pricing Data*

In [47]:
# Reload the raw pricing export for the logistics table.
df6 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductsPricingData.csv",
)

---

#### Affichage du nom des colonnes

In [48]:
df6.columns

Index(['id', 'prices.amountMax', 'prices.amountMin', 'prices.availability',
       'prices.condition', 'prices.currency', 'prices.dateSeen',
       'prices.isSale', 'prices.merchant', 'prices.shipping',
       'prices.sourceURLs', 'asins', 'brand', 'categories', 'dateAdded',
       'dateUpdated', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'sourceURLs', 'upc',
       'weight', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29',
       'Unnamed: 30'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [49]:
# Keep only id, asins, ean, keys, upc and weight (this document has no
# `dimension` column).
df6 = df6.drop(
    columns=[
        'prices.amountMax',
        'prices.amountMin',
        'prices.availability',
        'prices.condition',
        'prices.currency',
        'prices.dateSeen',
        'prices.isSale',
        'prices.merchant',
        'prices.shipping',
        'prices.sourceURLs',
        'brand',
        'categories',
        'dateAdded',
        'dateUpdated',
        'imageURLs',
        'manufacturer',
        'manufacturerNumber',
        'name',
        'primaryCategories',
        'sourceURLs',
        'Unnamed: 26',
        'Unnamed: 27',
        'Unnamed: 28',
        'Unnamed: 29',
        'Unnamed: 30',
    ]
)

In [50]:
df6.columns

Index(['id', 'asins', 'ean', 'keys', 'upc', 'weight'], dtype='object')

---

#### Concaténation des deux dataframes

In [51]:
# Stack the logistics columns of both documents with a fresh index.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` gives the same result on every pandas version.
# `dimension` only exists in df5, so rows coming from df6 get NaN there.
logistics = pd.concat([df5, df6], ignore_index=True, sort=False)

In [52]:
logistics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14548 entries, 0 to 14547
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         14548 non-null  object
 1   asins      14548 non-null  object
 2   dimension  6090 non-null   object
 3   ean        4494 non-null   object
 4   keys       14548 non-null  object
 5   upc        14548 non-null  object
 6   weight     14548 non-null  object
dtypes: object(7)
memory usage: 795.7+ KB


---

#### Suppression des doublons

In [53]:
# Remove rows that are identical on every column, keeping the first occurrence.
logistics = logistics.drop_duplicates(subset=None, keep="first")

In [54]:
logistics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 885 entries, 0 to 14537
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         885 non-null    object
 1   asins      885 non-null    object
 2   dimension  34 non-null     object
 3   ean        169 non-null    object
 4   keys       885 non-null    object
 5   upc        885 non-null    object
 6   weight     885 non-null    object
dtypes: object(7)
memory usage: 55.3+ KB


---

#### Remplacement des valeurs NULL dans "dimension" par "Inconnue"

In [55]:
# Give missing dimensions an explicit label (`ean` is left as-is).
# The fill is done on the frame and reassigned: calling
# `fillna(inplace=True)` on `logistics["dimension"]` is a chained in-place
# update, which pandas warns about and which has no effect under
# Copy-on-Write.
logistics = logistics.fillna({"dimension": "Inconnue"})

In [56]:
logistics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 885 entries, 0 to 14537
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         885 non-null    object
 1   asins      885 non-null    object
 2   dimension  885 non-null    object
 3   ean        169 non-null    object
 4   keys       885 non-null    object
 5   upc        885 non-null    object
 6   weight     885 non-null    object
dtypes: object(7)
memory usage: 55.3+ KB


---

#### Modification de la position des colonnes, des noms, puis export en CSV

In [57]:
# Export layout: identifier codes, physical attributes, then the product key.
logistics = logistics.loc[
    :,
    [
        'asins',
        'ean',
        'upc',
        'dimension',
        'weight',
        'keys',
        'id',
    ],
]

In [58]:
# Expose the key as `product_id`. Reassigning instead of `inplace=True`
# avoids mutating a frame that was produced by slicing.
logistics = logistics.rename(columns={"id": "product_id"})

In [59]:
# Write the logistics table as comma-separated UTF-8, without the pandas index.
logistics.to_csv(
    path_or_buf='./CSV_Tables/logistics.csv',
    index=False,
    sep=',',
    encoding='utf-8',
)

---

## 4 - Création du CSV pour la table "Reviews"

#### *Document numéro 1 : Datafiniti Electronics Product Data*

In [60]:
# Reload the raw product export for the reviews table.
df7 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductData.csv",
)

---

#### Affichage du nom des colonnes

In [61]:
df7.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'dateAdded',
       'dateUpdated', 'dimension', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'reviews.date',
       'reviews.dateSeen', 'reviews.doRecommend', 'reviews.numHelpful',
       'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title',
       'reviews.username', 'sourceURLs', 'upc', 'weight'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [62]:
# Keep only `id` and the `reviews.*` columns.
df7 = df7.drop(
    columns=[
        "asins",
        "brand",
        "categories",
        "colors",
        "dateAdded",
        "dateUpdated",
        "dimension",
        "ean",
        "imageURLs",
        "keys",
        "manufacturer",
        "manufacturerNumber",
        "name",
        "primaryCategories",
        "sourceURLs",
        "upc",
        "weight",
    ]
)

In [63]:
df7.columns

Index(['id', 'reviews.date', 'reviews.dateSeen', 'reviews.doRecommend',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.username'],
      dtype='object')

---

#### *Document numéro 2 : Datafiniti Electronics Products Pricing Data*

In [64]:
# Reload the raw pricing export for the reviews table.
df8 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductsPricingData.csv",
)

---

#### Affichage du nom des colonnes

In [65]:
df8.columns

Index(['id', 'prices.amountMax', 'prices.amountMin', 'prices.availability',
       'prices.condition', 'prices.currency', 'prices.dateSeen',
       'prices.isSale', 'prices.merchant', 'prices.shipping',
       'prices.sourceURLs', 'asins', 'brand', 'categories', 'dateAdded',
       'dateUpdated', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'sourceURLs', 'upc',
       'weight', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29',
       'Unnamed: 30'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [66]:
# The pricing document carries no review fields, so only `id` is left.
df8 = df8.drop(
    columns=[
        "prices.amountMax",
        "prices.amountMin",
        "prices.availability",
        "prices.condition",
        "prices.currency",
        "prices.dateSeen",
        "prices.isSale",
        "prices.merchant",
        "prices.shipping",
        "prices.sourceURLs",
        "asins",
        "brand",
        "categories",
        "dateAdded",
        "dateUpdated",
        "ean",
        "imageURLs",
        "keys",
        "manufacturer",
        "manufacturerNumber",
        "name",
        "primaryCategories",
        "sourceURLs",
        "upc",
        "weight",
        "Unnamed: 26",
        "Unnamed: 27",
        "Unnamed: 28",
        "Unnamed: 29",
        "Unnamed: 30",
    ]
)

In [67]:
df8.columns

Index(['id'], dtype='object')

---

#### Concaténation des deux dataframes

In [68]:
# Stack the review rows with the ids of the pricing document.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` gives the same result on every pandas version.
# NOTE(review): df8 only contains `id`, so each row it contributes has every
# review field empty — confirm these placeholder rows are wanted in the table.
reviews = pd.concat([df7, df8], ignore_index=True, sort=False)

In [69]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14548 entries, 0 to 14547
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   14548 non-null  object 
 1   reviews.date         7238 non-null   object 
 2   reviews.dateSeen     7299 non-null   object 
 3   reviews.doRecommend  5908 non-null   object 
 4   reviews.numHelpful   5813 non-null   float64
 5   reviews.rating       7135 non-null   float64
 6   reviews.sourceURLs   7299 non-null   object 
 7   reviews.text         7294 non-null   object 
 8   reviews.title        7295 non-null   object 
 9   reviews.username     7299 non-null   object 
dtypes: float64(2), object(8)
memory usage: 1.1+ MB


---

#### Suppression des doublons

In [70]:
# Remove rows that are identical on every column, keeping the first occurrence.
reviews = reviews.drop_duplicates(subset=None, keep="first")

In [71]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8134 entries, 0 to 14537
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   8134 non-null   object 
 1   reviews.date         7238 non-null   object 
 2   reviews.dateSeen     7299 non-null   object 
 3   reviews.doRecommend  5908 non-null   object 
 4   reviews.numHelpful   5813 non-null   float64
 5   reviews.rating       7135 non-null   float64
 6   reviews.sourceURLs   7299 non-null   object 
 7   reviews.text         7294 non-null   object 
 8   reviews.title        7295 non-null   object 
 9   reviews.username     7299 non-null   object 
dtypes: float64(2), object(8)
memory usage: 699.0+ KB


---

#### Modification de la position des colonnes, des noms, puis export en CSV

In [72]:
# Export layout: product key first, then the review fields.
reviews = reviews.loc[
    :,
    [
        "id",
        "reviews.username",
        "reviews.date",
        "reviews.title",
        "reviews.text",
        "reviews.rating",
        "reviews.doRecommend",
        "reviews.numHelpful",
        "reviews.dateSeen",
        "reviews.sourceURLs",
    ],
]

In [73]:
# Strip the `reviews.` prefix from the headers and expose the key as
# `product_id`. Reassigning instead of `inplace=True` avoids mutating a frame
# that was produced by slicing.
reviews = reviews.rename(
    columns={
        "id": "product_id",
        "reviews.username": "username",
        "reviews.date": "date",
        "reviews.title": "title",
        "reviews.text": "text",
        "reviews.rating": "rating",
        "reviews.doRecommend": "doRecommend",
        "reviews.numHelpful": "numHelpful",
        "reviews.dateSeen": "dateSeen",
        "reviews.sourceURLs": "sourceURLs",
    }
)

Autre méthode (supprime le préfixe `reviews.` de toutes les colonnes) : 
```python
reviews.columns = reviews.columns.str.replace("reviews.", "", regex=False)
```


In [74]:
reviews.rating.unique()

array([ 5.,  4.,  3.,  2.,  1., nan])

In [75]:
# Write the reviews table as comma-separated UTF-8, without the pandas index.
reviews.to_csv(
    path_or_buf='./CSV_Tables/reviews.csv',
    index=False,
    sep=',',
    encoding='utf-8',
)

---

## 5 - Création du CSV pour la table "Prices"

#### *Document numéro 1 : Datafiniti Electronics Product Data*

In [76]:
# Reload the raw product export for the prices table.
df9 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductData.csv",
)

---

#### Affichage du nom des colonnes

In [77]:
df9.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'dateAdded',
       'dateUpdated', 'dimension', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'reviews.date',
       'reviews.dateSeen', 'reviews.doRecommend', 'reviews.numHelpful',
       'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title',
       'reviews.username', 'sourceURLs', 'upc', 'weight'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [78]:
# The product document carries no price fields, so only `id` is left.
df9 = df9.drop(
    columns=[
        "asins",
        "brand",
        "categories",
        "colors",
        "dateAdded",
        "dateUpdated",
        "dimension",
        "ean",
        "imageURLs",
        "keys",
        "manufacturer",
        "manufacturerNumber",
        "name",
        "primaryCategories",
        "reviews.date",
        "reviews.dateSeen",
        "reviews.doRecommend",
        "reviews.numHelpful",
        "reviews.rating",
        "reviews.sourceURLs",
        "reviews.text",
        "reviews.title",
        "reviews.username",
        "sourceURLs",
        "upc",
        "weight",
    ]
)

In [79]:
df9.columns

Index(['id'], dtype='object')

---

#### *Document numéro 2 : Datafiniti Electronics Products Pricing Data*

In [80]:
# Reload the raw pricing export for the prices table.
df10 = pd.read_csv(
    filepath_or_buffer="./data/DatafinitiElectronicsProductsPricingData.csv",
)

---

#### Affichage du nom des colonnes

In [81]:
df10.columns

Index(['id', 'prices.amountMax', 'prices.amountMin', 'prices.availability',
       'prices.condition', 'prices.currency', 'prices.dateSeen',
       'prices.isSale', 'prices.merchant', 'prices.shipping',
       'prices.sourceURLs', 'asins', 'brand', 'categories', 'dateAdded',
       'dateUpdated', 'ean', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'primaryCategories', 'sourceURLs', 'upc',
       'weight', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29',
       'Unnamed: 30'],
      dtype='object')

---

#### Suppression des colonnes non utilisées

In [82]:
# Keep only `id` and the `prices.*` columns.
df10 = df10.drop(
    columns=[
        "asins",
        "brand",
        "categories",
        "dateAdded",
        "dateUpdated",
        "ean",
        "imageURLs",
        "keys",
        "manufacturer",
        "manufacturerNumber",
        "name",
        "primaryCategories",
        "sourceURLs",
        "upc",
        "weight",
        "Unnamed: 26",
        "Unnamed: 27",
        "Unnamed: 28",
        "Unnamed: 29",
        "Unnamed: 30",
    ]
)

In [83]:
df10.columns

Index(['id', 'prices.amountMax', 'prices.amountMin', 'prices.availability',
       'prices.condition', 'prices.currency', 'prices.dateSeen',
       'prices.isSale', 'prices.merchant', 'prices.shipping',
       'prices.sourceURLs'],
      dtype='object')

---

#### Concaténation des deux dataframes

In [84]:
# Stack the ids of the product document with the price rows.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` gives the same result on every pandas version.
# NOTE(review): df9 only contains `id`, so each row it contributes has every
# price field empty — confirm these placeholder rows are wanted in the table.
prices = pd.concat([df9, df10], ignore_index=True, sort=False)

In [85]:
prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14548 entries, 0 to 14547
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   14548 non-null  object 
 1   prices.amountMax     7249 non-null   float64
 2   prices.amountMin     7249 non-null   float64
 3   prices.availability  7249 non-null   object 
 4   prices.condition     7249 non-null   object 
 5   prices.currency      7249 non-null   object 
 6   prices.dateSeen      7249 non-null   object 
 7   prices.isSale        7249 non-null   object 
 8   prices.merchant      7249 non-null   object 
 9   prices.shipping      4277 non-null   object 
 10  prices.sourceURLs    7249 non-null   object 
dtypes: float64(2), object(9)
memory usage: 1.2+ MB


---

#### Suppression des doublons

In [86]:
# Remove rows that are identical on every column, keeping the first occurrence.
prices = prices.drop_duplicates(subset=None, keep="first")

In [87]:
prices.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7299 entries, 0 to 14547
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   7299 non-null   object 
 1   prices.amountMax     7249 non-null   float64
 2   prices.amountMin     7249 non-null   float64
 3   prices.availability  7249 non-null   object 
 4   prices.condition     7249 non-null   object 
 5   prices.currency      7249 non-null   object 
 6   prices.dateSeen      7249 non-null   object 
 7   prices.isSale        7249 non-null   object 
 8   prices.merchant      7249 non-null   object 
 9   prices.shipping      4277 non-null   object 
 10  prices.sourceURLs    7249 non-null   object 
dtypes: float64(2), object(9)
memory usage: 684.3+ KB


---

#### Modification de la position des colonnes, des noms, puis export en CSV

In [88]:
# Export layout: price fields first, product key last.
prices = prices.loc[
    :,
    [
        "prices.amountMax",
        "prices.amountMin",
        "prices.availability",
        "prices.condition",
        "prices.currency",
        "prices.dateSeen",
        "prices.isSale",
        "prices.merchant",
        "prices.shipping",
        "prices.sourceURLs",
        "id",
    ],
]

In [89]:
# Strip the `prices.` prefix from the headers and expose the key as
# `product_id`.
# The merchant entry used the key "prices.Merchant", which matches no column
# (the source column is lower-case "prices.merchant"); rename() ignores
# unknown keys, so the exported CSV kept the prefixed header.
# NOTE(review): the new header is "merchant", matching the casing of the other
# columns — confirm against the Prices table schema.
# Reassigning instead of `inplace=True` avoids mutating a frame that was
# produced by slicing.
prices = prices.rename(
    columns={
        "prices.amountMax": "amountMax",
        "prices.amountMin": "amountMin",
        "prices.availability": "availability",
        "prices.condition": "condition",
        "prices.currency": "currency",
        "prices.dateSeen": "dateSeen",
        "prices.isSale": "isSale",
        "prices.merchant": "merchant",
        "prices.shipping": "shipping",
        "prices.sourceURLs": "sourceURLs",
        "id": "product_id",
    }
)

In [90]:
# Write the prices table as comma-separated UTF-8, without the pandas index.
prices.to_csv(
    path_or_buf='./CSV_Tables/prices.csv',
    index=False,
    sep=',',
    encoding='utf-8',
)

---