# Projet - EDA

In [1]:
import json
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Sports_and_Outdoors_5.json.gz')

# Garder seulement les 100 premières lignes
df = df.head(100)

In [2]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,vote,image
0,5.0,True,"06 3, 2015",A180LQZBUWVOLF,32034,Michelle A,What a spectacular tutu! Very slimming.,Five Stars,1433289600,,,
1,1.0,True,"04 1, 2015",ATMFGKU5SVEYY,32034,Crystal R,What the heck? Is this a tutu for nuns? I know...,Is this a tutu for nuns?!,1427846400,,,
2,5.0,True,"01 13, 2015",A1QE70QBJ8U6ZG,32034,darla Landreth,Exactly what we were looking for!,Five Stars,1421107200,,,
3,5.0,True,"12 23, 2014",A22CP6Z73MZTYU,32034,L. Huynh,I used this skirt for a Halloween costume and ...,I liked that the elastic waist didn't dig in (...,1419292800,,,
4,4.0,True,"12 15, 2014",A22L28G8NRNLLN,32034,McKenna,This is thick enough that you can't see throug...,This is thick enough that you can't see throug...,1418601600,,,


In [3]:
from mlxtend.frequent_patterns import apriori, association_rules, fpmax
from sklearn.preprocessing import LabelEncoder

# Nettoyage des données
df_cleaned = df[['reviewerID', 'asin']]

# Encodage des données
label_encoder = LabelEncoder()
df_cleaned['reviewerID_encoded'] = label_encoder.fit_transform(df_cleaned['reviewerID'])
df_cleaned['asin_encoded'] = label_encoder.fit_transform(df_cleaned['asin'])

# Créer une colonne pour les occurrences
df_cleaned['occurrence'] = 1

# Pivoter la table pour avoir les occurrences par utilisateur et produit
pivoted_df = df_cleaned.pivot_table(index='reviewerID_encoded', columns='asin_encoded', values='occurrence', fill_value=0)

# Transformer les valeurs en binaire
binary_df = (pivoted_df > 0).astype(int) # Changer le seuil

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['reviewerID_encoded'] = label_encoder.fit_transform(df_cleaned['reviewerID'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['asin_encoded'] = label_encoder.fit_transform(df_cleaned['asin'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['occurrence'] = 1


In [4]:
# Extraction des motifs fréquents
frequent_itemsets = apriori(binary_df, min_support=0.01, use_colnames=True)

# Afficher les motifs fréquents
print("Motifs fréquents :")
print(frequent_itemsets)

Motifs fréquents :
    support itemsets
0  0.051546      (0)
1  0.226804      (1)
2  0.432990      (2)
3  0.134021      (3)
4  0.134021      (4)
5  0.030928      (5)
6  0.010309   (1, 2)




In [5]:
# Extraction des motifs fermés
closed_itemsets = fpmax(binary_df, min_support=0.01, use_colnames=True)

# Afficher les motifs fermés
print("Motifs fermés :")
print(closed_itemsets)

Motifs fermés :
    support itemsets
0  0.030928      (5)
1  0.051546      (0)
2  0.134021      (3)
3  0.134021      (4)
4  0.010309   (1, 2)




In [6]:
# Extraction des règles d'association
association_rules_df = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Afficher les règles d'association
print("Règles d'association les plus confiantes :")
print(association_rules_df.sort_values(by='confidence', ascending=False))

Règles d'association les plus confiantes :
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
Index: []


In [7]:
# Analyse des sous-populations
subpopulation_id = 'ID_DE_LA_SOUS_POPULATION' # Remplacer par l'ID de la sous-population qu'on souhaite analyser
subpopulation = df_cleaned[df_cleaned['reviewerID'] == subpopulation_id]
subpopulation_frequent_itemsets = apriori(subpopulation.drop(['reviewerID', 'asin'], axis=1), min_support=0.01, use_colnames=True)

# Afficher les motifs fréquents pour la sous-population
print("Motifs fréquents pour la sous-population spécifique :")
print(subpopulation_frequent_itemsets)

Motifs fréquents pour la sous-population spécifique :
Empty DataFrame
Columns: [support, itemsets]
Index: []


  out = np.sum(_x, axis=0) / _n_rows


In [8]:
# Motifs de compression des données
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
compressed_data = pca.fit_transform(df_cleaned.drop(['reviewerID', 'asin'], axis=1))

# Afficher les motifs de compression des données
print("Composantes principales de la compression des données :")
print(pca.components_)

Composantes principales de la compression des données :
[[ 0.9999858   0.00532993  0.        ]
 [-0.00532993  0.9999858   0.        ]]
