Projet - Analyse exploratoire

In [None]:
# Chargement des librairies nécessaires
library(ggplot2)
library(tidyverse)
library(gridExtra)
library(GGally)
library(plotly)
library(corrplot)
library(reshape2)
library(FactoMineR) 
library(factoextra)
library(glmnet) 
library(ggfortify)
library(pROC)
library(ROCR)
library(repr)

In [None]:
# Set the default size of the figures rendered by the notebook (repr options).
options(
  repr.plot.width = 15,
  repr.plot.height = 8
)


In [None]:
# Load the dataset.
# `path` is the prefix prepended to the CSV file name; with an empty string
# the file is looked up relative to the current working directory.
# Remote alternative kept for reference:
# path <- "http://www.math.univ-toulouse.fr/~besse/Wikistat/data/"
path <- ""
gym <- read.table(
  paste0(path, "gym_members_exercise_tracking.csv"),
  header = TRUE,
  sep = ","
)
# First rows of the dataset
head(gym)
# Check the content of every column
summary(gym)

In [None]:
# Convert the qualitative columns to factors (the last one, the weekly workout
# frequency, is thereby treated as a categorical variable from here on).
for (qual_col in c("Gender", "Workout_Type", "Experience_Level",
                   "Workout_Frequency..days.week.")) {
  gym[[qual_col]] <- as.factor(gym[[qual_col]])
}
rm(qual_col)

In [None]:
# Summarise the dataset again: the columns converted to factors are now
# reported as level counts.
summary(gym)

In [None]:
# Histograms of every quantitative variable of `gym`, arranged on a 4-column
# grid. Columns of the dataset, for reference:
# Age, Gender, Weight..kg., Height..m., Max_BPM, Avg_BPM, Resting_BPM,
# Session_Duration..hours., Calories_Burned, Workout_Type, Fat_Percentage,
# Water_Intake..liters., Workout_Frequency..days.week., Experience_Level, BMI
options(repr.plot.width = 20, repr.plot.height = 10)

# Build one histogram of `gym` for the given aesthetic mapping.
# Extra arguments (e.g. `bins`) are forwarded to geom_histogram().
hist_plot <- function(mapping, title, xlab, fill = "blue", ...) {
  ggplot(gym, mapping) +
    geom_histogram(fill = fill, color = "black", ...) +
    labs(title = title, x = xlab, y = "Nombre de membres")
}

hist_plots <- list(
  hist_plot(aes(Age),
            "Histogramme de l'âge des membres du gym", "Age"),
  hist_plot(aes(Weight..kg.),
            "Histogramme du poids des membres du gym", "Poids"),
  hist_plot(aes(Height..m.),
            "Histogramme de la taille des membres du gym", "Taille"),
  hist_plot(aes(Max_BPM),
            "Histogramme du BPM max des membres du gym", "BPM max"),
  hist_plot(aes(Avg_BPM),
            "Histogramme du BPM moyen des membres du gym", "BPM moyen"),
  hist_plot(aes(Resting_BPM),
            "Histogramme du BPM de repos des membres du gym", "BPM de repos",
            bins = 15),
  hist_plot(aes(Session_Duration..hours.),
            "Histogramme de la durée des sessions des membres du gym",
            "Durée de session"),
  hist_plot(aes(Calories_Burned),
            "Histogramme des calories brûlées par les membres du gym",
            "Calories brûlées"),
  hist_plot(aes(Fat_Percentage),
            "Histogramme du pourcentage de graisse des membres du gym",
            "Pourcentage de graisse"),
  # Water intake is drawn in red with 10 bins, unlike the other panels
  hist_plot(aes(Water_Intake..liters.),
            "Histogramme de la consommation d'eau des membres du gym",
            "Consommation d'eau",
            fill = "red", bins = 10),
  hist_plot(aes(BMI),
            "Histogramme de l'IMC des membres du gym", "IMC")
)

grid.arrange(grobs = hist_plots, ncol = 4)
# Clean the temporary helper and plot objects out of the workspace
rm(hist_plot, hist_plots)

## Transformation de variables

In [None]:
# Add transformed copies of BMI, weight and fat percentage as new columns
# (S* = square root, L* = natural logarithm) so that their distributions can
# be compared with the raw variables.
gym[["SBMI"]] <- sqrt(gym[["BMI"]])
gym[["LBMI"]] <- log(gym[["BMI"]])
gym[["LWeight"]] <- log(gym[["Weight..kg."]])
gym[["SFat_Percentage"]] <- sqrt(gym[["Fat_Percentage"]])

In [None]:
# Plot the histograms of BMI, Weight..kg. and Fat_Percentage next to their
# transformed versions (square root / logarithm) to compare the shapes of
# the distributions.
options(repr.plot.width = 20, repr.plot.height = 10)
g1 <- ggplot(gym, aes(BMI)) +
  geom_histogram(fill = "blue", color = "black") +
  labs(title = "Histogramme de l'IMC des membres du gym",
       x = "IMC", y = "Nombre de membres")
g2 <- ggplot(gym, aes(SBMI)) +
  geom_histogram(fill = "blue", color = "black") +
  labs(title = "Histogramme de la racine carrée de l'IMC des membres du gym",
       x = "Racine carrée de l'IMC", y = "Nombre de membres")
g3 <- ggplot(gym, aes(LBMI)) +
  geom_histogram(fill = "blue", color = "black") +
  labs(title = "Histogramme du logarithme de l'IMC des membres du gym",
       x = "Logarithme de l'IMC", y = "Nombre de membres")
g4 <- ggplot(gym, aes(Weight..kg.)) +
  geom_histogram(fill = "blue", color = "black") +
  labs(title = "Histogramme du poids des membres du gym",
       x = "Poids", y = "Nombre de membres")
# LWeight is the logarithm of the weight: the title previously announced a
# square root, contradicting both the variable and the x-axis label.
g5 <- ggplot(gym, aes(LWeight)) +
  geom_histogram(fill = "blue", color = "black") +
  labs(title = "Histogramme du logarithme du poids des membres du gym",
       x = "Logarithme du poids", y = "Nombre de membres")
g6 <- ggplot(gym, aes(Fat_Percentage)) +
  geom_histogram(fill = "blue", color = "black") +
  labs(title = "Histogramme du pourcentage de graisse des membres du gym",
       x = "Pourcentage de graisse", y = "Nombre de membres")
g7 <- ggplot(gym, aes(SFat_Percentage)) +
  geom_histogram(fill = "blue", color = "black") +
  labs(
    title = "Histogramme de la racine carrée du pourcentage de graisse des membres du gym",
    x = "Racine carrée du pourcentage de graisse",
    y = "Nombre de membres"
  )

grid.arrange(g1, g2, g3, g4, g5, g6, g7, ncol = 4)
# Remove the temporary plot objects from the workspace
rm(g1, g2, g3, g4, g5, g6, g7)


In [None]:
# Keep the log-transformed weight and BMI in place of the raw variables.
#
# The exploratory helper columns are dropped first, and by name: dropping
# them by position after the renaming (previously `-c(16:20)`, although only
# columns 16 to 19 exist) relied on an out-of-range index being silently
# ignored and left duplicated "LWeight"/"LBMI" names in between.
gym <- gym[, setdiff(names(gym),
                     c("SBMI", "LBMI", "LWeight", "SFat_Percentage")),
           drop = FALSE]

# Replace the raw values by their natural logarithm, in their original
# column positions
gym[, "Weight..kg."] <- log(gym[, "Weight..kg."])
gym[, "BMI"] <- log(gym[, "BMI"])

# Rename Weight..kg. and BMI to LWeight and LBMI to reflect the transformation
names(gym)[names(gym) == "Weight..kg."] <- "LWeight"
names(gym)[names(gym) == "BMI"] <- "LBMI"

In [None]:
# Summarise the dataset after the log transformation, renaming and column
# clean-up performed just before.
summary(gym)

## Variables qualitatives

In [None]:
# Bar charts of the qualitative variables, arranged on a 2-column grid.

# Build one bar chart of `gym` for the given aesthetic mapping.
bar_plot <- function(mapping, title, xlab) {
  ggplot(gym, mapping) +
    geom_bar(fill = "lightblue", color = "darkblue") +
    labs(title = title, x = xlab, y = "Effectif")
}

bar_plots <- list(
  bar_plot(aes(x = Gender),
           "Barplot Genre", "Genre"),
  bar_plot(aes(x = Workout_Type),
           "Barplot Type d'entraînement", "Type d'entraînement"),
  bar_plot(aes(x = Experience_Level),
           "Barplot Niveau d'expérience", "Niveau d'expérience"),
  bar_plot(aes(x = Workout_Frequency..days.week.),
           "Barplot Fréquence d'entraînement", "Jours par semaine")
)

grid.arrange(grobs = bar_plots, ncol = 2)

# Clean the temporary helper and plot objects out of the workspace
rm(bar_plot, bar_plots)


In [None]:
# Mosaic plots crossing Gender with each of the other qualitative variables.
# Names of `mosaic_targets` are the crossed columns, values the plot titles.
mosaic_palette <- c("lightblue", "gold")
mosaic_targets <- c(
  "Workout_Type" = "Mosaic plot Genre x Type d'entraînement",
  "Experience_Level" = "Mosaic plot Genre x Niveau d'expérience",
  "Workout_Frequency..days.week." = "Mosaic plot Genre x Fréquence d'entraînement"
)

for (target in names(mosaic_targets)) {
  mosaicplot(
    table(gym[["Gender"]], gym[[target]]),
    main = mosaic_targets[[target]],
    color = mosaic_palette
  )
}
rm(mosaic_palette, mosaic_targets, target)


In [None]:
# Select the quantitative variables.
# select() is namespaced explicitly: the LDA cell attaches MASS, whose own
# select() masks dplyr's, so re-running this cell afterwards would fail.
gym_quanti <- gym %>%
  dplyr::select(
    Age, LWeight, Height..m., Max_BPM, Avg_BPM, Resting_BPM,
    Session_Duration..hours., Calories_Burned, Fat_Percentage,
    Water_Intake..liters., LBMI
  )

In [None]:
# ggpairs(gym_quanti)

In [None]:
# Correlation matrix of the quantitative variables, drawn as ellipses
quanti_cor <- cor(gym_quanti)
corrplot(quanti_cor, method = "ellipse")

## ACP

In [None]:
# Scaled PCA (scale.unit = TRUE) on the whole `gym` table, keeping 5
# components. The qualitative variables and Age are supplementary.
#
# Earlier variant on a column subset, kept for reference:
# acp <- PCA(gym[, c(1, 3:9, 11:12, 15)], scale.unit = TRUE, graph = FALSE,
#            quali.sup = 2, quanti.sup = 1, ncp = 5)

# Resolve the supplementary variables by name rather than by hard-coded
# position, so the analysis does not silently pick the wrong columns if the
# column layout of `gym` changes.
quali_sup_idx <- match(
  c("Gender", "Workout_Type", "Workout_Frequency..days.week.",
    "Experience_Level"),
  names(gym)
)
quanti_sup_idx <- match("Age", names(gym))
stopifnot(!anyNA(c(quali_sup_idx, quanti_sup_idx)))

acp <- PCA(gym, scale.unit = TRUE, graph = FALSE,
           quali.sup = quali_sup_idx, quanti.sup = quanti_sup_idx, ncp = 5)

# Decay of the eigenvalues (scree plot) next to the distribution of the
# individuals' coordinates on each retained dimension
g1 <- fviz_eig(acp, addlabels = TRUE)
g2 <- ggplot(melt(acp$ind$coord), aes(x = Var2, y = value)) +
  geom_boxplot() +
  xlab("")
grid.arrange(g1, g2, ncol = 2)

# Correlations between the active variables and the dimensions
corrplot(acp$var$cor, is.corr = TRUE, method = "ellipse")

In [None]:
# Variable and individual maps of the PCA on planes (1, 2) and (1, 3).
# Individuals are coloured by their contribution to the plane.
fviz_pca_var(acp)
# `label = "true"` is not a documented value of the `label` argument ("all",
# "none" or element names); "none" is stated explicitly, consistent with the
# map on axes (1, 3) below.
fviz_pca_ind(acp, col.ind = "contrib", label = "none",
             gradient.cols = c("white", "deepskyblue3", "red"))
fviz_pca_var(acp, axes = c(1, 3))
fviz_pca_ind(acp, col.ind = "contrib", label = "none",
             gradient.cols = c("white", "#2E9FDF", "#FC4E07"),
             axes = c(1, 3))

In [None]:
# Individuals on the first three factorial planes, coloured by the
# qualitative variable stored in column 14 of the PCA input table.
for (axes_pair in list(c(1, 2), c(1, 3), c(2, 3))) {
  # Explicit print(): plots are not auto-displayed inside a loop
  print(fviz_pca_ind(acp, label = "none", habillage = 14, axes = axes_pair))
}
rm(axes_pair)

In [None]:
# Linear Discriminant Analysis with Experience_Level as the target variable.
# NOTE: attaching MASS masks dplyr::select() for the rest of the session.
library(MASS)

# Modelling table: the quantitative variables plus the target variable
gym_lda_data <- gym_quanti
gym_lda_data$Experience_Level <- gym$Experience_Level

# Fit the LDA on all the quantitative predictors
lda_result <- lda(Experience_Level ~ ., data = gym_lda_data)

# Coefficients of the linear discriminants
lda_result$scaling

# Base-graphics view of the fitted model
plot_lda <- plot(lda_result)

# Discriminant scores of the training individuals, for a ggplot2 rendering
lda_pred <- predict(lda_result, gym_lda_data)
lda_df <- data.frame(
  LD1 = lda_pred$x[, "LD1"],
  LD2 = lda_pred$x[, "LD2"],
  Experience_Level = gym_lda_data$Experience_Level
)

# Scatter plot of the two discriminant axes with one ellipse per level
ggplot(
  data = lda_df,
  mapping = aes(x = LD1, y = LD2, color = Experience_Level)
) +
  geom_point(size = 3, alpha = 0.7) +
  stat_ellipse() +
  labs(
    title = "LDA: Discrimination par niveau d'expérience",
    x = "Première discriminante",
    y = "Deuxième discriminante"
  ) +
  theme_minimal()
