# Importation des bibliothèques

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import random
import time
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timezone


# Récupération des données

In [2]:
# Load the scraped Instagram posts. The file has positional column names:
# '0' = ISO timestamp, '1' = like count (string with thousands separators),
# '2' = caption, '3' = post URL.
# NOTE(review): the CSV also carries a saved row index, which is loaded as an
# 'Unnamed: 0' column and travels through to the final export -- confirm whether
# it should be read with index_col=0 instead.
raw_posts_path = "./insta_v0.csv"
post_df = pd.read_csv(raw_posts_path)
post_df

Unnamed: 0,0,1,2,3
0,2024-04-22T16:00:10.000Z,140 980,“There’s something so beautiful but also overw...,https://www.instagram.com/reel/C6Eb48bxXDM/
1,2024-04-19T21:01:55.000Z,225 031,It’s a wrap.⁣\n⁣\nOn today’s #WeeklyFluff meet...,https://www.instagram.com/reel/C59RS6gyQxY/
2,2024-04-19T16:30:34.000Z,102 832,Meta AI = your virtual assistant right here on...,https://www.instagram.com/p/C58yfeWxpdr/
3,2024-04-18T16:30:09.000Z,106 756,What’s next for @jmccain24 (Jared McCain)? 👀\n...,https://www.instagram.com/reel/C56MpyPrO_3/
4,2024-04-17T16:00:41.000Z,177 924,Friendship 😂⁣\n⁣\n#InTheMoment ⁣\n⁣\nVideo by ...,https://www.instagram.com/reel/C53lOLUxA1L/
...,...,...,...,...
440,2022-09-08T15:56:57.000Z,415 399,“Jonas the homies fr”\n\n#InTheMoment\n\nReel ...,https://www.instagram.com/reel/CiQGV6ujq2l/
441,2022-09-07T15:55:50.000Z,439 686,“I’m always thinking of new ways to style my h...,https://www.instagram.com/reel/CiNhXBBD1MV/
442,2022-09-06T16:06:07.000Z,447 884,“Emotionally naked and visually embracing” — t...,https://www.instagram.com/p/CiK95P5Pqw2/
443,2022-09-02T15:55:43.000Z,845 677,Just checking my socials 🐦💙\n\nMeet @lovebirds...,https://www.instagram.com/reel/CiAoyr-jg7j/


In [3]:
def preprocess_text(text):
    """Normalise a caption into space-separated word, hashtag and mention tokens.

    Hashtags (``#tag``) and mentions (``@user``) keep their leading symbol so
    they stay distinguishable from plain words. Tokens of the form
    ``word@word`` / ``word#word`` are kept glued together. Any other
    punctuation, emoji or whitespace acts as a separator and is dropped.

    Args:
        text: Raw caption. A value that is not a string (for example the NaN
            pandas uses for a missing caption) is treated as an empty caption.

    Returns:
        str: The extracted tokens joined by single spaces, or '' when there is
        nothing to extract.
    """
    if not isinstance(text, str):
        return ''
    # Order matters: the alternatives carrying '#'/'@' must be tried before the
    # bare `\w+`, otherwise the word alone would match and the symbol be lost.
    tokens = re.findall(r'\b\w+@\w+|\b\w+#\w+|[#@]\w+|\w+', text)
    # Re-join the tokens into a single preprocessed sentence.
    return ' '.join(tokens)

In [4]:

# Column '2' holds the caption text of each post.
captions = post_df['2']

# Simple engagement hints: how many '#' and '@' characters each caption contains.
post_df['hash_count'] = captions.map(lambda caption: caption.count('#'))
post_df['at_count'] = captions.map(lambda caption: caption.count('@'))

# Cleaned caption text that feeds the TF-IDF vectoriser.
bio = captions.map(preprocess_text)

In [5]:
# Number of principal components kept from the caption TF-IDF matrix.
N_COMPONENTS = 10
# Fixed seed: depending on the matrix shape PCA may select a randomized SVD
# solver, in which case the components would otherwise vary between runs.
PCA_SEED = 42

# 1. TF-IDF vectorisation of the preprocessed captions (returns a sparse matrix).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(bio)

# 2. Scale each term to unit variance. with_mean=False skips centering, which
#    is what allows the scaler to operate on the sparse matrix.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# 3. PCA down to N_COMPONENTS dimensions. The matrix is densified here because
#    the original pipeline feeds PCA a dense array.
pca = PCA(n_components=N_COMPONENTS, random_state=PCA_SEED)
X_pca = pca.fit_transform(X_scaled.toarray())

# 4. Wrap the projection in a DataFrame with columns PC1..PCn. The column names
#    are derived from the actual output width and the index is taken from the
#    captions, so the frame stays row-aligned with the posts it came from.
pca_df = pd.DataFrame(
    X_pca,
    columns=[f'PC{i + 1}' for i in range(X_pca.shape[1])],
    index=bio.index,
)


In [6]:
# Replace the raw caption column ('2') by its PCA components; the two frames are
# placed side by side and matched on their row index.
post_features = post_df.drop(columns=['2'])
final_df = pd.concat([post_features, pca_df], axis=1)

In [7]:
# Give the positional CSV columns meaningful names.
readable_names = {'0': 'date', '1': 'likes', '3': 'link'}
final_df = final_df.rename(columns=readable_names)

In [8]:
# Helper that cleans a formatted number string and converts it to an integer.
def convert_to_int(number_str):
    """Convert a like count written with thousands separators into an int.

    Any whitespace used as a digit-group separator is removed before parsing:
    the narrow no-break space (U+202F) as well as the no-break space (U+00A0),
    a regular space, tabs, etc.

    Args:
        number_str: String such as '140\u202f980' or '42'.

    Returns:
        int: The parsed value.

    Raises:
        ValueError: If what remains after removing whitespace is not a valid
            integer literal.
    """
    # str.split() without arguments splits on every Unicode whitespace
    # character, so joining the pieces strips all separators in one pass.
    return int(''.join(number_str.split()))

# Parse every formatted like count in the column into an integer.
likes_as_int = final_df['likes'].map(convert_to_int)
final_df['likes'] = likes_as_int


In [9]:
# Helper that pulls out the piece of a URL located after its third slash.
def extract_segment(url):
    """Return the fourth '/'-separated piece of ``url``.

    For a post URL such as 'https://www.instagram.com/reel/<id>/' this is the
    first path component ('reel' or 'p').

    Raises:
        IndexError: If ``url`` contains fewer than three slashes.
    """
    # Only the first four pieces matter, so stop splitting after them.
    pieces = url.split('/', 4)
    return pieces[3]

# Reduce every post URL to its first path component.
link_segments = final_df['link'].map(extract_segment)
final_df['link'] = link_segments


In [10]:
# The column now holds the URL's path segment rather than a link, so rename it.
final_df = final_df.rename(columns={'link': 'type'})

In [11]:
# Reference instant used for `days_since_now`. None means "the moment this cell
# runs" (UTC), which makes the column change from one execution to the next;
# assign a timezone-aware datetime here to obtain a reproducible export.
REFERENCE_DATE = None

# Work on a copy: a plain `df = final_df` would only alias the same DataFrame,
# and the column assignments below would then silently modify `final_df` too.
df = final_df.copy()

# Make sure the date column is a timezone-aware (UTC) datetime.
df['date'] = pd.to_datetime(df['date'], utc=True)

# Calendar components of the publication time.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour

# Day of the week (0 = Monday, 6 = Sunday).
df['day_of_week'] = df['date'].dt.dayofweek

# Age of each post in whole days relative to the reference instant.
current_date = REFERENCE_DATE if REFERENCE_DATE is not None else datetime.now(timezone.utc)
df['days_since_now'] = (current_date - df['date']).dt.days

# The raw timestamp is dropped once the derived columns exist.
df = df.drop(columns=['date'])



In [12]:
# One-hot encode the post type with get_dummies: one indicator column per
# distinct value of 'type'.
categorical_columns = ['type']
df_encoded = pd.get_dummies(data=df, columns=categorical_columns)


In [13]:

# Display the encoded feature table as the cell output (pandas abbreviates the
# middle rows and columns of a large frame with '...').
df_encoded

Unnamed: 0,likes,hash_count,at_count,PC1,PC2,PC3,PC4,PC5,PC6,PC7,...,PC9,PC10,year,month,day,hour,day_of_week,days_since_now,type_p,type_reel
0,140980,1,4,6.219253,-3.943752,2.590930,2.015010,2.413311,1.728548,5.236998,...,1.810300,-3.601024,2024,4,22,16,0,6,False,True
1,225031,1,2,-5.024875,2.254851,-0.761831,-0.588505,0.349276,0.698657,-0.409638,...,-0.152003,-0.268175,2024,4,19,21,4,9,False,True
2,102832,0,1,-3.331946,1.051212,-0.326578,-0.143705,0.432849,0.330345,-0.838317,...,-0.178836,-0.200524,2024,4,19,16,4,9,True,False
3,106756,0,2,-4.108697,1.868776,-0.651597,-0.468464,0.222504,0.206503,-0.284422,...,-0.181601,-0.174339,2024,4,18,16,3,10,False,True
4,177924,1,2,-5.007556,2.363539,-0.810819,-0.601362,0.298395,0.633021,-0.430371,...,-0.131911,-0.232226,2024,4,17,16,2,11,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,415399,1,2,-5.004927,2.415484,-0.838316,-0.661715,0.294735,0.652973,-0.473161,...,-0.168358,-0.252898,2022,9,8,15,3,598,False,True
441,439686,0,2,1.358718,-0.476923,-2.009674,-0.657903,-0.351051,-0.638306,0.008186,...,-0.838041,1.200214,2022,9,7,15,2,599,False,True
442,447884,0,2,17.136437,-6.953736,4.390246,-0.921999,-2.002715,-10.678643,-1.480543,...,-2.755001,17.146770,2022,9,6,16,1,600,True,False
443,845677,1,2,-4.390699,2.060700,-0.561232,-0.630244,0.573968,0.348883,-0.002253,...,-0.431710,-0.606796,2022,9,2,15,4,604,False,True


In [15]:
# Write the model-ready table to disk; index=False keeps the DataFrame's row
# index out of the file.
cleaned_csv_path = 'cleaned_data.csv'
df_encoded.to_csv(cleaned_csv_path, index=False)