Cellule utilisée afin de vider le cache de viariables du notebook jupyter

In [1]:
%reset

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pymongo
import numpy as np 
from joblib import Parallel, delayed

# Importation des données

In [3]:
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["Tweet"]
user_collection = db["users"]

## Suppression des colonnes non utilisées lors du kmeans
On retire toutes les colonnes qui ne sont pas des attributs de nos utilisateurs.  
De plus supprime l'attribut tweet_frequency de nos données sachant qu'il est corrélé avec l'agressivité. Ce paramètre n'apporte pas d'informations supplémentaires. Une visualisation de cette corrélation peut être observée dans le script ``previewData``

In [4]:
users = list(user_collection.find({}))
users = pd.DataFrame(users)
users.index = users.user_id
users = users.drop(columns=["_id","user_id","last_tweet_published_id","tweet_ids","friends_count","followers_count","tweet_frequency"])
users.head()

Unnamed: 0_level_0,hashtag_frequency,verified,statuses_count,favourites_count,age_account,ratio_friends_followers,avg_tweet_length,nb_sensitive_tweets,Ap,visibility,ratio_sensitive_tweets,ratio_punctuation_tweets,avg_tweet_levenshtein_similarity
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
39084553,0.08,0,0.003611,0.010133,0.187854,0.000835,0.321867,0.0,0.003526,0.190383,0.0,0.069565,0.0
1564678657,0.0,0,0.003194,0.008968,0.102295,0.000587,0.103194,0.0,0.005729,0.0,0.0,0.158397,0.0
4872447178,0.07,0,0.00058,8.4e-05,0.048922,0.000718,0.309582,0.0,0.002178,0.157138,0.0,0.046377,0.276906
967244973824339968,0.0,0,0.000475,0.001242,0.006415,0.000771,0.068796,0.0,0.013588,0.03779,0.0,0.214367,0.10426
35628099,0.08,0,5.5e-05,0.000654,0.188627,0.003186,0.321867,0.0,5.3e-05,0.265963,0.0,0.069565,0.0


Function utilisée pour calculer le score de silhouette via des calculs en  parralèle afin de diminuer le temps de calcul.

In [39]:
def multi_core_kmeans_with_silhouette(data, cores=6):
    # Fit the KMeans model on the training data
    model = KMeans(n_clusters=2, n_init="auto")
    model.fit(data)

    # Define the number of subsets to create and the number of workers
    num_subsets = cores
    num_workers = cores  # Number of CPU cores

    # Get the indices of the training data
    train_indices = data.index

    # Calculate the size of each subset
    subset_size = len(train_indices) // num_subsets

    # Split the indices into subsets
    subsets = [train_indices[i * subset_size:(i + 1) * subset_size] for i in range(num_subsets)]

    # Define the function to compute silhouette score on a subset of data
    def compute_silhouette_score(data_subset):
        return silhouette_score(data.loc[data_subset], model.labels_[data_subset])

    # Initialize the parallelization framework
    pool = Parallel(n_jobs=num_workers)

    # Compute silhouette scores in parallel
    results = pool(delayed(compute_silhouette_score)(subset) for subset in subsets)

    # Aggregate the results
    return model, sum(results) / len(results)

# Clustering avec KMeans

In [7]:
model, score = multi_core_kmeans_with_silouhette(users)

print(f"Inertie :{model.inertia_:.2f}, score : {score:.2f}")

Inertie :80779.44, score : 0.44


In [8]:
users["label"] = model.labels_
users["label"].value_counts()

1    1264422
0     579017
Name: label, dtype: int64

In [9]:
users.groupby(model.labels_).mean()

Unnamed: 0,hashtag_frequency,verified,statuses_count,favourites_count,age_account,ratio_friends_followers,avg_tweet_length,nb_sensitive_tweets,Ap,visibility,ratio_sensitive_tweets,ratio_punctuation_tweets,avg_tweet_levenshtein_similarity,label
0,0.064915,0.013874,0.002735,0.006223,0.099555,0.001181,0.250444,0.00029,0.005998,0.161775,0.009884,0.076626,0.328918,0.0
1,0.057054,0.007947,0.001641,0.004575,0.097376,0.001288,0.239764,5.1e-05,0.00356,0.150426,0.009093,0.078604,0.004302,1.0


# Test des attributs

Generer toutes les combinations disponibles depuis nos attributs

In [42]:
import itertools

def generate_combinations(arr):
    combinations = []
    for r in range(6, len(arr) + 1):
        combinations.extend([list(c) for c in itertools.combinations(arr, r)])

    return combinations

attributs_combinations = generate_combinations(users.columns)
print(len(attributs_combinations), "combinaisons générées")


12911 combinaisons générées


In [41]:
%%time
users_train,_ = train_test_split(users,random_state=10,test_size=0.98)

results = {}
i = 1

for attributs in attributs_combinations:
    users_reduced = users_train[attributs]
    model,silhouette= multi_core_kmeans_with_silouhette(users_reduced)
    if silhouette > 0.8:
        results["-".join(attributs)] = {"inertie" : model.inertia_ , "silhouette_score" : silhouette}
    if i %100 == 0:
        print(i," of ",len(attributs_combinations)," combination done")
    i+=1

print(results)


100  of  15914  combination done
200  of  15914  combination done
300  of  15914  combination done
400  of  15914  combination done
500  of  15914  combination done
600  of  15914  combination done
700  of  15914  combination done
800  of  15914  combination done
900  of  15914  combination done
1000  of  15914  combination done
1100  of  15914  combination done
1200  of  15914  combination done
1300  of  15914  combination done
1400  of  15914  combination done
1500  of  15914  combination done
1600  of  15914  combination done
1700  of  15914  combination done
1800  of  15914  combination done
1900  of  15914  combination done
2000  of  15914  combination done
2100  of  15914  combination done
2200  of  15914  combination done
2300  of  15914  combination done
2400  of  15914  combination done
2500  of  15914  combination done
2600  of  15914  combination done
2700  of  15914  combination done
2800  of  15914  combination done
2900  of  15914  combination done
3000  of  15914  combin

KeyboardInterrupt: 

In [None]:
sorted_keys = sorted(results, key=lambda k: results[k]['silhouette_score'], reverse=True)

real_scores = {}
for idx,key in enumerate(sorted_keys):
    if idx > 10:
        break
    model,silhouette = multi_core_kmeans_with_silouhette(users[key.split("-")])
    real_scores[key] = {"model":model, "silhouette":silhouette}


In [35]:
print(real_scores)

{'hashtag_frequency-verified-statuses_count-favourites_count-age_account-ratio_friends_followers-nb_sensitive_tweets-Ap-ratio_sensitive_tweets-ratio_punctuation_tweets-avg_tweet_levenshtein_similarity-label': {'model': KMeans(n_clusters=2, n_init='auto'), 'silhouette': 0.825527420931964}, 'hashtag_frequency-verified-statuses_count-favourites_count-ratio_friends_followers-avg_tweet_length-nb_sensitive_tweets-Ap-ratio_sensitive_tweets-ratio_punctuation_tweets-avg_tweet_levenshtein_similarity-label': {'model': KMeans(n_clusters=2, n_init='auto'), 'silhouette': 0.8222543222150568}, 'verified-statuses_count-favourites_count-age_account-ratio_friends_followers-avg_tweet_length-nb_sensitive_tweets-Ap-ratio_sensitive_tweets-ratio_punctuation_tweets-avg_tweet_levenshtein_similarity-label': {'model': KMeans(n_clusters=2, n_init='auto'), 'silhouette': 0.8203785368272959}, 'hashtag_frequency-statuses_count-favourites_count-age_account-ratio_friends_followers-avg_tweet_length-nb_sensitive_tweets-Ap

In [None]:
original_keys = {}

for key in sorted_keys:
    components = key.split('-')
    for component in components:
        if component in original_keys.keys():
            original_keys[component] +=1
        else:
            original_keys[component] = 1


original_keys = sorted(original_keys.items(), key=lambda x: x[1],reverse=True)

print("Sorted dictionary by values:")
for key, value in original_keys:
    print(f"{key}: {value}")

In [10]:
users_reduced = users[["verified","statuses_count","favourites_count","ratio_friends_followers","nb_sensitive_tweets","Ap","ratio_sensitive_tweets","ratio_punctuation_tweets"]]

model, score = multi_core_kmeans_with_silouhette(users_reduced)

print(f"Inertie :{model.inertia_:.2f}, score : {score:.2f}")

Inertie :19144.81, score : 0.93


# Exemple pour le moment, à modifier avec le cours ACP

# Mise en place de l'ACP pour representer les resultats du clustering

In [None]:
acp = PCA(n_components=2)

In [None]:
result =acp.fit_transform(X_CR)

In [None]:
resultat = pd.DataFrame(result, columns = ["CP1","CP2"])
resultat.head()

# Presentation des clusters

In [None]:
# Create scatter plot of `PC2` vs `PC1`
fig = px.scatter(
    data_frame = resultat,
    x= "CP1",
    y = "CP2",
    color = labels,
    title = "PCA Representation of Clusters"
)
fig.update_layout(xaxis_title = "Cluster", yaxis_title ="Value" )
fig.show()