In [1]:
# Import des librairies 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

# plot en 3D 
import plotly.graph_objs as go
import plotly.express as px

# preprocessing data
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from gensim.models import Word2Vec

# clustering models
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram

# metrics
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples

ModuleNotFoundError: No module named 'plotly'

In [None]:
# Load the job-postings dataset from the local CSV file and preview the first rows
df = pd.read_csv('data/gsearch_jobs.csv')
df.head()

## EDA

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.isnull().sum()

### NaN

In [None]:
def count_nan(df):
    """Summarise the missing values of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: 'NaN Count' (number of
        missing values) and 'NaN Percentage' (that count as a percentage of
        the number of rows).
    """
    missing = df.isna().sum()
    share = missing / len(df) * 100
    # `keys` names the two resulting columns directly
    return pd.concat([missing, share], axis=1, keys=['NaN Count', 'NaN Percentage'])

In [None]:
# NaN summary, columns with the most missing values first
df_NaN = count_nan(df).sort_values(by='NaN Count', ascending=False)
# Optional: uncomment to hide the columns that have no NaN at all
# df_NaN = df_NaN.loc[df_NaN['NaN Count'] != 0]
df_NaN

In [None]:
# Drop the columns that contain too many NaN values
def no_NaN(df, treshold):
    """Keep only the columns whose NaN percentage is strictly below ``treshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    treshold : float
        Maximum (exclusive) percentage of NaN, on a 0-100 scale.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the retained columns, in their original order.
    """
    nan_share = df.isna().sum() / len(df) * 100
    kept_columns = nan_share.index[nan_share.values < treshold]
    return df[kept_columns]

In [None]:
# Keep the columns with less than 50 % NaN, then drop every row that still has one
lin = no_NaN(df, 50).dropna()
# Sanity check: every count displayed below should be 0
lin.isnull().sum()

In [None]:
lin.shape

In [None]:
# Work on a seeded random sample of at most 2000 rows. Capping n at len(lin)
# avoids the ValueError pandas raises when asked for more rows than exist
# (possible here because rows containing NaN were dropped just before).
lin = lin.sample(n=min(2000, len(lin)), random_state=42)

In [None]:
lin.shape

### duplicates

In [None]:
lin.duplicated().sum()

### Preprocessing

L'objectif ici est de préparer les données pour notre modèle de ML 

In [None]:
lin.head()

In [None]:
lin["description"].unique()[1]

In [None]:
lin["extensions"].unique()[1]

In [None]:
lin["description_tokens"].unique()[1]

Ces trois colonnes fournissent des informations importantes pour notre futur modèle, mais il faut maintenant les préparer.

In [None]:
# Take an explicit copy of the three text columns: new columns are assigned to
# df_prep in the following cells, and assigning to a slice of `lin` would
# trigger pandas' SettingWithCopyWarning.
df_prep = lin[["description", "extensions", "description_tokens"]].copy()

In [None]:
for i in ["description", "extensions", "description_tokens"]:
    print(f"**********{i}************\n, {df_prep[i].unique()[1]}")

In [None]:
# clean text 
def clean_text(text):
    """Normalise a raw string for tokenisation.

    The text is lower-cased, every run of non-word characters is replaced by a
    single space, digits are removed, and whitespace is collapsed and trimmed.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Applied in order: (pattern, replacement)
    substitutions = (
        (r'\W+', ' '),   # punctuation / symbols -> space
        (r'\d+', ''),    # drop digits
        (r'\s+', ' '),   # collapse whitespace left behind by the previous steps
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
# Clean the three text columns; each result is stored in a new "<column>_clean" column
for column in ("description", "extensions", "description_tokens"):
    df_prep[f"{column}_clean"] = df_prep[column].apply(clean_text)

In [None]:
df_prep.head()

In [None]:
for i in ["description_clean", "extensions_clean", "description_tokens_clean"]:
    print(f"**********{i}************\n, {df_prep[i].unique()[1]}")

In [None]:
# Tokenize the cleaned text and remove English stop words.
# Recent NLTK releases load the word tokenizer from the 'punkt_tab' resource
# rather than the older 'punkt' one, so both are requested to keep this cell
# working across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))


def tokenize_without_stopwords(text):
    """Split ``text`` into word tokens and drop the English stop words."""
    return [token for token in nltk.word_tokenize(text) if token not in stop_words]


# target column -> cleaned source column.
# Note: "description_tokens" overwrites the column of the same name that came
# from the dataset; "description_tokens_clean" was derived from it beforehand.
token_columns = {
    "description_tokens": "description_clean",
    "extensions_tokens": "extensions_clean",
    "description_tokens_tokens": "description_tokens_clean",
}
for target, source in token_columns.items():
    df_prep[target] = df_prep[source].apply(tokenize_without_stopwords)

In [None]:
df_prep.head()

In [None]:
df_prep.dtypes

In [None]:
for i in ["description_tokens", "extensions_tokens", "description_tokens_tokens"]:
    print(f"**********{i}************\n, {list(df_prep[i].apply(tuple).unique().tolist()[1])}")

In [None]:
# Single WordNet lemmatizer instance shared by every call below
lemmatizer = WordNetLemmatizer()


def lemmatize_text(tokens):
    """Return the list of lemmas of ``tokens``, in the same order.

    Parameters
    ----------
    tokens : iterable of str
        Word tokens to lemmatize.

    Returns
    -------
    list of str
        ``lemmatizer.lemmatize(token)`` for each token.
    """
    return list(map(lemmatizer.lemmatize, tokens))

In [None]:
# Corpora requested before lemmatizing
for corpus in ('wordnet', 'omw-1.4'):
    nltk.download(corpus)

# Lemmatize each token column into a new "<column>_lemmatized" column
for column in ("description_tokens", "extensions_tokens", "description_tokens_tokens"):
    df_prep[f"{column}_lemmatized"] = df_prep[column].apply(lemmatize_text)

In [None]:
df_prep.shape

In [None]:
# Word cloud of the most frequent lemmas in the job descriptions
all_words = []
for tokens in df_prep["description_tokens_lemmatized"]:
    all_words.extend(tokens)

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
)
wordcloud.generate(' '.join(all_words))  # generate() fills the instance in place

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the most frequent lemmas in the "extensions" column
all_words = []
for tokens in df_prep["extensions_tokens_lemmatized"]:
    all_words.extend(tokens)

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
)
wordcloud.generate(' '.join(all_words))  # generate() fills the instance in place

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the most frequent lemmas in the dataset's own "description_tokens" column
all_words = []
for tokens in df_prep["description_tokens_tokens_lemmatized"]:
    all_words.extend(tokens)

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
)
wordcloud.generate(' '.join(all_words))  # generate() fills the instance in place

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
df_prep.head()

In [None]:
# TF-IDF representation of the cleaned descriptions, converted to a dense array
tfidf = TfidfVectorizer()
embeddings_description = tfidf.fit_transform(df_prep["description_clean"]).toarray()

In [None]:
embeddings_description.shape

In [None]:
# Re-fit the same vectorizer on the cleaned extensions (this replaces its previous vocabulary)
embeddings_extensions = tfidf.fit_transform(df_prep["extensions_clean"]).toarray()

In [None]:
# Re-fit the same vectorizer on the cleaned description tokens (this replaces its previous vocabulary)
embeddings_description_tokens = tfidf.fit_transform(df_prep["description_tokens_clean"]).toarray()

In [None]:
# Concatenate the description and extensions TF-IDF matrices column-wise.
# numpy is already imported as `np` in the first cell, so the redundant
# mid-notebook import was removed.
all_embeddings_v1 = np.concatenate([embeddings_description, embeddings_extensions], axis=1)

In [None]:
# Append the description-token TF-IDF columns to the right of the previous matrix
all_embeddings = np.hstack((all_embeddings_v1, embeddings_description_tokens))

In [None]:
all_embeddings.shape

In [None]:
# Train a Word2Vec model on the tokenized descriptions.
# gensim 4 renamed the `size` argument to `vector_size` and replaced
# `wv.vocab` with `wv.index_to_key`; the gensim 3 spellings fail on current
# releases. A seed is set for repeatability (fully deterministic training
# would additionally require workers=1).
embedding_train = Word2Vec(
    sentences=df_prep["description_tokens"].tolist(),
    vector_size=20,
    min_count=1,
    seed=42,
)

# One 20-dimensional vector per vocabulary word, in the model's vocabulary order.
# NOTE(review): each row is a *word*, not a job posting, so everything
# downstream (t-SNE, clustering) groups words — confirm this is intended.
embeddings = [embedding_train.wv[word] for word in embedding_train.wv.index_to_key]

# Create a dataframe with the embeddings (one row per word)
embeddings_df = pd.DataFrame(embeddings)

In [None]:
embeddings_df

### Réduction de dimension

In [None]:
# Project the embeddings onto 3 components with t-SNE (fixed random_state seed)
tsne = TSNE(n_components=3, random_state=42)
embeddings_tsne = tsne.fit_transform(embeddings_df)

In [None]:
embeddings_tsne

### préparation du dataframe

In [None]:
lin.columns

In [None]:
tsne_df = pd.DataFrame(embeddings_tsne, columns=['Component 1', 'Component 2', 'Component 3'])

In [None]:
# sns.lmplot(x='Component 1',
#            y='Component 2',
#            data=tsne_df,
#            fit_reg=False,
#            legend=True,
#            height=9,
# #            hue='Label',
#            scatter_kws={"s":200, "alpha":0.3})

# plt.title('t-SNE Results: Digits', weight='bold').set_fontsize(6)
# plt.xlabel('Component 1')
# plt.ylabel('Component 2')
# plt.show()

In [None]:
# Interactive 3D scatter of the three t-SNE components
trace = go.Scatter3d(
    x=tsne_df['Component 1'],
    y=tsne_df['Component 2'],
    z=tsne_df['Component 3'],
    mode='markers',
    marker={'size': 5, 'colorscale': 'Viridis', 'opacity': 0.8},
)

fig = go.Figure(data=[trace])
# Remove the outer margins so the plot fills the output area
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

Conclusion du t-SNE : les composantes obtenues ne sont pas très explicables.

### Clustering : 1st model : Agglomerative Clustering

In [None]:
# Complete-linkage agglomerative clustering into 3 clusters with the l2 metric
model_AC = AgglomerativeClustering(linkage='complete', metric='l2', n_clusters=3)

# Fit on the t-SNE components and read back one cluster label per row
labels = model_AC.fit(tsne_df).labels_

In [None]:
# Apply the seaborn theme BEFORE creating the figure: it only affects figures
# created afterwards, so setting it after drawing left this plot unstyled.
sns.set_theme(context='talk', style='darkgrid', palette='deep', font='sans-serif',
              font_scale=1, color_codes=True, rc=None)

# Silhouette coefficient of every point, and its mean over the dataset
silhouette_vals = silhouette_samples(tsne_df, labels)
silhouette_avg = np.mean(silhouette_vals)

# Silhouette diagram: one block of horizontal bars per cluster, stacked bottom-up
cluster_ids = np.unique(labels)
fig, ax = plt.subplots()
y_ticks = []
y_lower = 0
for cluster in cluster_ids:
    cluster_silhouette_vals = np.sort(silhouette_vals[labels == cluster])
    y_upper = y_lower + len(cluster_silhouette_vals)
    ax.barh(range(y_lower, y_upper), cluster_silhouette_vals, height=1)
    y_ticks.append((y_lower + y_upper) / 2.)  # tick in the middle of the block
    y_lower = y_upper

# Dashed red line marks the mean silhouette
ax.axvline(silhouette_avg, color="red", linestyle="--")
ax.set_yticks(y_ticks)
ax.set_yticklabels(cluster_ids)
ax.set_ylabel('Cluster')
ax.set_xlabel('Coefficient de silhouette')
plt.show()
print("Silhouette Score :",silhouette_avg)

In [None]:
feat1 = 'Component 1'
feat2 = 'Component 2'

# Two t-SNE components plus the agglomerative cluster label of each row
df_1 = pd.DataFrame({'x': tsne_df[feat1], 'y': tsne_df[feat2], 'label': labels})

# Sorted so that the legend entries appear in label order
clusters = sorted(df_1['label'].unique())

fig, ax = plt.subplots()

# One scatter call per cluster so that each gets its own colour and legend entry
for cluster in clusters:
    data = df_1[df_1['label'] == cluster]
    ax.scatter(data['x'], data['y'], label=f'Cluster {cluster}')

# The legend was previously never drawn, leaving the colours unidentified
ax.legend()
ax.set_xlabel(feat1)
ax.set_ylabel(feat2)
ax.set_title('Agglomerative Clustering')

plt.show()

In [None]:
# Kept for compatibility: only needed on Matplotlib < 3.2, where importing this
# module is what registers the '3d' projection used below.
from mpl_toolkits import mplot3d

feat1 = 'Component 1'
feat2 = 'Component 2'
feat3 = 'Component 3'

# Three t-SNE components plus the agglomerative cluster label of each row
df_1 = pd.DataFrame({'x': tsne_df[feat1], 'y': tsne_df[feat2], 'z': tsne_df[feat3], 'label': labels})

# Sorted so that the legend entries appear in label order
clusters = sorted(df_1['label'].unique())

fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(111, projection='3d')

# One scatter call per cluster so that each gets its own colour and legend entry
for cluster in clusters:
    data = df_1[df_1['label'] == cluster]
    ax.scatter(data['x'], data['y'], data['z'], label=f'Cluster {cluster}')

# The legend was previously never drawn, leaving the colours unidentified
ax.legend()
ax.set_xlabel(feat1)
ax.set_ylabel(feat2)
ax.set_zlabel(feat3)
ax.set_title('Agglomerative Clustering')

plt.show()

In [None]:
# Three t-SNE components plus the agglomerative cluster label of each row
df_3d = pd.DataFrame({'x': tsne_df['Component 1'], 'y': tsne_df['Component 2'], 'z': tsne_df['Component 3'], 'label': labels})

# Cluster ids are categories, not quantities: Plotly Express draws numeric
# columns with a continuous colour bar, so cast to str for discrete colours.
df_3d['label'] = df_3d['label'].astype(str)

# Interactive 3D scatter coloured by cluster
fig = px.scatter_3d(df_3d, x='x', y='y', z='z', color='label')

fig.show()

### 2nd model : KMeans Clustering

In [None]:
# KMeans clustering of the t-SNE components into 3 clusters.
# n_init is set explicitly because its default differs between scikit-learn
# versions; 10 restarts keeps the result consistent across them.
model_kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels_kmeans = model_kmeans.fit_predict(tsne_df)

In [None]:
# Disabled cell: silhouette diagram for the KMeans labels (copy of the
# agglomerative-clustering silhouette cell with `labels_kmeans`).
# NOTE(review): the original y-tick line still used `labels` instead of
# `labels_kmeans`; corrected below so the cell is right if it is re-enabled.

# # Compute the silhouette coefficient of every point
# silhouette_vals = silhouette_samples(tsne_df, labels_kmeans)

# # Compute the mean silhouette over the dataset
# silhouette_avg = np.mean(silhouette_vals)

# # Draw the silhouette diagram
# y_ticks = []
# y_lower, y_upper = 0, 0
# for i, cluster in enumerate(np.unique(labels_kmeans)):
#     cluster_silhouette_vals = silhouette_vals[labels_kmeans == cluster]
#     cluster_silhouette_vals.sort()
#     y_upper += len(cluster_silhouette_vals)
#     plt.barh(range(y_lower, y_upper), cluster_silhouette_vals, height=1)
#     y_ticks.append((y_lower + y_upper) / 2.)
#     y_lower += len(cluster_silhouette_vals)
# sns.set_theme(context='talk', style='darkgrid', palette='deep', font='sans-serif', 
#                   font_scale=1, color_codes=True, rc=None)
# plt.axvline(silhouette_avg, color="red", linestyle="--")
# plt.yticks(y_ticks, np.unique(labels_kmeans))
# plt.ylabel('Cluster')
# plt.xlabel('Coefficient de silhouette')
# plt.show()
# print("Silhouette Score :",silhouette_avg)

In [None]:
# Three t-SNE components plus the KMeans cluster label of each row
df_3d = pd.DataFrame({'x': tsne_df['Component 1'], 'y': tsne_df['Component 2'], 'z': tsne_df['Component 3'], 'label': labels_kmeans})

# Cluster ids are categories, not quantities: Plotly Express draws numeric
# columns with a continuous colour bar, so cast to str for discrete colours.
df_3d['label'] = df_3d['label'].astype(str)

# Interactive 3D scatter coloured by cluster
fig = px.scatter_3d(df_3d, x='x', y='y', z='z', color='label')

fig.show()