># Data Mining Project
## Discover and describe areas of interest and events from geo-located data.
### Team:
- **SCHLEE Adam**
- **KUSNO Louis**

>## I. Initialization

### 1.1. Importing Libraries

In [None]:
# Install the third-party packages used by this notebook
# (each `!` line hands the command to the shell from the notebook)

# numeric calculations
! pip install numpy

# data frames
! pip install pandas

# machine learning algorithms (scipy is imported below for the dendrogram)
! pip install scikit-learn
! pip install scipy

# plotting: charts, maps, interactive plots and word clouds
! pip install matplotlib
! pip install folium
! pip install plotly
! pip install plotly-express
! pip install nbformat==5.9.2
! pip install wordcloud
! pip install branca

# natural language processing
! pip install nltk

In [None]:
# load pandas to deal with the data
import pandas as pd
# plotting
import matplotlib.pyplot as plt
# folium for maps
import folium as fl
# numpy
import numpy as np
# scaler
from sklearn.preprocessing import StandardScaler
# k-means
from sklearn.cluster import KMeans
# silhouette scores
from sklearn.metrics import silhouette_score, silhouette_samples
# dendrogram
from scipy.cluster.hierarchy import dendrogram
# agglomerative clustering
from sklearn.cluster import AgglomerativeClustering
# DBSCAN
from sklearn.cluster import DBSCAN
# Nearest Neighbors
from sklearn.neighbors import NearestNeighbors
# Interactive plot
import plotly.express as px 
# word cloud
from wordcloud import WordCloud
# stopwords and tokenization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')
# branca
import branca

### 1.2. Data transformation

#### 1.2.1. Data loading

In [None]:
# Load the raw, comma-separated Flickr export
raw_path = "data/flickr_data2.csv"
data = pd.read_csv(raw_path, sep=",")

# Print a summary of the columns, their dtypes and non-null counts
data.info()

#### 1.2.2. Data cleaning

##### 1.2.2.1. Removing duplicates

In [None]:
# Drop exact duplicate rows, then renumber the remaining rows from 0
data = data.drop_duplicates().reset_index(drop=True)

data.info()

##### 1.2.2.2. Renaming columns

In [None]:
# Strip every space from the column names (e.g. " lat" -> "lat")
data.columns = [name.replace(' ', '') for name in data.columns]

##### 1.2.2.3. Normalizing data

In [None]:
# Check if "Unnamed:16" contains a valid year (4-digit number)
def is_year(value):
    """Return True when *value* is a number between 1000 and 9999 inclusive.

    Strings, None and other non-numeric objects are rejected.  NaN is
    rejected as well because both range comparisons evaluate to False for it.
    NumPy scalar types are accepted alongside the built-in ``int`` and
    ``float`` so that the check does not depend on how the value was boxed.
    """
    numeric_types = (int, float, np.integer, np.floating)
    # bool(...) turns a NumPy boolean into a plain Python bool
    return bool(isinstance(value, numeric_types) and 1000 <= value <= 9999)

# Row labels of the records whose "Unnamed:16" cell holds a year: for those
# rows the date fields are read one column to the right of their target
index = data.loc[data["Unnamed:16"].map(is_year)].index

# Columns listed in the order in which values are pulled one step to the left
upload_cols = [
    "date_upload_minute",
    "date_upload_hour",
    "date_upload_day",
    "date_upload_month",
    "date_upload_year",
    "Unnamed:16",
]
taken_cols = [
    "date_taken_minute",
    "date_taken_hour",
    "date_taken_day",
    "date_taken_month",
    "date_taken_year",
]
has_taken_cols = "date_taken_minute" in data.columns

for row in index:
    # Each "date_upload_*" cell receives the value of its right-hand neighbour
    for target, source in zip(upload_cols, upload_cols[1:]):
        data.loc[row, target] = data.loc[row, source]
    data.loc[row, "Unnamed:16"] = None

    if has_taken_cols:
        # The former minute value ends up in the year column.
        # NOTE(review): confirm this rotation matches the displaced layout
        tmpYear = data.loc[row, "date_taken_minute"]
        for target, source in zip(taken_cols, taken_cols[1:]):
            data.loc[row, target] = data.loc[row, source]
        data.loc[row, "date_taken_year"] = tmpYear

# The overflow "Unnamed" columns are no longer needed
data = data.drop(columns=[name for name in data.columns if "Unnamed" in name])

data.info()

##### 1.2.2.4. Change data types

In [None]:
# Parse the upload minute as a number (values that cannot be parsed become
# missing) and store it as a nullable integer
data["date_upload_minute"] = pd.to_numeric(
    data["date_upload_minute"], errors="coerce"
).astype("Int64")

# The upload year is stored as a nullable integer as well
data["date_upload_year"] = data["date_upload_year"].astype("Int64")

# We remove duplicates that may have been revealed after the modifications
data = data.drop_duplicates().reset_index(drop=True)

data.info()

### 1.3. Saving cleaned data

In [None]:
# Write the cleaned table to disk without the row index; the analysis part
# below reloads it from this file
data.to_csv("data/dataset.csv", index=False)

Thanks to the data cleaning, we will be able to work on a clean dataset.
The original dataset had 420241 entries. 
After cleaning, we have 168099 entries.

## II. Data Analysis

### 2.1. Data Sampling

In [None]:
# Seed NumPy's global generator so that the sampling below, which is given
# no explicit random_state, is repeatable
np.random.seed(42)

# Requested number of samples (168098 was noted as the maximum)
n_samples = 50000

# Load the cleaned data
data = pd.read_csv("data/dataset.csv")

# Never request more rows than the file holds: sampling without replacement
# raises a ValueError in that case
n_samples = min(n_samples, len(data))

# Sample the data and renumber the rows from 0
data = data.sample(n_samples).reset_index(drop=True)

# Display the data
data.head()

### 2.2. Data Visualization

In [None]:
# Display the data on a map using Folium

# Colour palette shared by the cluster maps further down; cluster ids are
# mapped onto it modulo its length
colors = [
    "red", "blue", "green", "yellow", "purple", "orange", "pink", "white", "brown", "cyan", "magenta", "olive", "lime", "navy", "teal", "maroon", "silver", "gold", "crimson"
]

# Create the map (named so that the builtin `map` is not shadowed)
overview_map = fl.Map(location=[48.8566, 2.3522], zoom_start=12)

# Add a dark tile layer to the map
fl.TileLayer('Cartodb dark_matter').add_to(overview_map)

# Iterate over the two coordinate columns directly instead of materialising
# a full row for every lookup
for lat, lon in zip(data['lat'], data['long']):
    fl.Circle(
        location=[lat, lon],
        radius=10,
        color='crimson',
        fill=True,
        # NOTE(review): 'dark' does not look like a CSS colour name — confirm
        fill_color='dark'
    ).add_to(overview_map)

# Fit the map to the bounds of the plotted points
overview_map.fit_bounds(overview_map.get_bounds())

# Display the map
overview_map

### 2.3. Data Clustering

#### 2.3.1. K-means clustering

##### 2.3.1.1. Scaling data for K-means

In [None]:
# Only the geographic coordinates take part in the clustering
keep_col = ['lat', 'long']
df_clustering = data[keep_col]

# Fit the scaler on the coordinates, then standardize them
scaler = StandardScaler()
scaled_data = scaler.fit(df_clustering).transform(df_clustering)

# Same values as a data frame, keeping the original column names
scaled_data_df = pd.DataFrame(scaled_data, columns=df_clustering.columns)
scaled_data_df.head()

##### 2.3.1.2. Looking for the optimal number of clusters with the Elbow method

In [None]:
# Range of k (number of clusters): 1 up to, but excluding, len // 1000
range_k = range(1, len(scaled_data_df)//1000)

# List to store the inertia values (sum of squared distances)
inertias = []

# Loop over the range of k values to apply k-means
for i in range_k:
    # Progress message; the denominator is the largest k actually tried
    print(f"Applying k-means with {i}/{range_k[-1]} clusters")
    kmeans = KMeans(n_clusters=i, init='k-means++')
    # Fit the data to the model
    kmeans.fit(scaled_data_df)
    # Append the inertia value to the list
    inertias.append(kmeans.inertia_)

# Visualize the inertia values to find the optimal number of clusters.
# Only the first quarter of the curve is drawn.
n = len(inertias)//4
xticks_new = np.arange(1, n+1)
plt.plot(xticks_new, inertias[0:n], 'bx-')
plt.title('Finding the optimal number of clusters')
plt.xticks(xticks_new)
plt.xlabel('# clusters')
plt.ylabel('Sum of squared distances')
plt.show()

Using the Elbow method, we found that the optimal number of clusters is very small (around 3, 4, 5 or 6 clusters).

##### 2.3.1.3. Applying K-means clustering

In [None]:
# Cluster count suggested by the elbow method, and a much larger count
# fitted alongside it for comparison
k = 6
k_big = 100

# Build and fit both models (the small one first, then the big one)
kmeans = KMeans(n_clusters=k, init='k-means++').fit(scaled_data_df)
kmeans_big = KMeans(n_clusters=k_big, init='k-means++').fit(scaled_data_df)

# Cluster assignment of every sample and the inertia of each fit
labels, labels_big = kmeans.labels_, kmeans_big.labels_
inertia, inertia_big = kmeans.inertia_, kmeans_big.inertia_

print(f"k-means labels: {labels}")
print(f"k-means labels (big): {labels_big}")
print(f"Sum of squared distances: {inertia}")
print(f"Sum of squared distances (big): {inertia_big}")



##### 2.3.1.4. Visualizing K-means clustering

In [None]:
# Display the clusters on two side-by-side Folium maps, one colour per cluster

# Add the cluster labels to the data
data['cluster'] = labels
data['cluster_big'] = labels_big


def _half_width_map(left):
    """Return a dark-tiled Folium map half a figure wide, starting at *left*."""
    half_map = fl.Map(
        location=[48.8566, 2.3522],
        zoom_start=12,
        position='absolute',
        left=left,
        width='50%',
        height='100%'
    )
    fl.TileLayer('Cartodb dark_matter').add_to(half_map)
    return half_map


# Left map: k clusters; right map: k_big clusters
map_small = _half_width_map('0%')
map_big = _half_width_map('50%')

for cluster_map, label_column in ((map_small, 'cluster'), (map_big, 'cluster_big')):
    # Read the needed columns directly instead of building a row per lookup
    for lat, lon, cluster in zip(data['lat'], data['long'], data[label_column]):
        # The palette is shorter than the number of clusters, so colours repeat
        color = colors[cluster % len(colors)]
        fl.Circle(
            location=[lat, lon],
            radius=2,
            color=color,
            fill=True,
            fill_color=color,
            popup=f"Cluster: {cluster}"
        ).add_to(cluster_map)

    # Fit the map to the bounds of its points
    cluster_map.fit_bounds(cluster_map.get_bounds())

# Display the maps next to each other in one figure
f = branca.element.Figure()
f.add_child(map_small)
f.add_child(map_big)
f

##### 2.3.1.5. Silhouette score for K-means clustering

In [None]:
# Per-sample silhouette values are computed once per clustering; the average
# score is their mean, so the pairwise-distance work is not repeated by a
# separate silhouette_score call
sample_silhouette_values = silhouette_samples(scaled_data, labels, metric='euclidean')
silhouette_avg = np.mean(sample_silhouette_values)
data['silhouette kmeans'] = sample_silhouette_values

sample_silhouette_values_big = silhouette_samples(scaled_data, labels_big, metric='euclidean')
silhouette_avg_big = np.mean(sample_silhouette_values_big)
data['silhouette kmeans big'] = sample_silhouette_values_big

print(f"Average silhouette score: {silhouette_avg}")
print(f"Sample Silhouette values: {sample_silhouette_values}")

print(f"Average silhouette score (big): {silhouette_avg_big}")
print(f"Sample Silhouette values (big): {sample_silhouette_values_big}")

In [None]:
def _draw_silhouette(ax, values, cluster_labels, cluster_count, average):
    """Draw one stacked silhouette band per cluster on *ax*.

    values: per-sample silhouette values; cluster_labels: cluster id of each
    sample; cluster_count: number of cluster ids to draw, starting at 0;
    average: mean silhouette value, drawn as a dashed red vertical line.
    """
    y_lower = 10
    for cluster_id in range(cluster_count):
        # Silhouette values of this cluster, smallest first
        cluster_values = np.sort(values[cluster_labels == cluster_id])
        cluster_size = cluster_values.shape[0]
        y_upper = y_lower + cluster_size

        # Fill the silhouette band and label it
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_values, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * cluster_size, f'Cluster {cluster_id}')

        # Leave a gap of 10 rows before the next band
        y_lower = y_upper + 10

    # Vertical line for the average silhouette score
    ax.axvline(x=average, color="red", linestyle="--")
    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax.set_xlabel(f"Silhouette coefficient values for k = {cluster_count}")
    ax.set_ylabel("Cluster label")


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left: k-means with k clusters; right: k-means with k_big clusters
_draw_silhouette(ax1, sample_silhouette_values, labels, k, silhouette_avg)
_draw_silhouette(ax2, sample_silhouette_values_big, labels_big, k_big, silhouette_avg_big)

plt.show()

#### 2.3.2. Hierarchical clustering

##### 2.3.2.1. Function definition

In [None]:
# Function to plot the dendrogram of the hierarchical clustering
def plot_dendrogram(model, lbls, title='Hierarchical Clustering Dendrogram', x_title='Clusters', **kwargs):
    """Plot the dendrogram of a fitted agglomerative clustering model.

    model: fitted estimator; its ``children_``, ``distances_`` and
        ``labels_`` attributes are read.
    lbls: leaf labels handed to ``dendrogram``.
    title, x_title: figure title and x-axis label.
    **kwargs: extra keyword arguments forwarded to ``dendrogram``; they take
        precedence over the defaults set here (``labels``, ``leaf_rotation``).

    Returns the matplotlib figure that was created.
    """
    # Number of original samples under each merge node: a child index below
    # n_samples is a leaf, anything else refers to an earlier merge
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    # Assemble the linkage matrix from children, merge distance and count
    linkage_matrix = np.column_stack([
           model.children_,
           model.distances_,
           counts
       ]).astype(float)

    fig = plt.figure(figsize=(12, 8))

    # Plot the corresponding dendrogram, honouring caller-supplied options
    dendrogram_options = {'labels': lbls, 'leaf_rotation': 90, **kwargs}
    dendrogram(linkage_matrix, **dendrogram_options)

    plt.title(title)
    plt.xlabel(x_title)
    plt.ylabel('Distance')

    fig.show()

    return fig


# Hierarchical clustering with different linkages
def hierarchical(data, labels, metric='euclidean', linkage='average', n_clusters=None, dist_thres=None):
    """Fit an agglomerative clustering on *data* and plot its dendrogram.

    labels: leaf labels for the dendrogram.
    metric, linkage, n_clusters: passed to AgglomerativeClustering.
    dist_thres: passed as its ``distance_threshold``.

    Returns the fitted model and the dendrogram figure.
    """
    clusterer = AgglomerativeClustering(
        n_clusters=n_clusters,
        distance_threshold=dist_thres,
        metric=metric,
        linkage=linkage,
        compute_full_tree=True,
        compute_distances=True,
    )
    clusterer.fit(data)

    figure = plot_dendrogram(
        model=clusterer,
        lbls=labels,
        title=f'Hierarchical Clustering Dendrogram, linkage: {linkage}, n_clusters: {n_clusters}',
        x_title='Clusters',
    )

    return clusterer, figure

##### 2.3.2.2. Applying hierarchical clustering

In [None]:
# Work on a 1000-row sample of the data, renumbered from 0
hierarchical_data = data.sample(1000).reset_index(drop=True)

# Only the coordinates are clustered
keep_col = ['lat', 'long']
df_clustering = hierarchical_data[keep_col]

# Standardize the sampled coordinates (the names `scaler`, `keep_col` and
# `df_clustering` are rebound here)
scaler = StandardScaler()
scaled_data_hierarchical = scaler.fit_transform(df_clustering)
scaled_data_df_hierarchical = pd.DataFrame(scaled_data_hierarchical, columns=df_clustering.columns)

# Number of clusters
k_hierarchical = 50

linkage = ['complete', 'average', 'single']

# Row numbers are used as leaf labels in the dendrograms
leaf_labels = list(scaled_data_df_hierarchical.index)

for link in linkage:
    m, f = hierarchical(
        scaled_data_hierarchical,
        leaf_labels,
        metric='euclidean',
        linkage=link,
        n_clusters=k_hierarchical,
        dist_thres=None,
    )

    # Store the assignment and the per-sample silhouette of this linkage
    hierarchical_data[f'cluster {link}'] = m.labels_
    sample_silhouette_values = silhouette_samples(scaled_data_hierarchical, m.labels_, metric='euclidean')
    hierarchical_data[f'silhouette {link}'] = sample_silhouette_values

    silhouette_avg = silhouette_score(scaled_data_hierarchical, m.labels_, metric='euclidean')
    print(f"Linkage: {link}, silhouette score: {silhouette_avg}")


##### 2.3.2.3. Visualizing hierarchical clustering

In [None]:
# One map per linkage; each takes a third of the figure width.
# Horizontal placement: single at 0%, complete at 33%, average at 66%.
panel_left = {'single': '0%', 'complete': '33%', 'average': '66%'}

linkage_maps = {}
for link_name, left in panel_left.items():
    linkage_maps[link_name] = fl.Map(
        location=[48.8566, 2.3522],
        zoom_start=12,
        position='absolute',
        left=left,
        width='33%',
        height='100%'
    )

map_single = linkage_maps['single']
map_complete = linkage_maps['complete']
map_average = linkage_maps['average']

# Add a dark tile layer to each map
for linkage_map in (map_single, map_complete, map_average):
    fl.TileLayer('Cartodb dark_matter').add_to(linkage_map)

# Add the points to each map, coloured by the cluster of that linkage
for link_name in ('single', 'average', 'complete'):
    cluster_column = hierarchical_data['cluster ' + link_name]
    for lat, lon, cluster in zip(hierarchical_data['lat'], hierarchical_data['long'], cluster_column):
        shade = colors[cluster % len(colors)]
        fl.Circle(
            location=[lat, lon],
            radius=2,
            color=shade,
            fill=True,
            fill_color=shade,
            popup=f"Cluster: {cluster}"
        ).add_to(linkage_maps[link_name])

# Fit the maps to the bounds of their points
for linkage_map in (map_single, map_complete, map_average):
    linkage_map.fit_bounds(linkage_map.get_bounds())

# Display the three maps in one figure
f = branca.element.Figure()
for linkage_map in (map_single, map_average, map_complete):
    f.add_child(linkage_map)
f

#### 2.3.3. DBSCAN clustering

##### 2.3.3.1. Looking for the optimal epsilon value

In [None]:
# Minimum number of points in a neighborhood to define a cluster
min_pts = len(data)//1000

def find_optimal_eps(data, min_pts):
    """Build the sorted k-distance curve used to choose DBSCAN's epsilon.

    data: samples handed to NearestNeighbors.
    min_pts: number of neighbours requested for every sample.

    Returns a Plotly line figure of the distances in the last requested
    neighbour column, sorted in ascending order.
    """
    # Distances from every sample to its min_pts nearest neighbours.
    # NOTE(review): the query points are the fitted points themselves, so the
    # first neighbour is presumably the point itself — confirm
    neighbours = NearestNeighbors(n_neighbors=min_pts).fit(data)
    distances, _ = neighbours.kneighbors(data)

    # Keep the column at position min_pts - 1 and sort it in ascending order
    k_distances = np.sort(distances[:, min_pts-1])

    axis_labels = {
        'x': 'Points sorted by distance',
        'y': f'Distance to {min_pts}th nearest neighbor',
    }
    return px.line(
        x=range(len(k_distances)),
        y=k_distances,
        title=f'K-distance Graph (k={min_pts})',
        labels=axis_labels,
    )

fig = find_optimal_eps(scaled_data, min_pts)
fig.show()

This method is not suitable for our dataset because it is too large.

##### 2.3.3.2. Selecting the optimal epsilon value

According to our tests, the optimal epsilon value that we found is $0.006$,
with a `min_samples` value of `len(data) // 1000`.

In [None]:
# Neighbourhood radius handed to DBSCAN below
optimal_eps = 0.006
# Minimum neighbourhood size: one thousandth of the number of rows in `data`
min_samples = len(data)//1000

##### 2.3.3.3. Applying DBSCAN clustering

In [None]:
# Cluster the standardized coordinates; noise points are labelled -1
dbscan = DBSCAN(eps=optimal_eps, min_samples=min_samples)
labels = dbscan.fit_predict(scaled_data)

# Number of noise points, and number of clusters excluding the noise label
n_noise = int(np.sum(labels == -1))
n_clusters = len(set(labels)) - (1 if n_noise else 0)
print(f"\nNumber of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")

data['cluster dbscan'] = labels

# Keep only the rows that belong to a cluster.  The explicit copy makes the
# result independent of `data`, because new columns are added to it later.
no_noise_data = data.loc[data['cluster dbscan'] != -1].copy()
no_noise_data = no_noise_data.reset_index(drop=True)

##### 2.3.3.4. Visualizing DBSCAN clustering

In [None]:
# Create the map (named so that the builtin `map` is not shadowed)
dbscan_map = fl.Map(location=[48.8566, 2.3522], zoom_start=12)

# Add a dark tile layer to the map
fl.TileLayer('Cartodb dark_matter').add_to(dbscan_map)

# Add the clustered points with one colour per cluster.  `no_noise_data`
# holds no noise rows (label -1), so no filtering is needed here.
clustered_points = zip(
    no_noise_data['lat'],
    no_noise_data['long'],
    no_noise_data['cluster dbscan'],
)
for lat, lon, cluster in clustered_points:
    # The palette is reused cyclically when there are more clusters than colours
    color = colors[cluster % len(colors)]
    fl.Circle(
        location=[lat, lon],
        radius=2,
        color=color,
        fill=True,
        fill_color=color,
        popup=f"Cluster: {cluster}\n"
    ).add_to(dbscan_map)

# Fit the map to the bounds of the plotted points
dbscan_map.fit_bounds(dbscan_map.get_bounds())

# Show the map
dbscan_map

##### 2.3.3.5. Silhouette score for DBSCAN clustering

In [None]:
# Standardize the coordinates of the clustered (noise-free) rows with a
# scaler of their own, leaving the earlier `scaler` untouched
keep_col = ['lat', 'long']
no_noise_df_clustering = no_noise_data[keep_col]
no_noise_scaler = StandardScaler()
no_noise_scaled_data = no_noise_scaler.fit_transform(no_noise_df_clustering)
no_noise_scaled_data_df = pd.DataFrame(data=no_noise_scaled_data, columns=no_noise_df_clustering.columns)

# Redefine labels: cluster id of every remaining row
labels = no_noise_data['cluster dbscan']

# Per-sample silhouette values are computed once; the average score is
# their mean
sample_silhouette_values = silhouette_samples(no_noise_scaled_data, labels, metric='euclidean')
silhouette_avg = np.mean(sample_silhouette_values)
no_noise_data['silhouette dbscan'] = sample_silhouette_values

print(f"Average silhouette score: {silhouette_avg}")
print(f"Sample Silhouette values: {sample_silhouette_values}")

fig, ax = plt.subplots(1, 1, figsize=(15, 5))

# Silhouette plot for DBSCAN: one stacked band per cluster
cluster_of_row = labels.to_numpy()
y_lower = 10

for i in range(n_clusters):
    # Silhouette values of cluster i, smallest first
    ith_cluster_values = np.sort(sample_silhouette_values[cluster_of_row == i])

    size_cluster_i = ith_cluster_values.shape[0]
    y_upper = y_lower + size_cluster_i

    # Fill the silhouette band and label it
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_values, alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, f'Cluster {i}')

    # Leave a gap of 10 rows before the next band
    y_lower = y_upper + 10

# Add vertical line for average silhouette score
ax.axvline(x=silhouette_avg, color="red", linestyle="--")
ax.set_yticks([])
ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
ax.set_xlabel(f"Silhouette coefficient values for DBSCAN (eps={optimal_eps}, min_samples={min_samples}) with average silhouette score: {silhouette_avg}")
ax.set_ylabel("Cluster label")

# Shown explicitly, like the k-means silhouette figure
plt.show()

>## III. Text pattern mining

### 3.1. Preprocessing

As with other types of data, data preprocessing plays a major role with textual data.
- Removing stopwords (words that are used a lot while not bringing meaningful information,
such as “is”, “the”, “a”, ... or their french equivalent “est”, “le”, “un”, ...).
- Similarly, it will be interesting to remove frequent words in the dataset that are not a
stopword or meaningful (e.g. “picture”). You might consider visualizing the data with a
word cloud.

In [None]:
# Cleaning of tags and title columns

# Work on an independent copy so that the column assignments below are
# plain writes to this frame
no_noise_data = no_noise_data.copy()

# stopwords.words() below reads the NLTK 'stopwords' corpus
nltk.download('stopwords')

# Lowercase first, then replace every character that is not a letter, a
# space or one of the listed accented letters by a space.  Lowercasing
# first matters: only lowercase accented letters are listed in the pattern.
# Missing values are replaced by empty strings afterwards (assigned back
# rather than filled in place on a column selection).
for column in ('tags', 'title'):
    cleaned = (
        no_noise_data[column]
        .str.lower()
        .str.replace(r'[^a-zA-Z éèôàëêöâäîïùûüç]', ' ', regex=True)
    )
    no_noise_data[column] = cleaned.fillna('')

# English and French stopwords (removed from the title only)
stop_words = set(stopwords.words('english'))
stop_words_fr = set(stopwords.words('french'))
title_stop_words = stop_words | stop_words_fr

# Frequent words that carry no information in this dataset
frequent_words = ['photo', 'ddc', 'groupeserveur', 'picture', 'lumix', 'panasonic', 'image', 'photography', 'photograph', 'photographie', 'instagram', 'instagramapp', 'uploaded', 'lyon', 'france', 'flickr', 'photographer', 'photographie', 'streetphotography','iphone','lesphotosdevoyage','img','jpg','jpeg','png','iphoneography']


def _keep_token(token, banned):
    """Return True when *token* survives every filter.

    A token is dropped when it has 3 characters or fewer (which also covers
    single letters), when it is in *banned*, or when it contains one of the
    frequent words (which also covers being equal to one).
    """
    return (
        len(token) > 3
        and token not in banned
        and not any(word in token for word in frequent_words)
    )


def _filter_text(text, banned):
    """Return *text* reduced to the whitespace-separated tokens that are kept."""
    return ' '.join(token for token in text.split() if _keep_token(token, banned))


no_noise_data['title'] = no_noise_data['title'].apply(lambda text: _filter_text(text, title_stop_words))
no_noise_data['tags'] = no_noise_data['tags'].apply(lambda text: _filter_text(text, frozenset()))

# Join titles and tags in a new column
no_noise_data['text'] = no_noise_data['title'] + ' ' + no_noise_data['tags']

# Turn the text into a list of tokens.  The authors note that tokenization
# brings little for tags and titles, because words are often stuck together
# and there is no punctuation; word_tokenize is applied nonetheless.
no_noise_data['text'] = no_noise_data['text'].apply(word_tokenize)

# We create a list of all the words in the text column
words = [token for tokens in no_noise_data['text'] for token in tokens]

### 3.2. Text visualization

In [None]:
# Build one long string from the words and generate the word cloud from it
corpus = ' '.join(words)
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
).generate(corpus)

# Show the cloud as an image, without axes or padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### 3.3. Text clustering

In [None]:
from collections import Counter

# Token list of every row.  At the end of this cell the 'text' column is
# replaced by space-joined strings, so on a second run the strings are
# split back into tokens instead of being iterated character by character.
token_lists = [
    tokens if isinstance(tokens, list) else str(tokens).split()
    for tokens in no_noise_data['text']
]
cluster_ids = no_noise_data['cluster dbscan'].tolist()

# Frequency of every word overall and inside each cluster
words = Counter()
words_by_cluster = {i: Counter() for i in range(n_clusters)}
for cluster_id, tokens in zip(cluster_ids, token_lists):
    words.update(tokens)
    words_by_cluster[cluster_id].update(tokens)

# Score of every word in every cluster, computed once per (cluster, word).
# The term frequency is the count inside the cluster divided by the number
# of rows; the inverse frequency is the number of rows divided by the
# overall count.  The number of rows cancels out in the product, so the
# score is, up to rounding, the share of a word's occurrences that fall in
# the cluster.
n_rows = len(no_noise_data)
tf_idf = {
    i: {
        word: (count / n_rows) * (n_rows / words[word])
        for word, count in words_by_cluster[i].items()
    }
    for i in range(n_clusters)
}

# The text column goes back to plain strings
no_noise_data['text'] = [' '.join(tokens) for tokens in token_lists]

# The 10 highest-scoring words are determined once per cluster and then
# attached to every row of that cluster
top_words = {
    i: ' '.join(sorted(scores, key=scores.get, reverse=True)[:10])
    for i, scores in tf_idf.items()
}
no_noise_data['important_words'] = no_noise_data['cluster dbscan'].map(top_words)

### 3.4. Date classification

In [None]:
# Rename the "date_taken_*" columns to the component names expected by
# pd.to_datetime and assemble them into one datetime column
no_noise_data = no_noise_data.rename(columns={'date_taken_year': 'year', 'date_taken_month': 'month', 'date_taken_day': 'day', 'date_taken_hour': 'hour', 'date_taken_minute': 'minute'})
no_noise_data['date'] = pd.to_datetime(no_noise_data[['year', 'month', 'day', 'hour', 'minute']])

# Earliest and latest date of every cluster.  The grouped min/max skip
# missing dates, so a row without a usable date cannot block the result of
# its cluster.
span = no_noise_data.groupby('cluster dbscan')['date'].agg(['min', 'max'])

# Same information as a dictionary: cluster id -> {'min': ..., 'max': ...}
dates = {i: {'min': None, 'max': None} for i in range(n_clusters)}
for cluster_id in span.index:
    dates[cluster_id] = {
        'min': span.at[cluster_id, 'min'],
        'max': span.at[cluster_id, 'max'],
    }

# A cluster whose dates span more than 30 days (an arbitrary threshold of
# about one month) is labelled 'permanent', any other cluster 'punctual'.
# Every row receives the label of its cluster.
lasts_over_a_month = (span['max'] - span['min']).dt.days > 30
row_is_permanent = no_noise_data['cluster dbscan'].map(lasts_over_a_month)
no_noise_data['temporal'] = np.where(row_is_permanent, 'permanent', 'punctual')

### 3.5. Final Visualization

In [None]:
# Final map: every clustered point, coloured by DBSCAN cluster, with the
# temporal label in the popup and the cluster's important words as tooltip
final_map = fl.Map(location=[48.8566, 2.3522], zoom_start=12)

fl.TileLayer('Cartodb dark_matter').add_to(final_map)

# The temporal label is stored in the 'temporal' column
point_fields = zip(
    no_noise_data['lat'],
    no_noise_data['long'],
    no_noise_data['cluster dbscan'],
    no_noise_data['temporal'],
    no_noise_data['important_words'],
)

# Add points to the map with different colors for each cluster
for lat, lon, cluster, temporal_cluster, important_words in point_fields:
    # The palette is reused cyclically when there are more clusters than colours
    color = colors[cluster % len(colors)]
    fl.Circle(
        location=[lat, lon],
        popup=f"Cluster: {cluster}\n{temporal_cluster}",
        tooltip=f"{important_words}",
        radius=10,
        color=color,
        fill_color=color,
        fill=True,
    ).add_to(final_map)

# Fit the map to the bounds of the plotted points
final_map.fit_bounds(final_map.get_bounds())

# Write the map to a standalone HTML file
final_map.save("map.html")