In [None]:
# Libraries: Standard ones
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Library for boxplots
import seaborn as sns

# K-means function
from sklearn.cluster import KMeans

# Functions for silhouette
from sklearn.metrics import silhouette_samples, silhouette_score

# Function to standardize the data 
from sklearn.preprocessing import scale

# Functions for hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist



#for map 
import folium
from folium.plugins import FastMarkerCluster
from folium.vector_layers import CircleMarker
import branca.colormap as cm

In [None]:
### Step 1: load the raw table and display it to get a first feel for the data.
# The CSV is ';'-separated; pandas names its first column 'Unnamed: 0',
# which is relabelled 'Ville' here.
temperature = (
    pd.read_csv("temperat.csv", sep=";")
      .rename(columns={'Unnamed: 0': 'Ville'})
)
temperature

In [None]:
### Step 2: summary statistics of the table, as reported by DataFrame.describe().
summary_statistics = temperature.describe()
summary_statistics

In [None]:
### Show the same information as boxplots with seaborn, since the table is still a lot of data to read.
# The summary and location columns are removed first so that they are not drawn.
summary_and_location_columns = ["Moyenne", "Amplitude", "Latitude", "Longitude", "Region"]

boxplot_fig, boxplot_axes = plt.subplots()
boxplot_axes.set_title("boxplot distribution for the temperature")
temperature_boxplot = sns.boxplot(
    data=temperature.drop(summary_and_location_columns, axis=1),
    ax=boxplot_axes,
)

In [None]:
### Represented like this, we can't extract clusters, so let's represent it differently:
### every city is placed by its coordinates and coloured by a column chosen by the user.
value_to_visualize = input("What value do you want to visualize ( Amplitude, Moyenne, etc) ? \n").strip() #so we can explore the data better

# Validate the answer right away: the chosen name is reused by several later cells, and it is
# used in arithmetic (colour scaling), so a typo or a text column would otherwise only fail
# further down with a much less helpful KeyError / TypeError.
numeric_columns = [str(name) for name in temperature.select_dtypes(include="number").columns]
if value_to_visualize not in numeric_columns:
    raise ValueError(
        f"Unknown or non-numeric column {value_to_visualize!r}; "
        f"choose one of: {', '.join(numeric_columns)}"
    )

plt.figure()
plt.title("2D visualization of the data : Latitude / Longitude")
coloured_points = plt.scatter(temperature["Latitude"], temperature["Longitude"], c=temperature[value_to_visualize])
plt.xlabel("Latitude")
plt.ylabel("Longitude")
# The colour is the only channel carrying the selected value, so give it a legend.
plt.colorbar(coloured_points, label=value_to_visualize);

In [None]:
### Let's now present the selected value city by city.
plt.figure(figsize=(12, 4))
# The title names what is actually plotted (the selected column for each city).
plt.title(f"{value_to_visualize} by city")
plt.scatter(temperature["Ville"], temperature[value_to_visualize])
plt.xlabel("Ville")
plt.ylabel(value_to_visualize)
# One tick per city: rotate the names so that they do not overlap.
plt.xticks(rotation=90);

In [None]:
# Display every city on an interactive map, coloured by the field selected above.
file=temperature
field_used=value_to_visualize

# NOTE(review): `map` shadows the Python builtin of the same name. The name is kept
# because the following cell displays the map through it.
map = folium.Map(tiles='cartodbpositron', zoom_start=3 ,location=[48.499998 ,23.3833318])

linear=cm.linear.RdYlBu_06.scale(0, 1)

# Bounds of the selected field; they do not change from row to row, so compute them once.
field_min = file[field_used].min()
field_span = file[field_used].max() - field_min

for index,row in file.iterrows():
    # Relative position of this city's value between the minimum and the maximum.
    # A constant column has a zero span: place every city mid-scale instead of dividing by zero.
    relative_value = (row[field_used] - field_min) / field_span if field_span else 0.5
    # The colormap is read in reverse: the highest value maps to linear(0).
    marker_color = linear(1 - relative_value)
    folium.CircleMarker(location=[row['Latitude'],row['Longitude']],
                        radius=5,
                        popup=str(row['Ville'])+': '+str(row[field_used]),
                        line_color=None,
                        fill_color=marker_color,
                        fill_opacity=1,
                        color=marker_color).add_to(map)
    


In [None]:
# Display the folium map built in the previous cell (here `map` is that folium.Map, not the builtin).
map



In [None]:
### The results are relevant and correspond to what we expected.
# Let's now look at how the selected field (for instance the mean temperature) is spread within each region.

region_fig, region_axes = plt.subplots()
region_axes.set_title('2D representation of temperature dataset')
region_axes.scatter(temperature['Region'], temperature[field_used])  # field_used can be replaced by e.g. 'Moyenne'
plt.show()


In [None]:
# For different fields (Moyenne / Amplitude) there are a lot of disparities within the given regions.
# Let's use clustering to group together cities with similar characteristics.

# Features for the clustering: what is left once the city name, the region, the coordinates
# and the two summary columns are removed (presumably the monthly temperatures -- confirm against the CSV header).
new_temp = temperature.drop(['Ville','Region','Latitude','Longitude',"Amplitude","Moyenne" ], axis=1)

# First attempt with 4 clusters, 4 being a purely arbitrary number.
# random_state pins the random initialisation so that re-running the notebook gives the same clusters.
kmeans = KMeans(n_clusters=4,n_init=10,init='random',random_state=42).fit(new_temp)
centers=kmeans.cluster_centers_

# One colour per cluster, each city placed by its coordinates (drawing this on a map would be better).
plt.figure()
plt.title("K-means clusters (k=4) : Latitude / Longitude")
plt.scatter(temperature['Latitude'], temperature['Longitude'],c=kmeans.labels_)
plt.xlabel("Latitude")
plt.ylabel("Longitude");

In [None]:
## Choose the number of clusters with the silhouette score: for every candidate k the clustering is
## repeated several times and the silhouette scores are averaged. The best k is the one with the
## HIGHEST mean silhouette (this is a maximum, not an "elbow" as with the inertia curve).
range_n_clusters = range(2,15) # candidate numbers of clusters for our dataset (that has "30~cities")
n_repeats = 20                 # clusterings averaged for each candidate k
sil_score = []

for n in range_n_clusters:
    sil_mean=[]
    for i in range(n_repeats):
        # random_state=i: every repeat starts from a different initialisation, yet the whole
        # sweep gives the same curve each time the notebook is re-run.
        clusterer = KMeans(n_clusters=n, n_init=13, init = 'random', random_state=i).fit(new_temp)
        cluster_labels = clusterer.labels_ # cluster index assigned to each row
        silhouette_avg = silhouette_score(new_temp, cluster_labels) # mean silhouette of this clustering
        sil_mean.append(silhouette_avg)
    sil_score.append(np.mean(sil_mean)) # one averaged score per candidate k, plotted below

plt.figure()
plt.title('Mean silhouette score for each number of clusters')
plt.scatter(range_n_clusters, sil_score)
plt.xlabel('number of clusters')
plt.ylabel('mean silhouette score');

In [None]:
# When running the silhouette analysis above several times we get 5 or sometimes 6 clusters as the optimal number.
# We continue with 5: let's cluster with k=5 and show the result on the map, the cities of one cluster sharing a colour.
# random_state keeps the cluster numbering stable between runs; later cells select clusters by their number.
kmeans = KMeans(n_clusters=5,n_init=10,init='random',random_state=42).fit(new_temp)

# Attach the cluster index of every city as a new 'label' column (positional, so independent of the row index).
new_temp_with_labels = temperature.assign(label=kmeans.labels_)

# Table and column used to colour the map
file=new_temp_with_labels
field_used='label'

linear=cm.linear.RdYlBu_06.scale(0, 1)

map2 = folium.Map(tiles='cartodbpositron', zoom_start=3 ,location=[48.499998 ,23.3833318])

# Bounds of the colouring column; constant over the loop, so computed once.
label_min = file[field_used].min()
label_span = file[field_used].max() - label_min

for index,row in file.iterrows():
    # Relative position of the label between the smallest and the largest one
    # (mid-scale if there is a single label, to avoid dividing by zero).
    relative_value = (row[field_used] - label_min) / label_span if label_span else 0.5
    marker_color = linear(1 - relative_value)
    folium.CircleMarker(location=[row['Latitude'],row['Longitude']],
                        radius=5,
                        popup=str(row['Ville'])+': '+str(row[field_used]),
                        line_color=None,
                        fill_color=marker_color,
                        fill_opacity=1,
                        color=marker_color).add_to(map2)
map2

In [None]:
""" We can see that proximity in term of distance is often related to proximity in terms of temperature, which is logic, but some points have the same climate and are very far away.
That means there is another factor that creates the climate. It could be the proximity with the ocean, or altitude
(naively : we know there are several types of climate, for example continental climate whose major caracteristic is the really high
amplitude. Let's try to study this amplitude ! )
"""
## We now cluster the cities on their temperature amplitude alone to find which ones are similar: a
# continental climate shows a large amplitude, so let's see whether the groups match the geographic position.


temp_amplitude=temperature[['Ville','Amplitude']]

# Feature matrix: the amplitude only, as a one-column DataFrame (sklearn expects 2-D input).
# Do NOT build it with Series.reset_index(): that also turns the row index into a column,
# and K-means would then cluster the cities on their row number as well.
temp_ampl_no_city=temperature[['Amplitude']]

# Silhouette score for each candidate number of clusters, kept so that it can be inspected.
range_n_clusters = [2, 3, 4, 5, 6,7,8,9,10]
ampl_silhouette_scores = {}
for n in range_n_clusters:
    kmeans_ampl = KMeans(n_clusters=n, n_init=10, random_state=42).fit(temp_ampl_no_city) # clustering with n clusters
    cluster_labels = kmeans_ampl.labels_ # cluster index of each city
    silhouette_avg = silhouette_score(temp_ampl_no_city, cluster_labels) # corresponding silhouette score
    ampl_silhouette_scores[n] = silhouette_avg

# NOTE(review): k=5 was originally picked from scores computed on features that also contained
# the row index; check ampl_silhouette_scores to confirm it is still the best choice.
kmeans_ampl = KMeans(n_clusters=5, n_init=10, random_state=42).fit(temp_ampl_no_city)
y=temp_amplitude.sort_values(by='Amplitude') # cities ordered by amplitude, for manual inspection

# Attach the amplitude cluster of every city and colour the map with it.
map_for_ampl = temperature.assign(cluster_labels=kmeans_ampl.labels_)
field_used='cluster_labels'
file=map_for_ampl
linear=cm.linear.RdYlBu_06.scale(0, 1)
map3 = folium.Map(tiles='cartodbpositron', zoom_start=3 ,location=[48.499998 ,23.3833318])

# Bounds of the colouring column; constant over the loop, so computed once.
label_min = file[field_used].min()
label_span = file[field_used].max() - label_min

for index,row in file.iterrows():
    # Relative position of the label between the smallest and the largest one
    # (mid-scale if there is a single label, to avoid dividing by zero).
    relative_value = (row[field_used] - label_min) / label_span if label_span else 0.5
    marker_color = linear(1 - relative_value)
    folium.CircleMarker(location=[row['Latitude'],row['Longitude']],
                        radius=5,
                        popup=str(row['Ville'])+': '+str(row[field_used]),
                        line_color=None,
                        fill_color=marker_color,
                        fill_opacity=1,
                        color=marker_color).add_to(map3)
map3


In [None]:
# The results are not simple to analyse: we cannot tell which type of climate a city belongs to without
# more data about the cities. We can however conjecture that cities further inland have a higher amplitude,
# whereas cities close to oceans / seas have more temperate climates, with some exceptions.


### For each of the 5 clusters obtained previously, let's see whether "seasons" can be drawn
# from the months, and how many seasons there would be.
group1, group2, group3, group4, group5 = (
    new_temp_with_labels[new_temp_with_labels['label'] == cluster_id]
    for cluster_id in range(5)
)


# K-means is now run on the months instead of the cities (focusing on group 2, this can be changed).

group2_month=group2.drop(['Ville','Moyenne','Amplitude','Latitude','Longitude','Region','label'],axis=1)

# Transpose so that each remaining column of group 2 becomes one row (one sample for K-means),
# and replace the resulting index by plain positions 0..n-1.
group2_month_rev=group2_month.T
group2_month_rev_clean=group2_month_rev.reset_index(drop=True)

# Silhouette score for each candidate number of seasons, kept so that it can be inspected.
range_n_clusters = [2, 3, 4, 5, 6,7,8,9,10]
season_silhouette_scores = {}
for n in range_n_clusters:
    kmeans_seasons = KMeans(n_clusters=n, n_init=10, random_state=42).fit(group2_month_rev_clean) # clustering with n clusters
    cluster_labels = kmeans_seasons.labels_ # cluster index of each row
    silhouette_avg = silhouette_score(group2_month_rev_clean, cluster_labels) # corresponding silhouette score
    season_silhouette_scores[n] = silhouette_avg


# For group 2, we should choose k=2 or k=5
# (NOTE(review): this was observed on one particular run; check season_silhouette_scores.)
# k=2
kmeans_seasons = KMeans(n_clusters=2,n_init=10,init='random',random_state=42).fit(group2_month_rev_clean)
print( " months that are together if there were two seasons : ", kmeans_seasons.labels_ )



#k=5
kmeans_seasons = KMeans(n_clusters=5,n_init=10,init='random',random_state=42).fit(group2_month_rev_clean)
print( " months that are together if there were five seasons : ", kmeans_seasons.labels_ )



In [None]:

# Hierarchical clustering of the cities

# Standardize the clustering features. scale() returns a numpy array, so the result is wrapped
# back into a DataFrame with the original column names and index.
new_temp_scaled = pd.DataFrame(
    scale(new_temp),
    columns=new_temp.columns,
    index=new_temp.index,
)

# Agglomerative clustering with the 'ward' linkage method.
Z = linkage(new_temp_scaled, 'ward')

# The dendrogram should show the city names rather than row numbers, so collect them in row order.
label_city = temperature['Ville'].tolist()

# Draw the tree horizontally: distances along x, one city per leaf along y.
dendrogram_fig, dendrogram_axes = plt.subplots(figsize=(7, 7))
dendrogram_axes.set_title('Hierarchical Clustering Dendrogram')
dendrogram_axes.set_ylabel('city')
dendrogram_axes.set_xlabel('distance')
dendrogram(
    Z,
    orientation='right',
    labels=label_city,
    ax=dendrogram_axes,
)
plt.show()

In [None]:
"""What could be intersteting would be to use the PCA method to see if we can run another kmeans that would be as performing,
but withdrawing useless parameters therefore having a faster and less resource-consuming algorithm"""

# Drop the columns at positions 0 and 17 (the city names in column 0; column 17 is presumably
# 'Region' -- confirm against the CSV header) before scaling.
dropped_positions = [0, 17]
new_temperature = temperature.drop(temperature.columns[dropped_positions], axis='columns')

# Standardize every remaining column.
from sklearn.preprocessing import StandardScaler
tempCPA = new_temperature.to_numpy()
std_scale = StandardScaler()
tempCPA_scaled = std_scale.fit_transform(tempCPA)

# Fit the PCA on the first 12 scaled columns and show how the explained variance accumulates
# with the number of components: once as a curve, once as numbers.
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(tempCPA_scaled[:, :12])
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance_ratio)
cumulative_variance_ratio

In [None]:
# Coordinates of every city on the principal components (same 12 columns the PCA was fitted on).
temp = pca.transform(tempCPA_scaled[:,:12])

# First factorial plane, each city coloured by its 'Moyenne' value.
plt.scatter(temp[:,0], temp[:,1], c=new_temperature['Moyenne'])
plt.xlabel('1st principal component')
plt.ylabel('2nd principal component')
plt.xlim(-6, 7)
plt.ylim(-3, 3)

# Label every point with the full city name: a single initial is ambiguous as soon as two
# cities start with the same letter. Names are taken by position, matching the rows of `temp`.
city_names = temperature["Ville"].tolist()
for i in range(temp.shape[0]):
    plt.text(temp[i,0], temp[i,1], city_names[i], fontsize=8)

In [None]:
# Correlation circle: one segment per input column, from the origin to that column's
# loadings on the first two principal components.
components = pca.components_
first_axis_loadings = components[0, :]
second_axis_loadings = components[1, :]
for i in range(len(first_axis_loadings)):
    x = first_axis_loadings[i]
    y = second_axis_loadings[i]
    plt.plot([0, x], [0, y], color='k')
    plt.text(x, y, new_temperature.columns[i])

# Dashed horizontal and vertical axes through the origin
plt.plot([-0.6, 0.6], [0, 0], color='grey', ls='--')
plt.plot([0, 0], [-0.6, 0.6], color='grey', ls='--')

plt.xlim(-0.5, 0.5)
plt.ylim(-0.5, 0.5);

In [None]:
# Based on the correlation circle and the plot of the individuals, we can characterize
# cities according to their temperature: cities on the right-hand side of the graph have
# higher temperatures than those on the left-hand side, since all the variables are positively correlated
# with axis 1. In addition, the variables for the winter months are positively correlated with axis 2,
# while the variables for the summer months are negatively correlated with axis 2. Thus the
# cities at the top of the graph have a mild winter but a cool summer, and cities at the bottom of the graph have
# a cold winter and a warm summer.
# Following this reasoning, Moscow is a city with low temperatures overall, very cold winters and
# hot summers, in contrast to Seville, where it seems to be hot all year round.



# One row of four panels per principal component: the cities' coordinate on that component
# against each of the columns that were not used to fit the PCA.
compared_columns = ['Latitude', 'Longitude', 'Moyenne', 'Amplitude']
for component_index, ordinal in enumerate(['1st', '2nd']):
    _, axes = plt.subplots(ncols=4, figsize=(16,4))
    for ax, col in zip(axes, compared_columns):
        ax.scatter(temp[:, component_index], temperature[col])
        ax.set_title(f'{ordinal} component vs {col}')

In [None]:
# Correlation circle again, this time with the PCA fitted on the first 16 scaled columns:
# the first 12 columns are drawn in black, the 4 additional ones in blue.
# NOTE(review): this rebinds `pca` and `components`; re-running the earlier projection cell
# after this one would call transform() on 12 columns with a model fitted on 16.
from sklearn.decomposition import PCA
pca = PCA().fit(tempCPA_scaled[:,:16])
components = pca.components_
for i, (x, y) in enumerate(zip(components[0, :16], components[1, :16])):
    segment_color = 'k' if i < 12 else 'b'
    plt.plot([0, x], [0, y], color=segment_color)
    plt.text(x, y, new_temperature.columns[i])

# Dashed horizontal and vertical axes through the origin
plt.plot([-0.6, 0.6], [0, 0], color='grey', ls='--')
plt.plot([0, 0], [-0.6, 0.6], color='grey', ls='--')

plt.xlim(-0.5, 0.5)
plt.ylim(-0.5, 0.5);

In [None]:
# The mean is highly correlated with component 1; one could even say that component 1 is the mean,
# because the correlation is close to 1.
# Moreover, amplitude and longitude are negatively correlated with component 2,
# so the cities at the bottom of the graph, like Kiev, have a strong annual amplitude,
# whereas the cities at the top of the graph, like Dublin, have a low thermal amplitude.



# As expected, the results of the PCA are consistent with those of the k-means: we haven't "lost" any useful information.