# Sincere_clustering 

## A) Importing Libraries and defining functions

In [None]:
# %pip install scikit-fuzzy

In [None]:
# %pip install yellowbrick

In [None]:
#Basic imports
import pandas as pd
import numpy as np

#Fuzzy model imports

import skfuzzy as fuzz
from fcmeans import FCM

#Data Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import scipy.cluster.hierarchy as shc
from scipy.cluster.hierarchy import dendrogram, linkage

from yellowbrick.cluster import KElbowVisualizer

#Sci-kit learn imports
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import metrics
from sklearn.metrics import davies_bouldin_score, silhouette_score

# Blocking warnings
import warnings
warnings.filterwarnings('ignore')

: 

In [None]:
# Elbow method for K-Means using the Yellowbrick library
def Yelbow(df, k=(2, 21), random_state=8):
    """Fit and render Yellowbrick's elbow plot for K-Means on ``df``.

    Parameters
    ----------
    df : array-like
        Data handed to ``KElbowVisualizer.fit``.
    k : tuple or int, optional
        Range of cluster counts, forwarded unchanged to ``KElbowVisualizer``.
        Defaults to the ``(2, 21)`` range this notebook has always used.
    random_state : int or None, optional
        Seed given to the ``KMeans`` estimator so repeated runs draw the
        same curve. Defaults to 8.

    Returns
    -------
    None
        The figure is rendered as a side effect of ``visualizer.show()``.
    """
    # The seed must live on the estimator: the original passed random_state to
    # KElbowVisualizer, which leaves KMeans() itself unseeded.
    # NOTE(review): confirm against the installed Yellowbrick version that the
    # visualizer does not forward extra keyword arguments to the estimator.
    model = KMeans(random_state=random_state)
    visualizer = KElbowVisualizer(model, k=k, timings=True)
    visualizer.fit(df)   # fit the data to the visualizer
    visualizer.show()    # finalize and render the figure

# Silhouette score for K-Means using the Yellowbrick library
def Ysilhouette(df, k=(2, 21), random_state=8):
    """Fit and render Yellowbrick's K-selection plot using the silhouette metric.

    Parameters
    ----------
    df : array-like
        Data handed to ``KElbowVisualizer.fit``.
    k : tuple or int, optional
        Range of cluster counts, forwarded unchanged to ``KElbowVisualizer``.
        Defaults to the ``(2, 21)`` range this notebook has always used.
    random_state : int or None, optional
        Seed given to the ``KMeans`` estimator so repeated runs draw the
        same curve. Defaults to 8.

    Returns
    -------
    None
        The figure is rendered as a side effect of ``visualizer.show()``.
    """
    # The seed must live on the estimator: the original passed random_state to
    # KElbowVisualizer, which leaves KMeans() itself unseeded.
    # NOTE(review): confirm against the installed Yellowbrick version that the
    # visualizer does not forward extra keyword arguments to the estimator.
    model = KMeans(random_state=random_state)
    visualizer = KElbowVisualizer(model, k=k, metric='silhouette', timings=True)
    visualizer.fit(df)   # fit the data to the visualizer
    visualizer.show()    # finalize and render the figure

# Davies-Bouldin score (scikit-learn) of a K-Means clustering
def DaviesBouldin(data, center, random_state=8):
    """Cluster ``data`` with K-Means and return its Davies-Bouldin score.

    Parameters
    ----------
    data : array-like
        Samples to cluster; also used to compute the score.
    center : int
        Number of clusters (``n_clusters``) for K-Means.
    random_state : int or None, optional
        Seed for the K-Means initialisation. The original function left the
        estimator unseeded, so the score changed between runs; the default
        of 8 matches the seed used elsewhere in this notebook.

    Returns
    -------
    float
        Value returned by ``sklearn.metrics.davies_bouldin_score``.
    """
    # fit_predict returns the cluster label of every sample (not a model object)
    labels = KMeans(n_clusters=center, random_state=random_state).fit_predict(data)

    return davies_bouldin_score(data, labels)

# Dendrogram visualization
def VizDendrogram(df):
    """Draw a dendrogram of ``df`` built from a Ward linkage.

    A new 10x7 matplotlib figure is opened and titled before the dendrogram
    is drawn on it. Nothing is returned.
    """
    plt.figure(figsize=(10, 7))
    plt.title("Dendrograms df_merged_kmeans_sample")

    # Hierarchical clustering encoded as a linkage matrix, then plotted
    ward_linkage = shc.linkage(df, method='ward')
    shc.dendrogram(ward_linkage)

## B) Performing items clustering using K-Means

In [None]:
# Load the movie-level dataset from CSV
df_movies_4kmeans = pd.read_csv('df_movies_4kmeans.csv')

# Columns that are removed before clustering
columns_to_drop = [
    'movieId',
    'age_movie_when_rated',
    'time_of_day_1', 'time_of_day_2', 'time_of_day_3', 'time_of_day_4',
    'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3',
    'weekday_4', 'weekday_5', 'weekday_6',
    'title', 'imdbId', 'Year', 'imdbVotes',
]
df_movies_4kmeans_filtered = df_movies_4kmeans.drop(columns=columns_to_drop)

### 1) Creating a sample to search for the optimal K

In [None]:
# Summary statistics of the full feature table, with the count, min,
# 25%, 75% and max rows removed from the describe() output
unused_stat_rows = ['count', 'min', '25%', '75%', 'max']
df_merged_kmeans_stats = df_movies_4kmeans_filtered.describe().drop(index=unused_stat_rows)
df_merged_kmeans_stats

In [None]:
# Draw a seeded 30% random sample of the feature table
df_merged_kmeans_sample = df_movies_4kmeans_filtered.sample(frac=0.30, random_state=8)

# Summary statistics of the sample, with the count, min, 25%, 75% and max
# rows removed from the describe() output
df_merged_kmeans_sample_stats = df_merged_kmeans_sample.describe().drop(
    index=['count', 'min', '25%', '75%', 'max']
)
df_merged_kmeans_sample_stats

### 2) Looking for the best k

In [None]:
# Adjusting the scale of the sample dataset with StandardScaler.
# The name is rebound to the array returned by fit_transform.
sample_scaler = StandardScaler()
df_merged_kmeans_sample = sample_scaler.fit_transform(df_merged_kmeans_sample)

#### Silhouette Score

In [None]:
# Plot the silhouette metric over the candidate K values for the scaled sample
Ysilhouette(df_merged_kmeans_sample)

#### Elbow Method

In [None]:
# Plot the elbow curve over the candidate K values for the scaled sample
Yelbow(df_merged_kmeans_sample)

#### Davies Bouldin

In [None]:
# Davies-Bouldin score of a K-Means fit for every candidate K in 2..29
centers = list(range(2, 30))
scores = [DaviesBouldin(df_merged_kmeans_sample, n_clusters) for n_clusters in centers]

# Score as a function of K
plt.plot(centers, scores, color='b', marker='o', linestyle='--')
plt.title('Davies Bouldin score vs. K')
plt.xlabel('K')
plt.ylabel('Davies Bouldin score');

#### Dendrogram

In [None]:
# Draw the Ward-linkage dendrogram of the scaled sample
VizDendrogram(df_merged_kmeans_sample)

#### Regarding the selection of the number of clusters: based on the metric plots above and a heuristic judgement of the optimal amount of clusters, K was set to 11 to group items, in order to best address the sincere results.

### 3) Clustering items with the full database

In [None]:
# Adjusting variable scale
df_kmeans_cluster_full = StandardScaler().fit_transform(df_movies_4kmeans_filtered)

# Running K-Means considering 11 centroids
kmeans = KMeans(n_clusters=11, init='k-means++', n_init=100, max_iter=5000, random_state=8).fit(df_kmeans_cluster_full)

# Attach the cluster labels as the first column.
# - Any 'movie_cluster' column left by a previous run of this cell is dropped
#   first; DataFrame.insert raises ValueError when the column already exists.
# - reset_index returns a new frame, so its result has to be assigned (the
#   original call discarded it and had no effect).
df_movies_4kmeans = (
    df_movies_4kmeans
    .drop(columns='movie_cluster', errors='ignore')
    .reset_index(drop=True)
)
df_movies_4kmeans.insert(0, 'movie_cluster', kmeans.labels_)

# Saving the complete table, including the cluster labels, to csv
df_movies_4kmeans.to_csv('df_movies_clustered.csv', index=False)


In [None]:
# checking results: preview the first two rows, which now carry the movie_cluster column
df_movies_4kmeans.head(2)

### 4) Generating input database for Fuzzy C-Means 

In [None]:
# Movie -> cluster lookup taken from the K-Means result
df_kmeans_results = df_movies_4kmeans[['movieId', 'movie_cluster']]

# User ratings, restricted to the three columns needed here
df_user_ratings = pd.read_csv('df_ratings_by_time.csv')[['userId', 'movieId', 'rating']]

# Preliminary df_sincere dataset: every rating tagged with its movie's cluster
# (left join, so ratings are kept even when the movie has no cluster row)
df_sincere = pd.merge(df_user_ratings, df_kmeans_results, how='left', on='movieId')
df_sincere.to_csv('df_sincere_database.csv', index=False)

# Input for fuzzy c-means: one-hot encode the movie cluster, then sum per user
df_sincere_grouped = (
    pd.get_dummies(df_sincere, columns=['movie_cluster'])
    .drop(columns='movieId')
    .groupby('userId')
    .sum()
    .reset_index()
)
# Only the per-cluster columns are used as clustering features
df_fuzzy_cmeans_input = df_sincere_grouped.drop(columns=['userId', 'rating'])

# Storing the fuzzy c-means input in a csv file
df_fuzzy_cmeans_input.to_csv('df_users_4clustering.csv', index=False)

# Checking results
df_fuzzy_cmeans_input

## C) Clustering users using fuzzy c-means

In [None]:
# shortcut | reading from stored intermediate results
#df_fuzzy_cmeans_input = pd.read_csv('df_users_4clustering.csv')

In [None]:
# Standardise the fuzzy c-means input with StandardScaler
user_scaler = StandardScaler()
df_fuzzy_cmeans = user_scaler.fit_transform(df_fuzzy_cmeans_input)

In [None]:
# Finding the best number of clusters: print the fpc value returned by
# fuzz.cmeans for every cluster count from 2 to 9.
# NOTE(review): error=0.5 is the value this notebook has always used (it was
# spelled 000.5); confirm that a smaller tolerance such as 0.005 was not intended.
for n_clusters in range(2, 10):
    fuzzycmeans = fuzz.cmeans(
        df_fuzzy_cmeans.T,
        c=n_clusters,
        m=1.7,
        error=0.5,
        maxiter=5000,
        init=None,
        seed=1,
    )
    cntr, u, u0, d, jm, p, fpc = fuzzycmeans
    print(n_clusters, 'clusters fpc score:', fpc)

In [None]:
# Fitting the model with 8 clusters (same settings as the cluster-count search)
fuzzycmeans = fuzz.cmeans(
    df_fuzzy_cmeans.T,
    c=8,
    m=1.7,
    error=0.5,
    maxiter=5000,
    init=None,
    seed=1,
)
cntr, u, u0, d, jm, p, fpc = fuzzycmeans

# Wrap the transposed result matrix u in a DataFrame and preview two rows
df_cmeans_full = pd.DataFrame(u.transpose())
df_cmeans_full.head(2)

In [None]:
# Converting soft clustering to hard clustering: label each user with the
# cluster whose value in u is the largest (argmax over axis 0)
cmeanslabels = np.argmax(u, axis=0)

# Insert the labels on a copy of the grouped table. The original bound the
# same object under a second name, so insert() also modified
# df_sincere_grouped and raised ValueError ("already exists") on a re-run.
df_fuzzy_cmeans_output = df_sincere_grouped.copy()
df_fuzzy_cmeans_output.insert(0, 'user_cluster', cmeanslabels)
df_fuzzy_cmeans_output.head()

In [None]:
# Store the labelled user table and the fuzzy c-means result matrix as csv files
csv_exports = (
    (df_fuzzy_cmeans_output, 'df_users_clustered.csv'),
    (df_cmeans_full, 'df_users_fuzzyCmeans_matrix_results.csv'),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

## D) Sincere database

In [None]:
# shortcut | reading from stored intermediate results
#df_sincere = pd.read_csv('df_sincere_database.csv')
#df_fuzzy_cmeans_output = pd.read_csv('df_users_clustered.csv')

# Updating the sincere database: attach each user's cluster to the ratings
df_fuzzy_results = df_fuzzy_cmeans_output[['userId', 'user_cluster']]
df_sincere = (
    df_sincere
    # A user_cluster column may already be present (cell re-run, or df_sincere
    # reloaded from the csv written below); merging on top of it would create
    # user_cluster_x / user_cluster_y instead of a single column.
    .drop(columns='user_cluster', errors='ignore')
    .merge(df_fuzzy_results, on='userId', how='inner')
    # Rows with any missing value are removed
    .dropna()
)
df_sincere.to_csv('df_sincere_database.csv', index=False)

# Checking results
df_sincere