In [18]:
# Importing Required Libraries

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import json
from datetime import datetime
import collections 
import sklearn.metrics as sm
%matplotlib inline

# Step 1 - Load Data

In [19]:
# Json to CSV

jsonData = []
fileName = "yelp_academic_dataset_user.json"
count=0
with open(fileName,  encoding="utf8") as file:    
    for line in file:
        jsonData.append(json.loads(line.rstrip()))
        count+=1
        if count==100000:break

df = pd.DataFrame.from_dict(jsonData)

# Saving in .csv format
csvFileName = fileName[:len(fileName)-5] + '.csv'

df.to_csv(csvFileName)
print('{0} created'.format(csvFileName))

#Loading CSV File
df = pd.read_csv(r"yelp_academic_dataset_user.csv")

yelp_academic_dataset_user.csv created


# Step 2 - Preprocess the data

In [20]:
# Prepare Data

# Dropping index column
df = df.drop(df.iloc[:,:1],axis=1) 

# Fill Missing Values
df = df.fillna({"elite":""})

# Drop not useful columns
df = df.drop(["name","user_id"],axis=1)

df.head()

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,elite,fans,friends,funny,review_count,useful,yelping_since
0,4.03,1,0,1,2,0,0,1,0,1,0,2,25,201520162017.0,5,"c78V-rj8NQcQjOI8KP3UEA, alRMgPcngYSCJ5naFRBz5g...",17,95,84,2013-10-08 23:11:33
1,3.63,1,0,1,1,0,0,0,0,0,0,0,16,,4,"kEBTgDvFX754S68FllfCaA, aB2DynOxNOJK9st2ZeGTPg...",22,33,48,2013-02-21 22:29:06
2,3.71,0,0,0,0,0,0,1,0,0,0,0,10,,0,"4N-HU_T32hLENLntsNKNBg, pSY2vwWLgWfGVAAiKQzMng...",8,16,28,2013-10-04 00:16:10
3,4.85,0,0,0,1,0,0,0,0,2,0,1,14,,5,"RZ6wS38wnlXyj-OOdTzBxA, l5jxZh1KsgI8rMunm-GN6A...",4,17,30,2014-05-22 15:57:30
4,4.08,80,0,80,28,1,1,16,5,57,0,25,665,2015201620172018.0,39,"mbwrZ-RS76V1HoJ0bF_Geg, g64lOV39xSLRZO0aQQ6DeQ...",279,361,1114,2013-10-23 07:02:50


In [21]:
#Data conversion

#Convert date time of yelping_since to days elapsed
now= datetime.now()
date_now = now.strftime("%Y-%m-%d %H:%M:%S")
date_now = pd.to_datetime(date_now)
df['yelping_since'] = df['yelping_since'].apply(lambda x: pd.to_datetime(x))
df['yelping_since'] = df['yelping_since'].apply(lambda x: (date_now - x).days)


#Converting elite to years of being an elite member
df["elite"] = df["elite"].apply(lambda x: len(x.split(",")))

# Converting friends to number of friends
df["friends"] = df["friends"].apply(lambda x: len(x.split(",")))


# Applying log to all compliment_columns as they are skewed
df[["compliment_cool","compliment_cute","compliment_funny","compliment_hot","compliment_list","compliment_more","compliment_note","compliment_photos","compliment_plain","compliment_profile","compliment_writer"]] = df[["compliment_cool","compliment_cute","compliment_funny","compliment_hot","compliment_list","compliment_more","compliment_note","compliment_photos","compliment_plain","compliment_profile","compliment_writer"]].replace(0,0.001)
df[["compliment_cool","compliment_cute","compliment_funny","compliment_hot","compliment_list","compliment_more","compliment_note","compliment_photos","compliment_plain","compliment_profile","compliment_writer"]] = np.log(df[["compliment_cool","compliment_cute","compliment_funny","compliment_hot","compliment_list","compliment_more","compliment_note","compliment_photos","compliment_plain","compliment_profile","compliment_writer"]])


df.head()


Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,elite,fans,friends,funny,review_count,useful,yelping_since
0,4.03,0.0,-6.907755,0.0,0.693147,-6.907755,-6.907755,0.0,-6.907755,0.0,-6.907755,0.693147,25,3,5,99,17,95,84,2217
1,3.63,0.0,-6.907755,0.0,0.0,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,16,1,4,1152,22,33,48,2446
2,3.71,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,0.0,-6.907755,-6.907755,-6.907755,-6.907755,10,1,0,15,8,16,28,2222
3,4.85,-6.907755,-6.907755,-6.907755,0.0,-6.907755,-6.907755,-6.907755,-6.907755,0.693147,-6.907755,0.0,14,1,5,525,4,17,30,1991
4,4.08,4.382027,-6.907755,4.382027,3.332205,0.0,0.0,2.772589,1.609438,4.043051,-6.907755,3.218876,665,4,39,231,279,361,1114,2203


In [22]:
# Check the ranges now

df.describe()

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,elite,fans,friends,funny,review_count,useful,yelping_since
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,3.798689,-3.762603,-6.089045,-3.762603,-4.435585,-6.462168,-5.00048,-3.440197,-5.152945,-3.093121,-5.943684,-4.239514,133.01009,1.5016,7.14677,111.70364,104.43967,82.55847,211.54633,2594.26531
std,0.733975,4.25068,2.419811,4.25068,3.946402,1.841567,3.386685,4.13423,3.414022,4.27063,2.59464,3.95656,1149.299821,1.449441,38.507842,285.776341,941.570493,201.292528,1353.683818,906.50035
min,1.0,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,0.0,1.0,0.0,1.0,0.0,0.0,0.0,532.0
25%,3.45,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,1.0,1.0,0.0,8.0,1.0,9.0,6.0,1897.0
50%,3.86,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,-6.907755,6.0,1.0,1.0,37.0,5.0,24.0,23.0,2586.0
75%,4.25,0.693147,-6.907755,0.693147,0.0,-6.907755,-6.907755,0.693147,-6.907755,0.693147,-6.907755,0.0,27.0,1.0,3.0,113.0,23.0,71.0,85.0,3220.0
max,5.0,9.870344,7.720462,9.870344,10.439016,7.723562,8.18172,8.933137,10.572726,9.608311,8.641179,9.028938,83915.0,13.0,2964.0,9564.0,86122.0,12390.0,89792.0,5501.0


# Step 3 - Standardize the data

In [23]:
# Standardize Data

# scaler = MinMaxScaler()
scaler = StandardScaler()
df[["average_stars","compliment_cool","compliment_cute","compliment_funny","compliment_hot","compliment_list",
    "compliment_more","compliment_note","compliment_photos","compliment_plain","compliment_profile","compliment_writer",
    "cool","fans","funny","review_count","useful","yelping_since","elite","friends"]] = scaler.fit_transform(df[["average_stars","compliment_cool","compliment_cute","compliment_funny","compliment_hot","compliment_list",
    "compliment_more","compliment_note","compliment_photos","compliment_plain","compliment_profile","compliment_writer",
    "cool","fans","funny","review_count","useful","yelping_since","elite","friends"]])


df.head()


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,elite,fans,friends,funny,review_count,useful,yelping_since
0,0.315149,0.885181,-0.338338,0.885181,1.299604,-0.241962,-0.563172,0.832129,-0.514003,0.724281,-0.371565,1.246711,-0.09398,1.033783,-0.055749,-0.044453,-0.092866,0.061809,-0.094222,-0.41618
1,-0.229831,0.885181,-0.338338,0.885181,1.123962,-0.241962,-0.563172,-0.838748,-0.514003,-0.89323,-0.371565,-0.674388,-0.10181,-0.346066,-0.081718,3.640265,-0.087556,-0.246202,-0.120816,-0.163559
2,-0.120835,-0.739921,-0.338338,-0.739921,-0.62644,-0.241962,-0.563172,0.832129,-0.514003,-0.89323,-0.371565,-0.674388,-0.107031,-0.346066,-0.185594,-0.338391,-0.102425,-0.330657,-0.135591,-0.410664
3,1.432358,-0.739921,-0.338338,-0.739921,1.123962,-0.241962,-0.563172,-0.838748,-0.514003,0.886588,-0.371565,1.071521,-0.103551,-0.346066,-0.055749,1.44623,-0.106673,-0.325689,-0.134113,-0.665492
4,0.383272,1.916086,-0.338338,1.916086,1.968332,3.509078,1.476519,1.502775,1.980777,1.670997,-0.371565,1.885079,0.462884,1.723708,0.827192,0.417449,0.185394,1.383275,0.666668,-0.431624


# Step 4 - Reduce the dimensions -using PCA

In [24]:
# Calculate Covariance Matrix
cleaned_df = df
cleaned_df.cov()

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,elite,fans,friends,funny,review_count,useful,yelping_since
average_stars,1.00001,0.018173,0.006056,0.018173,0.024623,-0.004484,-0.014816,-0.058045,0.030607,-0.013144,0.003711,-0.001061,0.005353,0.008906,0.010585,0.044003,-0.000784,-0.008103,-0.000856,-0.023931
compliment_cool,0.018173,1.00001,0.513582,1.00001,0.723735,0.441347,0.636486,0.661493,0.638295,0.669553,0.566681,0.714647,0.248623,0.575408,0.339518,0.32665,0.23604,0.511214,0.295842,0.414976
compliment_cute,0.006056,0.513582,1.00001,0.513582,0.563095,0.560584,0.513253,0.458436,0.544833,0.460831,0.584288,0.506172,0.338487,0.540088,0.4384,0.412731,0.326561,0.507083,0.37934,0.325459
compliment_funny,0.018173,1.00001,0.513582,1.00001,0.723735,0.441347,0.636486,0.661493,0.638295,0.669553,0.566681,0.714647,0.248623,0.575408,0.339518,0.32665,0.23604,0.511214,0.295842,0.414976
compliment_hot,0.024623,0.723735,0.563095,0.723735,1.00001,0.478488,0.637864,0.630189,0.648651,0.635798,0.595944,0.699583,0.27032,0.606999,0.366612,0.356103,0.25703,0.519419,0.31755,0.367488
compliment_list,-0.004484,0.441347,0.560584,0.441347,0.478488,1.00001,0.46767,0.395894,0.463668,0.397642,0.568107,0.447547,0.396847,0.569277,0.476958,0.418557,0.386975,0.551108,0.439964,0.296751
compliment_more,-0.014816,0.636486,0.513253,0.636486,0.637864,0.46767,1.00001,0.579669,0.56485,0.579467,0.572309,0.637434,0.263554,0.570757,0.351748,0.326518,0.25277,0.509803,0.310704,0.38168
compliment_note,-0.058045,0.661493,0.458436,0.661493,0.630189,0.395894,0.579669,1.00001,0.569222,0.64335,0.507532,0.647583,0.223339,0.511177,0.307462,0.28824,0.213842,0.478787,0.269567,0.41252
compliment_photos,0.030607,0.638295,0.544833,0.638295,0.648651,0.463668,0.56485,0.569222,1.00001,0.578087,0.567878,0.624754,0.302489,0.585667,0.399193,0.389176,0.284115,0.532498,0.347945,0.255931
compliment_plain,-0.013144,0.669553,0.460831,0.669553,0.635798,0.397642,0.579467,0.64335,0.578087,1.00001,0.509093,0.65508,0.227987,0.512382,0.312549,0.29954,0.217388,0.48165,0.273895,0.390164


In [26]:
# Apply PCA to reduce dimensions based on covariance and understanding of the matrix

df1 = df[['cool','funny','useful']]
pca = PCA(n_components=1)
principalComponents = pca.fit_transform(df1)
principalDf_cool_funny_useful = pd.DataFrame(data = principalComponents, columns = ['cool_funny_useful'])
#print(pca.explained_variance_ratio_)

df2 = df[['compliment_cool','compliment_cute','compliment_funny','compliment_hot','compliment_list','compliment_more','compliment_note','compliment_photos','compliment_plain','compliment_profile','compliment_writer']]   
pca2 = PCA(n_components=1)
principalComponents2 = pca2.fit_transform(df2)
principalDf_compliment_features = pd.DataFrame(data = principalComponents2, columns = ['compliment_features'])
#print(pca2.explained_variance_ratio_)

df3 = df[['elite','review_count']]   
pca3 = PCA(n_components=1)
principalComponents3 = pca3.fit_transform(df3)
principalDf_elite_review_count = pd.DataFrame(data = principalComponents3, columns = ['elite_reviewCount'])
#print(pca3.explained_variance_ratio_)



In [27]:
# Combining features based on results from PCA

df = df.drop(["cool","funny","useful"],axis=1)
df["review_feedback_factor"] = principalDf_cool_funny_useful

df = df.drop(['compliment_cool','compliment_cute','compliment_funny','compliment_hot','compliment_list','compliment_more','compliment_note','compliment_photos','compliment_plain','compliment_profile','compliment_writer'],axis=1)
df['compliment_feedback'] = principalDf_compliment_features

df = df.drop(["elite","review_count"],axis=1)
df["elite_reviewCount"] = principalDf_elite_review_count


In [28]:
df.head()

Unnamed: 0,average_stars,fans,friends,yelping_since,review_feedback_factor,compliment_feedback,elite_reviewCount
0,0.315149,-0.055749,-0.044453,-0.41618,-0.162276,1.297267,0.7747
1,-0.229831,-0.081718,3.640265,-0.163559,-0.179115,-0.351366,-0.418797
2,-0.120835,-0.185594,-0.338391,-0.410664,-0.199221,-1.511711,-0.478516
3,1.432358,-0.055749,1.44623,-0.665492,-0.198793,-0.354125,-0.475003
4,0.383272,0.827192,0.417449,-0.431624,0.759812,5.151737,2.196969


# Step 5 - Applying Clustering on the data

In [29]:
# Cluster the data using K-Means 

km = KMeans(init='k-means++', n_clusters=5, n_init=300, random_state=10) 
km = km.fit(df)


In [30]:
centroids = km.cluster_centers_
centroids

array([[ 9.00673624e-03, -2.77096899e-02, -7.22310670e-03,
         4.68612298e-01, -9.05932250e-02,  2.14016170e+00,
         1.82747824e-01],
       [ 1.84296390e-03,  9.18768784e-01,  9.19615165e-01,
         1.02116017e+00,  9.57114071e-01,  6.22166063e+00,
         2.94172507e+00],
       [-3.69213506e-03, -1.68542994e-01, -1.67477157e-01,
        -2.74232979e-01, -2.06074356e-01, -1.46390613e+00,
        -4.49165879e-01],
       [ 7.46805866e-02,  7.93000747e+00,  7.71957045e+00,
         1.17127968e+00,  1.21530516e+01,  9.70172282e+00,
         7.03239229e+00],
       [ 8.41122809e-02,  2.29244872e+01,  1.38264544e+01,
         1.21544799e+00,  5.57563689e+01,  1.13220221e+01,
         1.11877351e+01]])

In [39]:
#For getting insights from clusters

# print(centroids[0])
# print("max", max(centroids[0]))
# print("min", min(centroids[0]))
# print(sorted(centroids[0]))
# print("----------------")
# print(centroids[1])
# print("max", max(centroids[1]))
# print("min", min(centroids[1]))
# print(sorted(centroids[1]))
# print("----------------")
# print(centroids[2])
# print("max", max(centroids[2]))
# print("min", min(centroids[2]))
# print(sorted(centroids[2]))
# print("----------------")
# print(centroids[3])
# print("max", max(centroids[3]))
# print("min", min(centroids[3]))
# print(sorted(centroids[3]))
# print("----------------")
# print(centroids[4])
# print("max", max(centroids[4]))
# print("min", min(centroids[4]))
# print(sorted(centroids[4]))
# print("----------------")

# avg_stars	--fans	friends-	yelping_since	cool_funny_useful	compliment_feautures	elite_reviewCount

In [33]:
km.predict(df)

array([0, 2, 2, ..., 2, 2, 2])

In [35]:
# kmeans = KMeans(init='k-means++', n_clusters=5, n_init=300, random_state=10) 
# km = kmeans.fit_predict(df)

In [36]:
# Cluster percentages

# print(len(km[km==0]))
# print(len(km[km==1]))
# print(len(km[km==2]))
# print(len(km[km==3]))
# print(len(km[km==4]))

22185
7824
69433
511
47


# Step 6 - Understanding clusters and findings

## Cluster Percentages:

* Cluster 0 ~ 22.2 % 
* Cluster 1 ~ 7.8 %
* Cluster 2 ~ 69.4 %
* Cluster 3 ~ 0.5 %
* Cluster 4 ~ 0.05%


## Understanding based on centroids: 

* Users from long time: From cluster 0 - Approx 22 % of users are very well complimented and are members of yelp from quite a good time but their reviews were not much useful for customers

* High star rated and less complimented users: From cluster 2 - 69% of users have high star of reviews comparably and who have large number of friends, fans but received less compliments and are only little helpful

* From other clusters:

* Moderately helpful: More than 7% of users are highly complimented,low star rating, received decent number of reviews are  little helpful for customers 

* Highly Useful : Less than 1% of users have received high compliments and are extremely helpful for customers who also have good number of friends as well as fans


## Summary : 

* Users who received high compliments are only little helpful for customers
* Majority of users are not helpul for other customers
* Users who received high compliments have good number of friends and fans
* Only 1 % of users were very helpful to other customers and also they have good number of friends and followers
* Users who are helpul have good number of friends and fans
* Users who received good number of compliments were elite members for decent amount of years and have decent number of      reviews. (decent= average)
* Users who are yelping from long time received more compliments than other users
* Users who have high review feedback score but low star rating are bit helpful to customers

# Methods used to find k - number of clusters (Optional)

In [None]:
# Elbow Method

# def calculate_WSS(points, kmax):
#   sse = []
#   for k in range(1, kmax+1):
#     kmeans = KMeans(n_clusters = k, n_init=200).fit(points)
#     centroids = kmeans.cluster_centers_
#     pred_clusters = kmeans.predict(points)
#     curr_sse = 0
    
#     # calculate square of Euclidean distance of each point from its cluster center and add to current WSS
#     for i in range(len(points)):
#       curr_center = centroids[pred_clusters[i]]
#       curr_sse += (points[i, 0] - curr_center[0]) ** 2 + (points[i, 1] - curr_center[1]) ** 2
      
#     sse.append(curr_sse)
#   return sse

In [None]:
# elbow = calculate_WSS(df.values,20)
# elbow

In [None]:
# fig,ax = plt.subplots(1)

# # create some x data and some integers for the y axis
# x = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])
# y = elbow
# # plot the data
# ax.plot(x,y)

In [None]:
# Silhoutte_Score

# from sklearn.metrics import silhouette_score

# sil = []
# kmax = 15

# # dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
# for k in range(2, kmax+1):
#     kmeans = KMeans(n_clusters = k).fit(df)
#     labels = kmeans.labels_        
#     sil.append(silhouette_score(df, labels, metric = 'euclidean'))
# sil

In [None]:
# fig,ax = plt.subplots(1)

# # create some x data and some integers for the y axis
# x = np.array([2,3,4,5,6,7,8,9,10,11,12,13,14,15])
# y = sil
# # plot the data
# ax.plot(x,y)