In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read both input CSVs, in this order: `data_df` first, then `df`.
data_df, df = map(
    pd.read_csv,
    ('./fake_data_with_cluster.csv', './fake_user_cluster.csv'),
)

In [3]:
# Remove the columns that the following cells never read, then replace every
# missing value with the string "[]" so list-like cells can be parsed uniformly.
df = (
    df
    .drop(columns=[
        'Unnamed: 0', 'phone_number', 'Farmertype', 'district',
        'Own_animal_large', 'Own_animal_small', 'Own_poultry',
    ])
    .fillna("[]")
)

In [4]:
def convert_to_list(x):
    """Parse the string form of a list, e.g. ``"['a', 'b']"``, into a list of strings.

    Surrounding square brackets and all single quotes are removed, the remainder
    is split on commas, and each token is stripped of whitespace.

    Empty tokens are discarded wherever they occur, so ``"[]"`` and ``""`` give
    ``[]``, ``"['a', ]"`` gives ``['a']`` and ``"['', 'a']"`` gives ``['a']``.
    (Previously only the first token was checked, which either dropped the whole
    list or let ``''`` through as a value.)

    Args:
        x: string to parse.

    Returns:
        list of non-empty, whitespace-stripped string tokens.
    """
    inner = x.strip('][').replace("'", "")
    stripped_tokens = (token.strip() for token in inner.split(','))
    return [token for token in stripped_tokens if token]

def find_uniques(lst):
    """Collect the distinct tokens found across an iterable of list-like strings.

    Each element of ``lst`` is parsed with ``convert_to_list`` and all resulting
    tokens are merged into a single set.

    Args:
        lst: iterable of strings accepted by ``convert_to_list``.

    Returns:
        set of every token that appears in at least one element.
    """
    return {token for raw in lst for token in convert_to_list(raw)}

In [5]:
# Distinct tokens appearing anywhere in each of the two list-like columns.
# find_uniques only iterates its argument, so the Series is passed directly.
activities = find_uniques(df['Activities'])
khareef_crops = find_uniques(df['kharif_crops'])

In [6]:
# Activities first, then crops. Each group is sorted because iterating a set of
# strings can yield a different order on every kernel start (string hashes are
# randomised per process), while later cells address feature columns by position.
all_features = sorted(activities) + sorted(khareef_crops)
# Column position of each feature in the indicator matrix.
feat_to_index = {f: i for i, f in enumerate(all_features)}

In [7]:
# Zero-initialised matrix with one row per row of `df` and one column per feature.
user_data = np.zeros(shape=(len(df.index), len(all_features)))

In [8]:
# Mark with 1 every feature (activity or kharif crop) listed on each row of `df`.
# Rows are addressed by position via enumerate(): iterrows() yields index *labels*,
# which only coincide with array positions when the index is exactly 0..n-1.
# Iterating the two columns also avoids building a Series object per row.
for row_pos, (act_raw, khr_raw) in enumerate(zip(df['Activities'], df['kharif_crops'])):
    for itm in convert_to_list(act_raw) + convert_to_list(khr_raw):
        user_data[row_pos, feat_to_index[itm]] = 1

In [9]:
# Put the cluster label in front of the indicator matrix, wrap the result in a
# DataFrame, and add a constant column so that summing rows counts farmers.
all_columns = ['cluster', *all_features]
data = np.hstack([df['cluster'].to_numpy()[:, np.newaxis], user_data])
new_df = pd.DataFrame(data, columns=all_columns).assign(num_farmer=1)

In [10]:
# Column-wise totals per cluster: each feature column becomes the number of rows
# in the cluster with that feature set, and `num_farmer` becomes the row count.
user_cluster = new_df.groupby(by=['cluster']).sum()

In [11]:
# For every kharif crop column, derive two percentages:
#   <crop>_percent : percentage of the cluster's farmers who grow the crop
#   <crop>         : the cluster's percentage of all farmers who grow the crop
#
# Crop columns are selected by name rather than by a fixed positional slice, so the
# result does not depend on how many activity columns precede them. A crop that
# already has its `_percent` column is skipped, which keeps the cell safe to re-run
# (a second pass would otherwise recompute from the already-normalised values).
crop_columns = [
    col for col in user_cluster.columns
    if col in khareef_crops and f'{col}_percent' not in user_cluster.columns
]
for col in crop_columns:
    crop_counts = user_cluster[col]
    user_cluster[f'{col}_percent'] = crop_counts * 100 / user_cluster['num_farmer']
    user_cluster[col] = crop_counts * 100 / crop_counts.sum()

In [15]:
# Average ranking per cluster: drop the `user` column, then take the mean of every
# remaining column within each cluster.
data_df = data_df.drop(columns='user')
cluster_data_df = data_df.groupby(by='cluster').mean()

In [91]:
def f(x):
    """Draw a three-panel summary figure for the cluster at row position ``x``.

    Panels, top to bottom:
      1. every column of ``cluster_data_df`` for that row (per-cluster means);
      2. for each crop, the value of its crop column in ``user_cluster``;
      3. for each crop, the value of its ``<crop>_percent`` column.

    Crops are identified by the presence of a ``<crop>_percent`` column in
    ``user_cluster`` rather than by fixed positional slices, so the panels stay
    correct whatever the number of activity or crop columns, and ``num_farmer``
    is never plotted as if it were a crop. Full crop names are used as labels.

    Args:
        x: positional (``iloc``) row index used for both ``cluster_data_df`` and
            ``user_cluster``.
            NOTE(review): this assumes both frames list the same clusters in the
            same order -- confirm.
    """
    percent_suffix = '_percent'

    ranking = cluster_data_df.iloc[x]
    cluster_row = user_cluster.iloc[x]
    num_farmer = cluster_row['num_farmer']

    # A crop is any column that has a '<name>_percent' companion column.
    crops = [
        col[:-len(percent_suffix)]
        for col in cluster_row.index
        if isinstance(col, str) and col.endswith(percent_suffix)
    ]
    crops = [crop for crop in crops if crop in cluster_row.index]

    fig, (ax_rank, ax_share, ax_pct) = plt.subplots(3, 1, figsize=(20, 10))

    ax_rank.plot(list(ranking.index), list(ranking.values))
    ax_rank.set_title(f'Cluster {x}')
    ax_rank.set_xlabel('items')
    ax_rank.set_ylabel('ranking')
    ax_rank.set_xticks([])

    ax_share.plot(crops, [cluster_row[crop] for crop in crops])
    ax_share.set_title(f'khareef crops grown by cluster {x} vs total cluster')
    ax_share.set_xlabel('Crop')
    ax_share.set_ylabel('Percentage')
    ax_share.set_xticks([])

    ax_pct.plot(crops, [cluster_row[crop + percent_suffix] for crop in crops])
    ax_pct.set_title(f'Percentage crop grown by cluster {x} total farmer {num_farmer}')
    ax_pct.set_xlabel('Crop')
    ax_pct.set_ylabel('Percentage')
    ax_pct.tick_params(axis='x', labelrotation=90)

    # Keep titles, axis labels and the rotated tick labels from overlapping.
    fig.tight_layout()

In [92]:
from ipywidgets import interact
import ipywidgets as widgets
# `f` indexes both frames by row position, so the slider's upper bound is the last
# position valid in both of them instead of a fixed 20 (which raises IndexError in
# `f` when fewer clusters exist). `f` is not rebound to interact's return value; the
# trailing semicolon suppresses the echoed result.
interact(
    f,
    x=widgets.IntSlider(
        min=0,
        max=min(len(cluster_data_df), len(user_cluster)) - 1,
        step=1,
        value=0,
    ),
);

interactive(children=(IntSlider(value=0, description='x', max=20), Output()), _dom_classes=('widget-interact',…