## Import processed data for new devs and predict labels

## Imports and global declarations

In [1]:
# Snapshot the packages reported by `pip3 freeze` into ../requirements.txt.
# The `>` redirect overwrites that file every time this cell is executed.
!pip3 freeze > "../requirements.txt"
# Installing from the snapshot is left disabled; the original author noted that it errored.
#!pip3 install -r "../requirements.txt"  # giving some error

from google.cloud import bigquery
import pandas as pd
import numpy as np
import pandas_gbq
import copy
import copy
import pickle
import glob
import re
import datetime as dt
from datetime import timezone
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans

# Let pandas render up to 500 rows and 500 columns before truncating output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500



## Load processed data from the latest 1.3 CSV

In [2]:
# Load the most recent "1.3*" processed CSV, where "most recent" is decided by
# the dd-mm-yy date embedded in the file name (not by file modification time).
predict_date_re = re.compile(r'\d{2}-\d{2}-\d{2}')

# Glob once and remember the date stamp found in each path. Files whose name
# carries no date are skipped rather than crashing on `None.group(0)`.
predict_candidates = [
    (path, predict_date_re.search(path))
    for path in glob.glob("../data/processed/1.3*")
]
predict_candidates = [(path, found.group(0)) for path, found in predict_candidates if found]
if not predict_candidates:
    raise FileNotFoundError('No dated file matches "../data/processed/1.3*"')

# `a_predict` / `b_predict` keep their original meaning (raw date stamps and
# the chronologically sorted dates) because they are notebook-level names.
a_predict = [stamp for _, stamp in predict_candidates]
b_predict = sorted(dt.datetime.strptime(stamp, "%d-%m-%y") for stamp in a_predict)

# Dates must be compared as datetimes: sorting the dd-mm-yy strings would
# order by day first. The newest date is then turned back into its stamp.
newest_predict_stamp = b_predict[-1].strftime('%d-%m-%y')
file_path = next(path for path, stamp in predict_candidates if stamp == newest_predict_stamp)

print(f"Loading prediction csv file from {file_path}")
# The first CSV column is used as the DataFrame index.
predict_data = pd.read_csv(file_path, index_col=0)

Loading prediction csv file from ../data/processed/1.3-um-data-process-predict-08-08-22.csv


## Loading knn clustering model and performing prediction

In [3]:
# Load the most recent pickled clustering model (newest dd-mm-yy date in the
# file name) and use it to assign a cluster id to every row of `predict_data`.
model_date_re = re.compile(r'\d{2}-\d{2}-\d{2}')

# Glob once; paths without a date stamp are ignored instead of raising
# AttributeError on a failed regex search.
model_candidates = [
    (path, model_date_re.search(path))
    for path in glob.glob("../models/2.0-knn-clustering-model*")
]
model_candidates = [(path, found.group(0)) for path, found in model_candidates if found]
if not model_candidates:
    raise FileNotFoundError('No dated file matches "../models/2.0-knn-clustering-model*"')

# `a` / `b` are kept with their original meaning (raw stamps, sorted dates).
a = [stamp for _, stamp in model_candidates]
b = sorted(dt.datetime.strptime(stamp, "%d-%m-%y") for stamp in a)
newest_model_stamp = b[-1].strftime('%d-%m-%y')
clustering_model_path = next(path for path, stamp in model_candidates if stamp == newest_model_stamp)
print(f"Clustering model loaded from {clustering_model_path}")

# NOTE(security): unpickling executes code stored in the file, so only load
# model files that come from a trusted location.
# The context manager closes the file handle, which the bare
# `pickle.load(open(...))` form never did.
with open(clustering_model_path, 'rb') as model_file:
    clustering_model = pickle.load(model_file)

# Columns that must not be passed to the model. 'cluster' and 'label' are
# written to `predict_data` by this notebook itself; excluding them keeps this
# cell re-runnable without feeding earlier predictions back in as features.
# NOTE(review): assumes the processed CSV has no genuine feature columns named
# 'cluster' or 'label' - confirm against the training notebook.
non_feature_cols = ['dev_id', 'paying_cust', 'cluster', 'label']
model_input = predict_data.loc[:, ~predict_data.columns.isin(non_feature_cols)]
predict_data['cluster'] = clustering_model.predict(model_input)

Clustering model loaded from ../models/2.0-knn-clustering-model-20-07-22.sav


## Loading cluster label and performing mapping

In [4]:
# Load the most recent pickled cluster-label mapping (newest dd-mm-yy date in
# the file name), translate model cluster ids into labels, then replace the
# ids with the numbering defined in `label_id_correction_old_model`.
#
# NOTE: this cell overwrites predict_data['cluster'] with the corrected ids.
# Re-run the prediction cell before re-running this one, otherwise the
# corrected ids would be mapped through `cluster_label` a second time.
label_date_re = re.compile(r'\d{2}-\d{2}-\d{2}')

# Glob once; paths without a date stamp are ignored instead of raising
# AttributeError on a failed regex search.
label_candidates = [
    (path, label_date_re.search(path))
    for path in glob.glob("../models/2.0-knn-cluster-label*")
]
label_candidates = [(path, found.group(0)) for path, found in label_candidates if found]
if not label_candidates:
    raise FileNotFoundError('No dated file matches "../models/2.0-knn-cluster-label*"')

# `a` / `b` are kept with their original meaning (raw stamps, sorted dates).
a = [stamp for _, stamp in label_candidates]
b = sorted(dt.datetime.strptime(stamp, "%d-%m-%y") for stamp in a)
newest_label_stamp = b[-1].strftime('%d-%m-%y')
cluster_label_path = next(path for path, stamp in label_candidates if stamp == newest_label_stamp)
print(f"Cluster label loaded from {cluster_label_path}")

# NOTE(security): unpickling executes code stored in the file, so only load
# files that come from a trusted location.
# The context manager closes the file handle, which the bare
# `pickle.load(open(...))` form never did.
with open(cluster_label_path, 'rb') as label_file:
    cluster_label = pickle.load(label_file)

label_id_correction_old_model = {'Elite':4, 'High Quality': 2,  'Average': 1, 'Low Quality':3,'Low Experience':0}

# Compute both mappings before touching `predict_data`, so a failure leaves
# the frame unchanged. Series.map yields NaN for any key it cannot translate.
mapped_label = predict_data['cluster'].map(cluster_label)
corrected_cluster = mapped_label.map(label_id_correction_old_model)
unmapped_rows = corrected_cluster.isna()
if unmapped_rows.any():
    raise ValueError(
        "Could not map every cluster id to a label and corrected id; "
        f"offending cluster ids: {predict_data.loc[unmapped_rows, 'cluster'].unique().tolist()}"
    )

predict_data['label'] = mapped_label
predict_data['cluster'] = corrected_cluster

Cluster label loaded from ../models/2.0-knn-cluster-label-20-07-22.pckl


In [5]:
# Show how many distinct developers landed in each (cluster, label) pair.
print('Distribution of predicted cluster')
devs_per_cluster = (
    predict_data
    .groupby(['cluster', 'label'])['dev_id']
    .nunique()
    .reset_index(name='Number of devs')
)
print(devs_per_cluster)

Distribution of predicted cluster
   cluster           label  Number of devs
0        0  Low Experience             226
1        1         Average             149
2        2    High Quality              68
3        3     Low Quality             115
4        4           Elite               4


## Pushing cluster labels to GBQ table for new devs

In [6]:
# Build the rows to publish (one per developer) and append them to the
# BigQuery table, but only if the data passes basic sanity checks.
#
# NOTE: the upload uses if_exists='append', so every successful run of this
# cell adds the rows again; nothing visible here de-duplicates them.
v2_cols = ['dev_id', 'cluster', 'cluster_label', 'date_created']

# Rename without `inplace`, stamp every row with the current UTC time, and
# select through `v2_cols` so the pushed column set and order are pinned.
df = (
    predict_data[['dev_id', 'cluster', 'label']]
    .rename(columns={'label': 'cluster_label'})
    .assign(date_created=dt.datetime.now(timezone.utc))
)[v2_cols]

if df['dev_id'].duplicated().any():
    # The push is skipped when a developer appears more than once.
    print('label data has duplicated dev_id')
elif df[['cluster', 'cluster_label']].isna().any().any():
    # A null here means a cluster id or label failed to map upstream; skip the
    # push rather than append incomplete rows to the shared table.
    print('label data has rows without a cluster or cluster_label')
else:
    pandas_gbq.to_gbq(df, 'pdsa.PDAS_P2_cluster', project_id='turing-dev-337819', if_exists='append')
    print(f"Cluster label for new devs pushed in GBQ table with shape {df.shape}")

100%|██████████| 1/1 [00:00<00:00, 1260.69it/s]

Cluster label for new devs pushed in GBQ table with shape (562, 4)



