# Import data from CSV, preprocess it, and store the result locally

## Imports and global declarations

In [1]:
!pip3 freeze > "../requirements.txt"
#!pip3 install -r "../requirements.txt"  # giving some error

from google.cloud import bigquery
import pandas as pd
import numpy as np
import pandas_gbq
import copy
import copy
import pickle
import glob
import re
import datetime as dt
from datetime import timezone
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans

# Raise the display limits so long/wide DataFrames are not truncated when shown.
for display_limit in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_limit, 500)



## Load raw data from csv 1.1

In [2]:
# Load the most recent prediction export.
# File names are expected to embed their export date as dd-mm-yy
# (e.g. "1.1-um-data-prep-predict-09-08-22.csv"); the newest date wins.
predict_files = glob.glob("../data/raw/1.1*")  # glob once so both steps see the same listing
predict_date_hits = [(x, re.search(r'\d{2}-\d{2}-\d{2}', x)) for x in predict_files]

# Files without a date stamp are ignored instead of crashing on `None.group(0)`.
a_predict = [hit.group(0) for _, hit in predict_date_hits if hit]
if not a_predict:
    raise FileNotFoundError('No dated prediction csv found matching "../data/raw/1.1*"')

# Parse to real dates before sorting: dd-mm-yy strings do not sort chronologically.
b_predict = sorted(dt.datetime.strptime(x, "%d-%m-%y") for x in a_predict)
latest_predict_stamp = b_predict[-1].strftime('%d-%m-%y')
file_path = next(x for x, hit in predict_date_hits if hit and hit.group(0) == latest_predict_stamp)

print(f"Loading prediction csv file from {file_path}")
predict_data = pd.read_csv(file_path, index_col=0)

Loading prediction csv file from ../data/raw/1.1-um-data-prep-predict-09-08-22.csv


In [3]:
# Load the most recent "all data" export (devs that were processed in earlier runs).
# File names are expected to embed their export date as dd-mm-yy
# (e.g. "1.0-um-data-prep-all-04-08-22.csv"); the newest date wins.
raw_files = glob.glob("../data/raw/1.0*")  # glob once so both steps see the same listing
raw_date_hits = [(x, re.search(r'\d{2}-\d{2}-\d{2}', x)) for x in raw_files]

# Files without a date stamp are ignored instead of crashing on `None.group(0)`.
a = [hit.group(0) for _, hit in raw_date_hits if hit]
if not a:
    raise FileNotFoundError('No dated all-data csv found matching "../data/raw/1.0*"')

# Parse to real dates before sorting: dd-mm-yy strings do not sort chronologically.
b = sorted(dt.datetime.strptime(x, "%d-%m-%y") for x in a)
latest_raw_stamp = b[-1].strftime('%d-%m-%y')
raw_data_file_path = next(x for x, hit in raw_date_hits if hit and hit.group(0) == latest_raw_stamp)

print(f"Loading All data csv file from {raw_data_file_path}")
global_data = pd.read_csv(raw_data_file_path, index_col=0)

Loading All data csv file from ../data/raw/1.0-um-data-prep-all-04-08-22.csv


In [4]:
# Count the devs of the new prediction batch that are already present in the
# "all data" set, then append the batch to the raw csv only when none overlap.
overalapping_devs = len(set(predict_data.dev_id.values).intersection(set(global_data.dev_id.values)))
print(f"Checking if we already have a cluster for a dev : {overalapping_devs}") # Check
if overalapping_devs==0:
    # header=False appends bare rows, so the values must follow the column order of
    # the existing file. Reindexing to global_data's columns guarantees that (missing
    # columns are written as empty fields) instead of silently shifting values.
    predict_data.reindex(columns=global_data.columns).to_csv(raw_data_file_path, mode='a', header=False)
else:
    # Make the skipped append visible rather than leaving the file untouched silently.
    print(f"Not appending to {raw_data_file_path}: {overalapping_devs} dev(s) of this batch are already present")

Checking if we already have a cluster for a dev : 0


## Preprocessing, numerical coding , normalization
- In this step, the new dev data is combined with the existing data (devs for which we already have a cluster); categorical columns are then numerically encoded and all predictors are normalized. Finally, KNN imputation is applied to the new data.

In [5]:
# Combine historical and new devs, encode categorical predictors as integer codes,
# standardize all predictors, and keep the normalized rows of the new batch only.

# Drop historical rows whose dev_id is also part of the new batch before combining.
# Once the batch has been appended to the raw "all data" csv, a re-run would otherwise
# carry every new dev twice, skewing the scaler and producing duplicate dev_id rows.
already_known = global_data.loc[~global_data['dev_id'].isin(predict_data['dev_id'])]
data = pd.concat([already_known, predict_data])

predictors = [
    'acc_lci_score', 'seniority_score', 'num_challenges', 'passed_num_challenges',
    'total_problems', 'attempted_problems', 'num_correct', 'mean_dev_percentile',
    'num_passed_stack', 'num_passed_skill', 'characters_in_reume', 'years_of_experience',
    'top_skill_supply', 'top_skill_demand', 'top_stack_supply', 'correct_per_challenge',
    'correct_per_questions', 'english_communication', 'quiz_answer',
]
target     = ['paying_cust']
dev_id     = ['dev_id']

ml_data_v2 = data[predictors+target+dev_id]
num_cols   = [col for col in ml_data_v2.columns if ml_data_v2[col].dtype!='object']

X      = ml_data_v2[predictors].copy()
Y      = ml_data_v2[target]
dev_id = ml_data_v2[dev_id]  # from here on `dev_id` is the one-column DataFrame of ids

# Only predictor columns exist in X, so restrict the categorical encoding to them;
# an object-typed dev_id or target column would otherwise raise a KeyError below.
obj_cols = [col for col in predictors if X[col].dtype=='object']
for col in obj_cols:
    # NOTE(review): cat.codes encodes missing values as -1, so gaps in these columns
    # are no longer NaN afterwards and will presumably not be filled by the KNN
    # imputer downstream - confirm this matches how the imputer was trained.
    X[col] = X[col].astype('category').cat.codes

print(X.shape, Y.shape, dev_id.shape)

# The scaler is re-fit on the combined (historical + new) data on every run.
scaler = StandardScaler().fit(X)
rescaledX = pd.DataFrame(scaler.transform(X), columns=predictors)

# rescaledX carries a fresh 0..n-1 index, so ids and labels are re-indexed the same
# way before the column-wise concat to keep rows aligned.
temp_data = pd.concat([dev_id.reset_index(drop=True),Y.reset_index(drop=True), rescaledX],axis=1)
norm_predict_data = temp_data.loc[temp_data['dev_id'].isin(predict_data['dev_id'])]
print(norm_predict_data.shape)


(50805, 19) (50805, 1) (50805, 1)
(136, 21)


## Loading imputer model for imputation


In [8]:
# Load the most recently dated KNN-imputer pickle and impute the normalized new batch.
# (glob, re, dt, pickle and pd come from the imports cell at the top of the notebook.)
model_files = glob.glob("../models/1.2-knnmputer-model*")  # glob once for both steps
model_date_hits = [(x, re.search(r'\d{2}-\d{2}-\d{2}', x)) for x in model_files]

# Files without a dd-mm-yy stamp are ignored instead of crashing on `None.group(0)`.
a = [hit.group(0) for _, hit in model_date_hits if hit]
if not a:
    raise FileNotFoundError('No dated imputer model found matching "../models/1.2-knnmputer-model*"')

# Parse to real dates before sorting: dd-mm-yy strings do not sort chronologically.
b = sorted(dt.datetime.strptime(x, "%d-%m-%y") for x in a)
latest_model_stamp = b[-1].strftime('%d-%m-%y')
knnimpu_path = next(x for x, hit in model_date_hits if hit and hit.group(0) == latest_model_stamp)
print(knnimpu_path)

# SECURITY: unpickling executes arbitrary code contained in the file - only load
# model files produced by this project.
with open(knnimpu_path, 'rb') as model_file:  # closes the file handle deterministically
    loaded_model = pickle.load(model_file)

# transform() returns a plain array, so the predictor names are re-attached here.
X_final = pd.DataFrame(loaded_model.transform(norm_predict_data[predictors].to_numpy()), columns=predictors)

../models/1.2-knnmputer-model-01-08-22.sav


## Store and version processed data

In [7]:
# Put ids and labels back next to the imputed features and write the result to a
# date-stamped csv. Nothing is written when a dev_id occurs more than once.
ids_and_labels = norm_predict_data[['dev_id', 'paying_cust']].reset_index(drop=True)
data_dump = pd.concat([ids_and_labels, X_final], axis=1)

has_duplicate_devs = data_dump['dev_id'].duplicated().any()
if not has_duplicate_devs:
    print(f"Processed data for predicted devs stored in a csv with shape {data_dump.shape}")
    now = dt.datetime.now()
    current_time = now.strftime("%d-%m-%y")  # one file per day; time of day is not part of the name
    output_path = '../data/processed/' + '1.3-um-data-process-predict-' + current_time+'.csv'
    data_dump.to_csv(output_path)
else:
    print('Processed Data Predicted has duplicated dev_id')

Processed data for predicted devs stored in a csv with shape (562, 21)
