# Import data from CSV, preprocess it, and store the result locally

## Imports and global declarations

In [2]:
# Snapshot the packages installed in the current environment into ../requirements.txt.
# NOTE(review): the `>` redirect overwrites the file on every run, and `!pip3` may not
# target the kernel's own environment (`%pip` does) — confirm before relying on this file.
!pip3 freeze > "../requirements.txt"
#!pip3 install -r "../requirements.txt"  # giving some error

from google.cloud import bigquery
import pandas as pd
import numpy as np
import pandas_gbq
import copy
import glob
import re
import pickle
import datetime as dt
from datetime import timezone
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans

# Let wide / long frames render fully (up to 500 rows and 500 columns) when displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)


## Load raw data from a csv

In [3]:
# Load the most recent raw export. Candidate files match "../data/raw/1.0*" and carry a
# dd-mm-yy date stamp in their name; the newest stamp (not the file's mtime) wins.
raw_file_glob = "../data/raw/1.0*"
date_stamp_pattern = re.compile(r'\d{2}-\d{2}-\d{2}')

# Glob once and keep (date, path) pairs so the chosen file never has to be re-discovered.
dated_files = []
for candidate_path in glob.glob(raw_file_glob):
    stamp = date_stamp_pattern.search(candidate_path)
    if stamp is None:
        # A matching file without a date stamp cannot be ranked; skip it instead of
        # crashing on `.group()` of a failed match.
        continue
    dated_files.append((dt.datetime.strptime(stamp.group(0), "%d-%m-%y"), candidate_path))

if not dated_files:
    raise FileNotFoundError(f"No date-stamped raw csv file found matching {raw_file_glob}")

# `max` with a key returns the first pair carrying the latest date, i.e. glob order breaks ties.
file_path = max(dated_files, key=lambda dated: dated[0])[1]
print(f"Loading csv file from {file_path}")
# The first csv column is used as the DataFrame index.
global_data = pd.read_csv(file_path, index_col=0)

Loading csv file from ../data/raw/1.0-um-data-prep-all-01-08-22.csv


  global_data = pd.read_csv(file_path, index_col=0)


## Preprocessing: numerical coding, normalization, KNN imputing

In [4]:
# Feature columns fed to the scaler / imputer (names are used verbatim as csv column keys).
predictors = [
    'acc_lci_score', 'seniority_score', 'num_challenges', 'passed_num_challenges',
    'total_problems', 'attempted_problems', 'num_correct', 'mean_dev_percentile',
    'num_passed_stack', 'num_passed_skill', 'characters_in_reume', 'years_of_experience',
    'top_skill_supply', 'top_skill_demand', 'top_stack_supply', 'correct_per_challenge',
    'correct_per_questions', 'english_communication', 'quiz_answer',
]
target      = ['paying_cust']
# Keep the id column list under its own name: rebinding `dev_id` from a list to a
# DataFrame made this cell fail when it was run a second time.
dev_id_cols = ['dev_id']
#ml_data = global_data.loc[~global_data['num_challenges'].isna(),].copy()
ml_data_v2 = global_data[predictors + target + dev_id_cols]
num_cols   = [col for col in ml_data_v2.columns if ml_data_v2[col].dtype != 'object']
obj_cols   = [col for col in ml_data_v2.columns if ml_data_v2[col].dtype == 'object']

X      = ml_data_v2[predictors].copy()
Y      = ml_data_v2[target]
dev_id = ml_data_v2[dev_id_cols]
X_copy = copy.deepcopy(X)  # untouched snapshot of the predictors before encoding

# Integer-encode object (string) predictors. `obj_cols` is computed over predictors, target
# and id columns, so skip any column that is not part of X instead of raising KeyError.
# NOTE(review): `.cat.codes` encodes missing values as -1, so missing categorical entries
# are treated as a regular value and are NOT filled by the KNN imputer — confirm intended.
for col in obj_cols:
    if col not in X.columns:
        continue
    X[col] = X[col].astype('category').cat.codes

print(X.shape, Y.shape, dev_id.shape)

# Standardize first so every feature contributes comparably to the KNN distance.
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)

knn_imp = KNNImputer(n_neighbors=5, add_indicator=False)
knn_imp.fit(rescaledX)
# Re-attach the original row index: the imputer returns a bare array, and a default
# RangeIndex would misalign rows in a column-wise concat with Y / dev_id whenever the
# csv index is not 0..n-1. Values stay in the standardized (scaled) space.
X_final = pd.DataFrame(knn_imp.transform(rescaledX), index=X.index, columns=predictors)


(49337, 19) (49337, 1) (49337, 1)


acc_lci_score            0
seniority_score          0
num_challenges           0
passed_num_challenges    0
total_problems           0
attempted_problems       0
num_correct              0
mean_dev_percentile      0
num_passed_stack         0
num_passed_skill         0
characters_in_reume      0
years_of_experience      0
top_skill_supply         0
top_skill_demand         0
top_stack_supply         0
correct_per_challenge    0
correct_per_questions    0
english_communication    0
quiz_answer              0
dtype: int64

## Storing KNNImputer model

In [5]:
# Persist the fitted KNN imputer, versioned by today's date (dd-mm-yy) in the file name.
# NOTE(review): only the imputer is pickled here; it was fitted on standardized data, and
# the fitted scaler is not saved in this cell — confirm whether it must be persisted too.
now = dt.datetime.now()
current_time = now.strftime("%d-%m-%y") # %H:%M:%S")
filename = '../models/1.2-knnmputer-model-' + current_time + '.sav'
# Use a context manager so the file handle is flushed and closed even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(knn_imp, model_file)

## Store and version processed data

In [6]:
# Assemble id, target and imputed features side by side, then write a date-versioned csv.
# The file is only written when every dev_id is unique; otherwise a message is printed.
data_dump = pd.concat([dev_id, Y, X_final], axis="columns")
has_duplicate_ids = data_dump['dev_id'].duplicated().any()

if not has_duplicate_ids:
    print(f"Processed data for all devs stored in a csv with shape {data_dump.shape}")
    now = dt.datetime.now()
    current_time = now.strftime("%d-%m-%y") # %H:%M:%S")
    output_path = '../data/processed/' + '1.2-um-data-process-all-' + current_time + '.csv'
    data_dump.to_csv(output_path)
else:
    print('Processed Data All has duplicated dev_id')

Processed data for all devs stored in a csv with shape (49337, 21)
