In [1]:
%%capture
%cd ../../

In [2]:
import sys
sys.path.extend(["recommender/src"])

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
#from omegaconf import DictConfig, OmegaConf
from sklearn.model_selection import train_test_split
import json
import joblib

##utils import
from utils.encodes import gen_encode_cols
from utils.io import gen_dict,drop_cols, merge_dfs,drop_cols_list
from utils.metrics_report import get_multiclass_report

##src
from feature_engineering import group_feats, count_selected_options
from majorityvote import build_majorityvote
from feature_selection import get_feat_importance

In [6]:
current_dir = Path.cwd()
raw_data_dir = current_dir.joinpath('recommender/data/raw')
processed_data_dir = current_dir.joinpath('recommender/data/processed')
config_dir = current_dir.joinpath('recommender/configs')
artifacts_dir = current_dir.joinpath('recommender/models/artifacts')
features_dir = current_dir.joinpath('recommender/models/features')
filename = 'kaggle_survey_2017_2021.csv'

In [7]:
full_data_dir = processed_data_dir.joinpath('full-data')
train_dir =  processed_data_dir.joinpath('train-data')
test_dir = processed_data_dir.joinpath('test-data')
models_dir = current_dir.joinpath('model/artifacts/')

In [None]:
## config calls
map_config= OmegaConf.load(config_dir.joinpath("col-mapping.yaml"))
colvals_config = OmegaConf.load(config_dir.joinpath("col-values.yaml"))

In [None]:
sampled_data = pd.read_csv(full_data_dir.joinpath('Features_KaggleResponses_v4.csv'))
sampled_data = sampled_data.fillna(0)
sampled_data.shape

In [None]:
## features selected from chisquare test
with open(features_dir.joinpath('features_select_v4.json')) as feat_file:
    file_contents = feat_file.read()
features = (json.loads(file_contents)).values()
len(features)

In [None]:
features

In [None]:
## Train Test Split
X = sampled_data[features].values
y = sampled_data['Target'].values

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify= y, random_state=42)
# test is now 20% of the initial data set
print(x_train.shape, x_test.shape)

train_data = pd.DataFrame(x_train, columns = features)
test_data = pd.DataFrame(x_test, columns = features)

train_data['Target'] = y_train
test_data['Target'] = y_test



# train_data.to_csv(train_dir.joinpath('KaggleResponses-Train.csv'),index= False)
# test_data.to_csv(test_dir.joinpath('KaggleResponses-Test.csv'),index= False)


In [9]:
test_data = pd.read_csv(test_dir.joinpath('KaggleResponses-Test.csv'))
test_data

Unnamed: 0,What is your current yearly compensation (approximate $USD)?,Uses Computer Vision Algorithms Count,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Doctorate,Uses ML Framework Count,Uses ML Algorithms Count,"Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often? - Selected Choice",What type of computing platform do you use most often for your data science projects? - Selected Choice,In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice,For how many years have you used machine learning methods?,Uses NLP Algorithms Count,...,What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice,Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all that apply) - Selected Choice - MATLAB,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Java,"Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice - Dense Neural Networks (MLPs, etc)","Which categories of computer vision methods do you use on a regular basis? (Select all that apply) - Selected Choice - Object detection methods (YOLOv3, RetinaNet, etc)",Which of the following cloud computing platforms do you use on a regular basis? (Select all that apply) - Selected Choice - Amazon Web Services (AWS),"Select any activities that make up an important part of your role at work: (Select all that apply) - Selected Choice - Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data",What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python,Approximately how many times have you used a TPU (tensor processing unit)?,Target
0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1.000000,0.0,0.000000,0.0,0.000000,0.0,1.000000,1.000000,0.000000,0
1,2.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,3.0,4.000000,0.000000,...,3.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.000000,0.000000,2
3,8.000000,0.000000,0.000000,2.000000,2.000000,0.000000,2.0,0.0,2.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,1.000000,0.000000,1.000000,2
4,26.000000,0.000000,0.000000,2.000000,0.000000,0.000000,0.0,1.0,3.000000,0.000000,...,0.000000,0.0,1.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31865,20.579914,2.319657,0.000000,3.000000,3.579914,4.840171,2.0,0.0,8.000000,0.000000,...,1.260257,0.0,0.000000,0.0,0.579914,1.0,0.579914,0.420086,1.000000,4
31866,13.000000,0.000000,0.000000,1.000000,2.000000,0.000000,0.0,0.0,6.000000,0.000000,...,0.000000,0.0,1.000000,0.0,0.000000,1.0,1.000000,1.000000,1.000000,6
31867,13.000000,2.000000,0.000000,2.000000,4.000000,2.000000,4.0,0.0,7.000000,0.000000,...,0.000000,0.0,1.000000,1.0,0.000000,1.0,0.000000,0.000000,2.000000,6
31868,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.0,1.0,2.000000,0.000000,...,0.000000,2.0,0.000000,0.0,0.000000,0.0,1.000000,0.000000,3.000000,2


### Train model for FastApi

In [None]:
def train_model(train_data: pd.DataFrame) -> None:
    X = train_data.iloc[:, :-1].to_numpy()
    y = train_data.iloc[:, -1:].to_numpy()
    params =  {'n_estimators': 463, 'max_depth': 50}
    model = RandomForestClassifier(**params, random_state=42)
    model.fit(X, y)
    joblib.dump(model, artifacts_dir.joinpath("Persona-Multiclass.joblib"))
    return

def test_model(test_data: pd.DataFrame) -> np.array:
    model_file = artifacts_dir.joinpath("Persona-Multiclass.joblib")
    if not model_file.exists():
        return False
    model = joblib.load(model_file) 
    predictions = model.predict()
    return predictions
    
    


In [None]:
test_data.iloc[:,:-1].values[0]

In [15]:
import requests

url = 'https://localhost:8080/get'
obj = {'test_data': test_data.iloc[:,:-1].values[0]}
x = requests.get(url, data= obj)

ConnectionError: HTTPSConnectionPool(host='localhost', port=8080): Max retries exceeded with url: /get (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff2c209e590>: Failed to establish a new connection: [Errno 111] Connection refused'))