In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
# Create (or reuse) the Spark session for this notebook and expose its
# SparkContext under the conventional short name `sc`.
spark = (
    SparkSession.builder
    .appName("score_lowcost")
    .getOrCreate()
)
sc = spark.sparkContext

## 1. Loading data

In [3]:
from functions.data import load_data

In [4]:
# Load the input tables from the data directory. The result is dict-like and
# keyed by CSV file name (see the keys displayed in the next cell).
# NOTE(review): values are presumably Spark DataFrames -- confirm in functions.data.
ldata = load_data(spark, "data_clients/data_to_load/")

In [5]:
# Show which files were loaded (one key per CSV file).
ldata.keys()

dict_keys(['sample_histo_train.csv', 'sample_visites.csv', 'sample_histo_lowcost.csv', 'sample_perimetre.csv', 'sample_histo_client.csv'])

In [6]:
# Select each table by its file name instead of by its position in
# `list(ldata.keys())`: positional indexing depends on the order in which the
# loader happened to insert the files, so a different listing order or an
# extra file in the directory would silently bind the wrong table to a name.
# A missing file now fails loudly with a KeyError.
ldata_keys = list(ldata.keys())  # kept in case later cells still reference it

perimetre = ldata["sample_perimetre.csv"]
histo_client_raw = ldata["sample_histo_client.csv"]
histo_train_raw = ldata["sample_histo_train.csv"]
histo_lowcost_raw = ldata["sample_histo_lowcost.csv"]
visites_raw = ldata["sample_visites.csv"]

## 2. Casting data format

In [7]:
from functions.conversion import cast_columns_of_df

In [8]:
# Client columns passed as the "keep" argument when casting the client history.
# NOTE(review): presumably these are kept without casting -- confirm in
# functions.conversion.cast_columns_of_df.
client_cols_to_keep = [
    "ID_CLIENT",
    "LBL_STATUT_CLT",
    "LBL_GEO_AIR",
    "LBL_SEG_COMPORTEMENTAL",
    "LBL_GEO_TRAIN",
    "LBL_GRP_SEGMENT_NL",
    "LBL_SEGMENT_ANTICIPATION",
    "FLG_CMD_CARTE_1225",
]

# Visits, train history and low-cost history: every column of the raw table is
# submitted for a cast to double, with ID_CLIENT given as the column to keep.
visites = cast_columns_of_df(
    visites_raw,
    visites_raw.columns,
    ["ID_CLIENT"],
    cast_type="double",
)
histo_train = cast_columns_of_df(
    histo_train_raw,
    histo_train_raw.columns,
    ["ID_CLIENT"],
    cast_type="double",
)
histo_lowcost = cast_columns_of_df(
    histo_lowcost_raw,
    histo_lowcost_raw.columns,
    ["ID_CLIENT"],
    cast_type="double",
)

# Client history: only the three numeric columns are submitted for the cast.
histo_client = cast_columns_of_df(
    histo_client_raw,
    ["anciennete", "recence_cmd", "AGE"],
    client_cols_to_keep,
    cast_type="double",
)

## 3. Joining the tables

In [9]:
from functions.data import join_data

In [10]:
# Combine the four cast tables into one, passing the client identifier column
# to join_data.
# NOTE(review): presumably ID_CLIENT is the join key -- confirm in functions.data.
col_to_keep = "ID_CLIENT"
lowcost = join_data(
    [histo_train, histo_lowcost, visites, histo_client],
    col_to_keep,
)

## 4. NaN Handling

In [11]:
from functions.features import input_df

In [12]:
# Run the joined table through input_df.
# NOTE(review): given the section title this presumably handles missing
# values -- confirm the exact strategy in functions.features.
df = input_df(lowcost)

## 5. MLlib

In [13]:
from functions.features import preprocessed_df

### 5.1 Features engineering

In [14]:
# preprocessed_df returns two objects; only `data` is used below.
data, dff = preprocessed_df(df)

# Keep just the two columns handed to the model: the feature column and the
# indexed low-cost order flag.
data_ = data.select(
    "indexedFeatures",
    "flg_cmd_lowcostIndex",
)

### 5.2 Logistic classification scoring

In [15]:
from functions.models import eval_lr

In [16]:
# Evaluate the logistic model on the selected columns; eval_lr returns two
# values, unpacked here as accuracy and AUC.
# NOTE(review): the train/test split and metric definitions live in
# functions.models -- confirm there.
accuracy, auc = eval_lr(data_)

In [17]:
# Display the two metrics as an (accuracy, auc) tuple.
accuracy, auc

(0.9658908358509567, 0.9067793525054108)

### 5.3 Random Forest scoring <span style="color:red">Still not working</span>

In [18]:
# TODO: Random Forest scoring is not working yet; the import stays disabled until it is fixed.
#from pyspark.ml.classification import RandomForestClassifier