# Customers Gender Inference - ML models

This Jupyter Notebook focuses on the predictive model and the labels output

In [1]:
# Import libraries

import pandas as pd
import numpy as np
from scipy import stats
pd.set_option('display.max_columns', None)

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import defaultdict


from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFECV

In [2]:
# Read the json file provided on the github page

df_json = pd.read_json('data.json', orient='record')

In [3]:
# Create the dataframe

df_raw = pd.read_json(df_json.iloc[0,0], orient='record')

## Creating the pipeline

In [4]:
def Clean_data(df_raw=df_raw):
    
    '''Transform df_raw into df_cleaned'''
    
    global df_cleaned
    
    df_cleaned = df_raw.copy()
    
    # Replace the negative 'revenue' value by 0
    df_cleaned.loc[df_cleaned['revenue']<0,'revenue'] = 0

    # Replace each 'revenue' having 0 by the median of revenue for its specific number of items (1 to 13)
    for i in df_cleaned.loc[df_cleaned['revenue']==0,'items'].value_counts().index:  
        replace = df_cleaned.loc[df_cleaned['revenue'] != 0].groupby('items')['revenue'].median()[i]
        df_cleaned.loc[(df_cleaned['revenue']==0) & (df_cleaned['items']==i),'revenue'] = replace

    # Convert 'is_newsletter_subscriber' into 1 or 0 (1='Yes', 0 ='No').
    df_cleaned['is_newsletter_subscriber'] = np.where(df_cleaned['is_newsletter_subscriber']=='Y',1,0)

    # Create another feature for the average of revenue spend per item
    df_cleaned['avg_revenue_per_item'] = df_cleaned['revenue']/df_cleaned['items']
    
    # Create another feature for the number of returns
    df_cleaned['returns_per_item'] = df_cleaned['returns']*100/df_cleaned['items']

    # Create another feature for the number of items per order
    df_cleaned['items_per_order'] = df_cleaned['items']/df_cleaned['orders']

    # Create another feature for the online orders
    df_cleaned['online_orders'] = (df_cleaned['msite_orders'] + df_cleaned['desktop_orders'] + df_cleaned['android_orders'] + df_cleaned['ios_orders'])/ df_cleaned['orders']

    # Create another feature for the shipped items
    df_cleaned['shipped_items'] = (df_cleaned['work_orders'] + df_cleaned['home_orders'] + df_cleaned['parcelpoint_orders'])/ df_cleaned['orders']

    # Create another feature for female items
    df_cleaned['f_items'] = df_cleaned['female_items'] + df_cleaned['wapp_items'] + df_cleaned['wacc_items'] + df_cleaned['wftw_items']

    # Create another feature for male items
    df_cleaned['m_items'] = df_cleaned['male_items'] + df_cleaned['mapp_items'] + df_cleaned['macc_items'] + df_cleaned['mftw_items']

    # Create another feature for the sum of payment method
    df_cleaned['electronic_payment'] = (df_cleaned['cc_payments'] + df_cleaned['paypal_payments'] + df_cleaned['afterpay_payments'])/ df_cleaned['orders']
        
    df_cleaned = df_cleaned[['days_since_last_order', 'orders', 'avg_revenue_per_item', 'f_items','m_items','returns_per_item']]


In [5]:
def Create_X_y(df_cleaned, m=10, r=4):    
    
    '''Create X and y to fit the machine learning model'''
    
    global X_
    
    X_ = df_cleaned.copy()
    
    # Keep only obvious records which we can guess the label
    X_ = df_cleaned.loc[(df_cleaned['m_items']+df_cleaned['f_items']>=m) & (df_cleaned[['f_items','m_items']].max(axis=1)/df_cleaned[['f_items','m_items']].min(axis=1)>=r)]

    # Remove outliers
    X_ = X_.loc[(np.abs(stats.zscore(X_))<3).all(axis=1)]
    
    # Assign labels
    X_.loc[df_cleaned['f_items']/X_['m_items']>= r,'label_created'] = 1
    X_.loc[df_cleaned['m_items']/X_['f_items']>= r,'label_created'] = 0

    # Balance male and female
    remove_sample = X_['label_created'].value_counts().max() - X_['label_created'].value_counts().min()
    drop_indices = np.random.choice(X_.loc[X_['label_created']==X_['label_created'].value_counts().index[0]].index, remove_sample, replace=False)
    X_ = X_.drop(drop_indices)         
    
    # Define y_ as the column of labels
    global y_
    y_ = X_['label_created'].values
    
    # Drop labels for X_
    X_.drop('label_created',axis=1, inplace=True)
    
    # Trying to normalize our data
    X_ = np.sqrt(X_.astype('float'))
    X_ = X_.values


In [6]:
def predict(df_cleaned):
    
    '''Predict labels'''
    
    global df_predicted

    labels = pd.Series(pipe.predict(df_cleaned), name='label')
    df_predicted = pd.concat([df_raw,labels], axis=1)
    df_predicted = pd.concat([df_cleaned[['avg_revenue_per_item','f_items','m_items','returns_per_item']],df_predicted], axis=1)
    

In [7]:
# Let's use SVC

pipe = make_pipeline(StandardScaler(), SVC(kernel='linear'))

## Predicting labels

In [8]:
Clean_data() # --> Transform df_raw into df_cleaned
Create_X_y(df_cleaned) # --> Create X_ and y_ 
pipe.fit(X_, y_) # --> Use X_ and y_ to fit the model
predict(df_cleaned) # --> apply to model to df_cleaned to predict labels

## Exporting output file

In [9]:
output = df_predicted[['customer_id','label']].copy()
output.rename(columns={'label':'female_flag'}, inplace=True)
output

Unnamed: 0,customer_id,female_flag
0,3.017001e+09,0.0
1,3.017108e+09,1.0
2,3.017126e+09,1.0
3,3.017185e+09,1.0
4,3.017193e+09,0.0
...,...,...
191282,3.706982e+09,1.0
191283,3.706984e+09,1.0
191284,3.706994e+09,0.0
191285,3.706998e+09,0.0


In [10]:
output.to_csv('/Users/brice/Desktop/GFG_output.csv', index = None)

---------------------END---------------------

Thanks a lot for reading