<a href="https://colab.research.google.com/github/Euan-J-Austin/Analytics_and_Intelligence/blob/main/Imputing_income_with_Miss_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install missingpy



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import sklearn.neighbors._base
import sys
# Register the private module `sklearn.neighbors._base` under the additional
# name `sklearn.neighbors.base`, so an import of that name resolves to it.
# NOTE(review): presumably missingpy imports `sklearn.neighbors.base`, which is
# why this sits ahead of the missingpy import -- confirm against the installed
# scikit-learn / missingpy versions.
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from sklearn.impute import KNNImputer

In [None]:
# Load the customer data; the relative path is resolved against the current working directory.
df = pd.read_csv('marketing_campaign.csv')

In [None]:
# 'Education' is read in with object dtype; cast it to pandas' categorical dtype.

df = df.astype({'Education': 'category'})

In [None]:
df['Education'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2240 entries, 0 to 2239
Series name: Education
Non-Null Count  Dtype   
--------------  -----   
2240 non-null   category
dtypes: category(1)
memory usage: 2.5 KB


In [None]:
#Rename 2n Cycle to Master
# This attempt raises "ValueError: Categorical categories must be unique":
# '2n Cycle' and 'Master' both appear as labels in this column, so renaming
# one to the other would leave two categories with the same name.
# NOTE(review): the dict maps 'Master' -> '2n Cycle', which is the reverse of
# the intent stated above.

df['Education'] = df['Education'].cat.rename_categories({'Master': '2n Cycle'})

ValueError: Categorical categories must be unique

In [None]:
#Okay, this doesn't work because "Categorical categories must be unique."

In [None]:
# Cast 'Education' back from categorical to plain object dtype.
df = df.astype({'Education': 'object'})

In [None]:
df['Education'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2240 entries, 0 to 2239
Series name: Education
Non-Null Count  Dtype 
--------------  ----- 
2240 non-null   object
dtypes: object(1)
memory usage: 17.6+ KB


In [None]:
# Fold '2n Cycle' into 'Master'. Series.replace leaves every other label
# untouched, whereas Series.map with a dict turns any label that is missing
# from the dict into NaN without warning.
df['Education'] = df['Education'].replace({'2n Cycle': 'Master'})

In [None]:
df

Unnamed: 0.1,Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,0,5524,1957,Graduation,Single,58138.0,0,0,04/09/2012,58,...,10,4,7,0,0,0,0,0,0,1
1,1,2174,1954,Graduation,Single,46344.0,1,1,08/03/2014,38,...,1,2,5,0,0,0,0,0,0,0
2,2,4141,1965,Graduation,Together,71613.0,0,0,21/08/2013,26,...,2,10,4,0,0,0,0,0,0,0
3,3,6182,1984,Graduation,Together,26646.0,1,0,10/02/2014,26,...,0,4,6,0,0,0,0,0,0,0
4,4,5324,1981,PhD,Married,58293.0,1,0,19/01/2014,94,...,3,6,5,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,2235,10870,1967,Graduation,Married,61223.0,0,1,13/06/2013,46,...,3,4,5,0,0,0,0,0,0,0
2236,2236,4001,1946,PhD,Together,64014.0,2,1,10/06/2014,56,...,2,5,7,0,0,0,1,0,0,0
2237,2237,7270,1981,Graduation,Divorced,56981.0,0,0,25/01/2014,91,...,3,13,6,0,1,0,0,0,0,0
2238,2238,8235,1956,Master,Together,69245.0,0,1,24/01/2014,8,...,5,10,3,0,0,0,0,0,0,0


In [None]:
df['Education'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2240 entries, 0 to 2239
Series name: Education
Non-Null Count  Dtype 
--------------  ----- 
2240 non-null   object
dtypes: object(1)
memory usage: 17.6+ KB


In [None]:
df.loc[df['Education'] == '2n Cycle']

Unnamed: 0.1,Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response


In [None]:
#Cool, no more '2n Cycle' rows, which were multicollinear with 'Master'

In [None]:
# One-hot encode 'Education'; drop_first=True omits the indicator column for the first level.
df = pd.get_dummies(df, columns = ['Education'], drop_first=True)

In [None]:
# One-hot encode 'Marital_Status'; drop_first=True omits the indicator column for the first level.
df = pd.get_dummies(df, columns = ['Marital_Status'], drop_first=True)

In [None]:
df.loc[df['Marital_Status_YOLO'] == 1]

Unnamed: 0.1,Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,Education_Graduation,Education_Master,Education_PhD,Marital_Status_Alone,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO
2177,2177,492,1973,48432.0,0,1,18/10/2012,3,322,3,...,0,0,1,0,0,0,0,0,0,1
2202,2202,11133,1973,48432.0,0,1,18/10/2012,3,322,3,...,0,0,1,0,0,0,0,0,0,1


In [None]:
#Could recode 'YOLO' as 'Single' or 'Alone', but first we will see how robust the random forest is

In [None]:
# Parse the 'Dt_Customer' strings as day/month/year dates.
join_date = pd.to_datetime(df['Dt_Customer'], format = "%d/%m/%Y")

In [None]:
df['Join_date'] = join_date

In [None]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,Education_Master,Education_PhD,Marital_Status_Alone,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO,Join_date
0,0,5524,1957,58138.0,0,0,04/09/2012,58,635,88,...,0,0,0,0,0,1,0,0,0,2012-09-04
1,1,2174,1954,46344.0,1,1,08/03/2014,38,11,1,...,0,0,0,0,0,1,0,0,0,2014-03-08
2,2,4141,1965,71613.0,0,0,21/08/2013,26,426,49,...,0,0,0,0,0,0,1,0,0,2013-08-21
3,3,6182,1984,26646.0,1,0,10/02/2014,26,11,4,...,0,0,0,0,0,0,1,0,0,2014-02-10
4,4,5324,1981,58293.0,1,0,19/01/2014,94,173,43,...,0,1,0,0,1,0,0,0,0,2014-01-19


In [None]:
# Remove the 'Unnamed: 0' and 'ID' columns and the raw 'Dt_Customer' strings
# (the parsed dates are kept in 'Join_date').
df = df.drop(columns=['Unnamed: 0', 'ID', 'Dt_Customer'])

In [None]:
df

Unnamed: 0,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,...,Education_Master,Education_PhD,Marital_Status_Alone,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO,Join_date
0,1957,58138.0,0,0,58,635,88,546,172,88,...,0,0,0,0,0,1,0,0,0,2012-09-04
1,1954,46344.0,1,1,38,11,1,6,2,1,...,0,0,0,0,0,1,0,0,0,2014-03-08
2,1965,71613.0,0,0,26,426,49,127,111,21,...,0,0,0,0,0,0,1,0,0,2013-08-21
3,1984,26646.0,1,0,26,11,4,20,10,3,...,0,0,0,0,0,0,1,0,0,2014-02-10
4,1981,58293.0,1,0,94,173,43,118,46,27,...,0,1,0,0,1,0,0,0,0,2014-01-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,1967,61223.0,0,1,46,709,43,182,42,118,...,0,0,0,0,1,0,0,0,0,2013-06-13
2236,1946,64014.0,2,1,56,406,0,30,0,0,...,0,1,0,0,0,0,1,0,0,2014-06-10
2237,1981,56981.0,0,0,91,908,48,217,32,12,...,0,0,0,1,0,0,0,0,0,2014-01-25
2238,1956,69245.0,0,1,8,428,30,214,80,30,...,1,0,0,0,0,0,1,0,0,2014-01-24


In [None]:
#Getting dummies was necessary for dealing with MissForest's inability to convert string to float

In [None]:
#MissForest can't convert datetimes to floats either, so instead: take the latest join date as a reference point and express every date as its distance in days from that date

In [None]:
(df['Join_date'].max() - df['Join_date'].min()).days

699

In [None]:
futr_col = []

def days_from_present(dt):
    """Fill ``futr_col`` with each date's distance in whole days from the latest date.

    For every element ``x`` of ``dt`` the value ``(dt.max() - x).days`` is
    computed, in the order the elements are iterated. The module-level list
    ``futr_col`` is overwritten in place with these values, so calling the
    function again (e.g. re-running the cell) replaces the previous result
    instead of appending a second copy to it.

    Parameters
    ----------
    dt : iterable of datetimes that exposes ``.max()``
        For example a pandas datetime Series.

    Returns
    -------
    None
        The result is delivered through ``futr_col``.
    """
    latest = dt.max()  # hoisted: computed once rather than once per element
    # Slice assignment keeps the same list object, so existing references to
    # futr_col see the new values.
    futr_col[:] = [(latest - x).days for x in dt]

In [None]:
days_from_present(df['Join_date'])

In [None]:
df['days'] = futr_col

In [None]:
# 'Join_date' is now represented by the numeric 'days' column, so remove it.
df = df.drop(columns=['Join_date'])

In [None]:
df.head(0)

Unnamed: 0,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,...,Education_Master,Education_PhD,Marital_Status_Alone,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO,days


In [None]:
# Impute the missing values with MissForest. A fixed random_state makes the
# imputed values repeatable from one run to the next.
imputer = MissForest(random_state=0) #miss forest
X_imputed = imputer.fit_transform(df)
# Wrap the result back into a DataFrame, reattaching df's column labels and
# row index, and round to one decimal place.
X_imputed = pd.DataFrame(X_imputed, columns = df.columns, index = df.index).round(1)

  warn(


Iteration: 0


  warn(


Iteration: 1


  warn(


Iteration: 2


In [None]:
X_imputed.iloc[[2078]] #testing if previously NaN imputed

Unnamed: 0,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,...,Education_Master,Education_PhD,Marital_Status_Alone,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO,days
2078,1971.0,40780.0,1.0,1.0,82.0,71.0,1.0,16.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,483.0


In [None]:
# Write the imputed frame to 'fullcustomerinfo.csv'.
# NOTE(review): to_csv writes the row index as an extra leading column by
# default -- confirm that is wanted in the saved file.
X_imputed.to_csv('fullcustomerinfo.csv')

#saves to the current working directory