<h1>Wayfair Competition Jupyter Notebook Submission</h1>

Author: Alec James

<h3>Data Analysis Pipeline:</h3>

<b>Classes</b>
- [x] Preprocessing
- [x] Cleaning
- [ ] Modeling
- [ ] Visualization
- [ ] Insights




In [222]:
# Standard libraries
import pandas as pd  # Dataframes
import numpy as np  # arrays
import matplotlib.pyplot as plt  # plotting visuals

<h2>Class Preprocessing</h2>

This class holds all of my data preprocessing steps: importing, cleaning, computing useful statistics, and basic visualization.

<h3>Functions and brief descriptions</h3>

- [ ] \__init__(self)
- [ ] logit(self, logString)
- [ ] getDf(self)
- [ ] getLog(self)
- [ ] printLog(self)
- [ ] printList(self)


- [ ] dataImport(self, filename)


- [ ] categoryFeatures(self)
- [ ] categoryTotals(self, columnList)
- [ ] categoryFeaturePositiveComparisons(self, columnList, *compareColumns)
- [ ] categoryStats(self, categoryTotals, compareColumn)




In [219]:
class Preprocessing():
    """Import a dataset and summarise its categorical columns.

    The class keeps one pandas DataFrame (``self.df``) plus a running log of
    the actions performed on it (``self.log``).  Columns whose dtype is
    ``object`` are treated as categorical.
    """

    # ---- Initiation, log, get, and print functions ----

    def __init__(self):
        # Main dataframe.  Holds the placeholder -1 until dataImport()
        # succeeds; the placeholder is kept for backward compatibility.
        self.df = -1
        self.log = []  # class log list
        self.logCount = 0  # number of entries logged so far (used for debugging)

    def logit(self, logString):
        """Append ``logString`` to the class log, prefixed with ``LOG:<n> ``."""
        self.log.append("LOG:" + str(self.logCount) + " " + logString)
        self.logCount += 1

    def getDf(self):
        """Return the class DataFrame (or -1 if nothing was imported)."""
        return self.df

    def getLog(self):
        """Return the list of log entries."""
        return self.log

    def printLog(self):
        """Print every log entry on its own line."""
        self.printList(self.log)

    def printList(self, objList):
        """Print each element of ``objList`` on its own line."""
        for each in objList:
            print(each)

    # ---- Import ----

    def dataImport(self, filename):
        """Read the CSV ``filename`` into ``self.df`` and return ``self.df``.

        The first CSV column is used as the index.  If reading fails the
        error is written to the log and the previous value of ``self.df``
        is returned unchanged, so the call stays best-effort.
        """
        if isinstance(self.df, pd.DataFrame):
            self.logit("dataImport- Overwritting class data")
        else:
            self.logit("dataImport- Importing file")

        try:
            self.df = pd.read_csv(filename, index_col=0)
        except Exception as err:
            # Record the failure instead of swallowing it silently.
            self.logit("dataImport- Failed to import file: " + str(err))
        return self.df

    # ---- Basic statistics functions about data ----

    def categoryFeatures(self):
        """Return the names of all columns whose dtype is ``object``."""
        # Iterate (name, dtype) pairs rather than indexing dtypes by integer
        # position, which relies on a deprecated positional fallback.
        categoryColumns = [
            name for name, dtype in self.df.dtypes.items() if dtype == 'object'
        ]

        self.logit("CategoryFeatures- " + str(categoryColumns))  # Log
        return categoryColumns

    def categoryTotals(self, columnList):
        """Count the rows per unique value for each column in ``columnList``.

        Returns a flat list alternating column name and a dict mapping each
        unique value to its row count:
        ``[name1, {value: count, ...}, name2, {...}, ...]``.
        Missing values are counted with ``isna()`` because ``value_counts()``
        leaves them out.
        """
        categoryTotals = []

        for category in columnList:
            column = self.df[category]
            counts = column.value_counts()  # computed once per column
            missingCount = column.isna().sum()

            categoryTotalsDict = {}
            for uniqueCategoryInCol in column.unique():
                if pd.isna(uniqueCategoryInCol):
                    categoryTotalsDict[uniqueCategoryInCol] = missingCount
                else:
                    # .loc keeps the lookup label-based for every key type.
                    categoryTotalsDict[uniqueCategoryInCol] = counts.loc[uniqueCategoryInCol]

            categoryTotals.append(category)
            categoryTotals.append(categoryTotalsDict)

        self.logit("categoryTotals- ")
        return categoryTotals

    def categoryFeaturePositiveComparisons(self, columnList, *compareColumns):
        """Sum the positive values of each compare column per category value.

        For every column in ``columnList`` and every unique value in it, the
        rows where the compare column is > 0 are selected and that compare
        column is summed (rounded to 2 decimals).

        Returns one list per compare column, each alternating category
        column name and a dict ``{unique value: sum}``.

        Raises AssertionError if no compare column is given or if a compare
        column has ``object`` dtype.
        """
        assert len(compareColumns) > 0, "at least one compare column is required"
        for column in compareColumns:
            # must be a numbered type
            assert self.df[column].dtype != "object", \
                "compare column must be numeric: " + str(column)

        compareColumnsListForEach = [[] for _ in compareColumns]

        for category in columnList:
            categoryValues = self.df[category]
            uniqueValues = categoryValues.unique()

            for i, cmpCol in enumerate(compareColumns):
                isPositive = self.df[cmpCol] > 0  # same mask for every unique value
                sums = {}
                for uniqueCategoryInCol in uniqueValues:
                    rows = isPositive & (categoryValues == uniqueCategoryInCol)
                    sums[uniqueCategoryInCol] = self.df.loc[rows, cmpCol].sum().round(2)

                compareColumnsListForEach[i].append(category)
                compareColumnsListForEach[i].append(sums)

        self.logit("categoryFeaturePositiveComparisons- ")
        return compareColumnsListForEach

    def categoryStats(self, categoryTotals, compareColumn):
        """Divide the values of ``compareColumn`` by those of ``categoryTotals``.

        Both arguments are lists in the alternating ``name, dict`` layout
        produced by the methods above.  Entries are paired by position;
        only dict entries are processed.  A numerator of 0 yields 0 without
        dividing.  Returns a list with one ``{key: quotient}`` dict per
        processed entry.
        """
        average = []
        for totals, item in zip(categoryTotals, compareColumn):
            if not isinstance(item, dict):
                continue  # column-name entries carry no numbers

            averageDict = {}
            for x, value in item.items():
                if value == 0:
                    averageDict[x] = 0
                else:
                    averageDict[x] = value / totals.get(x)
            average.append(averageDict)

        self.logit("categoryStats- ")
        return average

        
        
# Driver: import the sample data, compute the per-category statistics, and
# print each result section.
pre = Preprocessing()
pre.dataImport('Training and Holdout Data/small.csv')
categoryColumns = pre.categoryFeatures()
categoryTotals = pre.categoryTotals(categoryColumns)
# Index 0 holds the 'convert_30' sums, index 1 the 'revenue_30' sums.
compareColumnsListForEach = pre.categoryFeaturePositiveComparisons(categoryColumns, 'convert_30', 'revenue_30')
percentLikelihood = pre.categoryStats(categoryTotals, compareColumnsListForEach[0])
averagePerPopulation = pre.categoryStats(categoryTotals, compareColumnsListForEach[1])
averagePerBuyer = pre.categoryStats(compareColumnsListForEach[0], compareColumnsListForEach[1])


print("----Categorical Columns----")
pre.printList(categoryColumns)

print("\n----Categorical Total for each Categorical Column----")
pre.printList(categoryTotals)

print("\n----Buyers Count in each unique category----")
pre.printList(compareColumnsListForEach[0])

print("\n----Buyers Revenue in each unique category----")
pre.printList(compareColumnsListForEach[1])

print("\n----Percent Likelihood someone in that category will buy----")
pre.printList(percentLikelihood)

print("\n----Average Spending of someone in that category----")
pre.printList(averagePerPopulation)

# Revenue divided by buyers: this section previously repeated the
# per-population figures and never showed averagePerBuyer.
print("\n----Average Spending of a buyer in that category----")
pre.printList(averagePerBuyer)


----Categorical Columns----
roll_up
currentstatus
companytypegroup
team
customersource
accrole
num_employees
num_purchases_year
cost_purchases_year
enrollmentmethod

----Categorical Total for each Categorical Column----
roll_up
{'Onboarding': 26}
currentstatus
{'Active': 8, 'Enrolled': 18}
companytypegroup
{'Business': 23, 'Trade': 3}
team
{'US': 25, 'CA': 1}
customersource
{'External Application': 6, 'Internal Application': 9, 'Internal Customer Scrape': 5, 'Gateway': 1, 'Social - Paid': 3, 'Search - Paid': 2}
accrole
{'None': 20, 'Primary': 5, 'Purchaser': 1}
num_employees
{'50plus': 3, '2to5': 6, '1': 4, '6to10': 3, 'None': 10}
num_purchases_year
{'1to2': 1, '25plus': 3, '3to5': 6, '11to25': 1, 'None': 13, '6to10': 2}
cost_purchases_year
{'lessthan1': 3, '25to100': 2, '1to5': 3, '5to25': 4, 'None': 14}
enrollmentmethod
{'directEIN': 13, 'email': 5, 'other': 5, 'phone': 3}

----Buyers Count in each unique category----
roll_up
{'Onboarding': 2}
currentstatus
{'Active': 1, 'Enrolled': 