# Data Conversion Notebook
This module can be useful to store methods of converting, transforming, and imputing values for datasets.

In [65]:
import pandas as pd

In [66]:
train = pd.read_csv("train.csv")

In [77]:
class NumericDataFrame():
    """
    This class converts categorical types of a dataframe into numeric types.
    It also automatically imputes missing values as the row mean.
    """
    def __init__(self, df):
        self.type_objs = df.select_dtypes(exclude=['float64',"int64"])
        self.type_numeric = df.select_dtypes(include=['float64',"int64"])
        self.ordinal_list = []
        
    def impute(self, df):
        df = df.apply(lambda x: x.fillna(1),axis=0)
        return df

    def ordinal_to_numeric(self, df):
        for i in range(len(df.columns)-1):
            is_cat = df.iloc[:,i].values.any() in ("Gd","Ex","TA","Fa","Po")
            if is_cat == True:
                self.ordinal_list.append(df.columns[i])
                df[df.columns[i]] = df[df.columns[i]].map({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
        return df

    def get_transform_dict(self):
        transform_dict = {}
        for col in self.type_objs.columns:
            if col not in self.ordinal_list:
                cats = pd.Categorical(self.type_objs[col]).categories
                d = {}
                for i, cat in enumerate(cats):
                    d[cat] = i + 2
                transform_dict[col] = d
        return transform_dict

    def __call__(self):
        df = self.type_objs
        df = self.ordinal_to_numeric(df)
        df.replace(self.get_transform_dict(), inplace=True)
        df = pd.merge(self.type_numeric, df, left_index=True, right_index=True)
        df = self.impute(df)
        return df

In [78]:
# Example of calling instance of this class
train_numeric = NumericDataFrame(train)()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  regex=regex)


In [76]:
train_numeric

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,3.0,3.0,3.0,3.0,4,,,,10,6
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,3.0,3.0,3.0,3.0,4,,,,10,6
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,3.0,3.0,3.0,3.0,4,,,,10,6
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,7.0,4.0,3.0,3.0,4,,,,10,2
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,3.0,3.0,3.0,3.0,4,,,,10,6
5,6,50,85.0,14115,5,5,1993,1995,0.0,732,...,3.0,4.0,3.0,3.0,4,,4.0,4.0,10,6
6,7,20,75.0,10084,8,5,2004,2005,186.0,1369,...,3.0,3.0,3.0,3.0,4,,,,10,6
7,8,60,,10382,7,6,1973,1973,240.0,859,...,3.0,3.0,3.0,3.0,4,,,4.0,10,6
8,9,50,51.0,6120,7,5,1931,1950,0.0,0,...,7.0,4.0,2.0,3.0,4,,,,10,2
9,10,190,50.0,7420,5,6,1939,1950,0.0,851,...,3.0,3.0,4.0,3.0,4,,,,10,6
