# What are my features?  
It’s pretty easy to infer the following features from the column names:

**ST_NUM:** Street number

**ST_NAME:** Street name

**OWN_OCCUPIED:** Is the residence owner occupied

**NUM_BEDROOMS:** Number of bedrooms

We can also answer, what are the expected types?

**ST_NUM:** float or int… some sort of numeric type

**ST_NAME:** string

**OWN_OCCUPIED:** string… Y (“Yes”) or N (“No”)

**NUM_BEDROOMS:** float or int, a numeric type

 <img src="https://raw.githubusercontent.com/fazlyrabbi77/DataProcessing/master/data-cleaning-from-ibm-analytics.jpg">

In [0]:
import pandas as pd
import seaborn as sns
import numpy as np

  import pandas.util.testing as tm


In [0]:
#reading the dataset
df = pd.read_csv('https://raw.githubusercontent.com/fazlyrabbi77/DataProcessing/master/real-estate.csv')

In [0]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [0]:
df.head(2)

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1.0,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--


In [0]:
#Renaming Columns with inplace
df.rename(columns = {"NUM_BEDROOMS": "BEDROOMS",  "NUM_BATH":"BATH"},inplace=True) 

In [0]:
df.head()

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,--
2,100003000.0,,LEXINGTON,N,,1.0,850
3,100004000.0,201.0,BERKELEY,12,1.0,,700
4,,203.0,BERKELEY,Y,3.0,2.0,1600


In [0]:

#Check Null
df.isnull().values.any()

True

In [0]:
#Count the number of Null values
df.isnull().values.sum()

8

In [0]:
#Showing the null values as per attributes
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

PID             1
ST_NUM          2
OWN_OCCUPIED    1
BEDROOMS        2
BATH            1
SQ_FT           1
dtype: int64

In [0]:
# Showing Null Values
print(df[df.isnull().any(axis=1)][null_columns].head())

           PID  ST_NUM OWN_OCCUPIED BEDROOMS    BATH SQ_FT
2  100003000.0     NaN            N      NaN       1   850
3  100004000.0   201.0           12        1     NaN   700
4          NaN   203.0            Y        3       2  1600
5  100006000.0   207.0            Y      NaN       1   800
6  100007000.0     NaN          NaN        2  HURLEY   950


In [0]:
#Dropping columns without inplace
df.drop(['PID'], axis=1)

Unnamed: 0,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,104.0,PUTNAM,Y,3,1,1000
1,197.0,LEXINGTON,N,3,1.5,--
2,,LEXINGTON,N,,1,850
3,201.0,BERKELEY,12,1,,700
4,203.0,BERKELEY,Y,3,2,1600
5,207.0,BERKELEY,Y,,1,800
6,,WASHINGTON,,2,HURLEY,950
7,213.0,TREMONT,Y,1,1,
8,215.0,TREMONT,Y,na,2,1800


In [0]:
#dropping rows without inplace
df.drop([7,8])

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1,1000
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1.0,,700
4,,203.0,BERKELEY,Y,3.0,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2.0,HURLEY,950


In [0]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [0]:
# Filling null values with specific value
df['PID'].fillna(100005000,inplace=True)
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [0]:
# Row wise data filling
df.loc[2,'ST_NUM'] = 197
df.loc[6,'ST_NUM'] = 208
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,197.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,208.0,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [0]:
df.head(3)

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,--
2,100003000.0,197.0,LEXINGTON,N,,1.0,850


In [0]:
#unwanted value treatment
cnt=0
for row in df['OWN_OCCUPIED']:
    try:
        int(row)
        df.loc[cnt, 'OWN_OCCUPIED']=np.nan
    except ValueError:
        pass
    cnt+=1
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [0]:
df['BEDROOMS'] = pd.to_numeric(df['BEDROOMS'], errors='coerce')
df['BATH'] = pd.to_numeric(df['BATH'], errors='coerce')
df['SQ_FT'] = pd.to_numeric(df['SQ_FT'], errors='coerce')
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,
8,100009000.0,215.0,TREMONT,Y,,2.0,1800.0


In [0]:
df['OWN_OCCUPIED'].fillna(df['OWN_OCCUPIED'].mode()[0], inplace=True)
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,
8,100009000.0,215.0,TREMONT,Y,,2.0,1800.0


In [0]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,
8,100009000.0,215.0,TREMONT,Y,,2.0,1800.0


In [0]:
#Group By parameter check
df.groupby('BEDROOMS')['SQ_FT'].median()

BEDROOMS
1.0     700.0
2.0     950.0
3.0    1300.0
Name: SQ_FT, dtype: float64

In [0]:
# Filling Null with group by vparameter
df['SQ_FT'] = df['SQ_FT'].fillna(df.groupby('BEDROOMS')['SQ_FT'].transform('median'))
df['SQ_FT'] = df['SQ_FT'].fillna(df['SQ_FT'].median())
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,Y,,2.0,1800.0


In [0]:
df.loc[2,'BEDROOMS'] = 1
df.loc[5,'BEDROOMS'] = 1
df.loc[8,'BEDROOMS'] = 3
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,1.0,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,1.0,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,Y,3.0,2.0,1800.0


In [0]:
# Filling Null with group by vparameter
df['BEDROOMS'] = df['BEDROOMS'].fillna(df.groupby('SQ_FT')['BEDROOMS'].transform('median'))
df['BATH'] = df['BATH'].fillna(df['BATH'].median())

In [0]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,1.0,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,1.0,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,1.0,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,1.0,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,Y,3.0,2.0,1800.0


In [0]:
df['BATH']=df['BATH'].fillna(method='bfill')
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,1.0,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,1.0,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,1.0,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,1.0,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,Y,3.0,2.0,1800.0


In [0]:
#Converting street number to int
df.PID = df.PID.astype('int64') 
df.ST_NUM = df.ST_NUM.astype('int64') 
df.BEDROOMS = df.BEDROOMS.astype('int64') 
df.BATH = df.BATH.astype('int64') 
df.SQ_FT = df.SQ_FT.astype('int64') 
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,PUTNAM,Y,3,1,1000
1,100002000,197,LEXINGTON,N,3,1,1300
2,100003000,197,LEXINGTON,N,1,1,850
3,100004000,201,BERKELEY,Y,1,1,700
4,100005000,203,BERKELEY,Y,3,2,1600
5,100006000,207,BERKELEY,Y,1,1,800
6,100007000,208,WASHINGTON,Y,2,1,950
7,100008000,213,TREMONT,Y,1,1,700
8,100009000,215,TREMONT,Y,3,2,1800


In [0]:
df2=df
df1=df

In [0]:
df2

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,PUTNAM,Y,3,1,1000
1,100002000,197,LEXINGTON,N,3,1,1300
2,100003000,197,LEXINGTON,N,1,1,850
3,100004000,201,BERKELEY,Y,1,1,700
4,100005000,203,BERKELEY,Y,3,2,1600
5,100006000,207,BERKELEY,Y,1,1,800
6,100007000,208,WASHINGTON,Y,2,1,950
7,100008000,213,TREMONT,Y,1,1,700
8,100009000,215,TREMONT,Y,3,2,1800


In [0]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder() 
df['ST_NAME']= le.fit_transform(df['ST_NAME']) 
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,Y,3,1,1000
1,100002000,197,1,N,3,1,1300
2,100003000,197,1,N,1,1,850
3,100004000,201,0,Y,1,1,700
4,100005000,203,0,Y,3,2,1600
5,100006000,207,0,Y,1,1,800
6,100007000,208,4,Y,2,1,950
7,100008000,213,3,Y,1,1,700
8,100009000,215,3,Y,3,2,1800


In [0]:
#use of mapping function
mapping = {'Y' :1 , 'N' : 2 }
df['OWN_OCCUPIED'] = df['OWN_OCCUPIED'].map(mapping)
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000
1,100002000,197,1,2,3,1,1300
2,100003000,197,1,2,1,1,850
3,100004000,201,0,1,1,1,700
4,100005000,203,0,1,3,2,1600
5,100006000,207,0,1,1,1,800
6,100007000,208,4,1,2,1,950
7,100008000,213,3,1,1,1,700
8,100009000,215,3,1,3,2,1800


In [0]:
#One Hot Encoding for nominal data
df1 = pd.get_dummies(df, columns=['OWN_OCCUPIED'])
df1

Unnamed: 0,PID,ST_NUM,ST_NAME,BEDROOMS,BATH,SQ_FT,OWN_OCCUPIED_1,OWN_OCCUPIED_2
0,100001000,104,2,3,1,1000,1,0
1,100002000,197,1,3,1,1300,0,1
2,100003000,197,1,1,1,850,0,1
3,100004000,201,0,1,1,700,1,0
4,100005000,203,0,3,2,1600,1,0
5,100006000,207,0,1,1,800,1,0
6,100007000,208,4,2,1,950,1,0
7,100008000,213,3,1,1,700,1,0
8,100009000,215,3,3,2,1800,1,0


In [0]:
#Scaling
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

df2 = pd.DataFrame(std.fit_transform(df.values.reshape(-1, 1)))

In [0]:
df2

Unnamed: 0,0
0,2.449375
1,-0.408251
2,-0.408254
3,-0.408254
4,-0.408254
...,...
58,-0.408254
59,-0.408254
60,-0.408254
61,-0.408254


In [0]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df)
scaled = pd.DataFrame(scaler.transform(df),columns=df.columns)
scaled

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,0.0,0.0,0.5,0.0,1.0,0.0,0.272727
1,0.125,0.837838,0.25,1.0,1.0,0.0,0.545455
2,0.25,0.837838,0.25,1.0,0.0,0.0,0.136364
3,0.375,0.873874,0.0,0.0,0.0,0.0,0.0
4,0.5,0.891892,0.0,0.0,1.0,1.0,0.818182
5,0.625,0.927928,0.0,0.0,0.0,0.0,0.090909
6,0.75,0.936937,1.0,0.0,0.5,0.0,0.227273
7,0.875,0.981982,0.75,0.0,0.0,0.0,0.0
8,1.0,1.0,0.75,0.0,1.0,1.0,1.0


In [0]:
df3=df

In [0]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000
1,100002000,197,1,2,3,1,1300
2,100003000,197,1,2,1,1,850
3,100004000,201,0,1,1,1,700
4,100005000,203,0,1,3,2,1600
5,100006000,207,0,1,1,1,800
6,100007000,208,4,1,2,1,950
7,100008000,213,3,1,1,1,700
8,100009000,215,3,1,3,2,1800


In [0]:
from sklearn.preprocessing import normalize
data_normalized = normalize(df)
data_normalized = pd.DataFrame(data_normalized, columns=df.columns)
data_normalized

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,1.0,1e-06,1.99998e-08,9.9999e-09,2.99997e-08,9.9999e-09,1e-05
1,1.0,2e-06,9.9998e-09,1.99996e-08,2.99994e-08,9.9998e-09,1.3e-05
2,1.0,2e-06,9.9997e-09,1.99994e-08,9.9997e-09,9.9997e-09,8e-06
3,1.0,2e-06,0.0,9.9996e-09,9.9996e-09,9.9996e-09,7e-06
4,1.0,2e-06,0.0,9.9995e-09,2.99985e-08,1.9999e-08,1.6e-05
5,1.0,2e-06,0.0,9.9994e-09,9.9994e-09,9.9994e-09,8e-06
6,1.0,2e-06,3.99972e-08,9.9993e-09,1.99986e-08,9.9993e-09,9e-06
7,1.0,2e-06,2.99976e-08,9.9992e-09,9.9992e-09,9.9992e-09,7e-06
8,1.0,2e-06,2.99973e-08,9.9991e-09,2.99973e-08,1.99982e-08,1.8e-05


In [0]:
#Scaling
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

standard= pd.DataFrame(std.fit_transform(df3.values.reshape(-1, 1)))

In [0]:
standard

Unnamed: 0,0
0,2.645630
1,-0.377967
2,-0.377970
3,-0.377970
4,-0.377970
...,...
67,-0.377970
68,-0.377970
69,-0.377970
70,-0.377916


In [0]:
import pandas as pd    
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df_scaledstd = pd.DataFrame(ss.fit_transform(df),columns = df.columns)
df_scaledstd

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,-1.549193,-2.779199,0.312348,-0.534522,1.06066,-0.534522,-0.206195
1,-1.161895,0.09619,-0.390434,1.870829,1.06066,-0.534522,0.589128
2,-0.774597,0.09619,-0.390434,1.870829,-1.06066,-0.534522,-0.603856
3,-0.387298,0.219862,-1.093216,-0.534522,-1.06066,-0.534522,-1.001517
4,0.0,0.281699,-1.093216,-0.534522,1.06066,1.870829,1.38445
5,0.387298,0.405371,-1.093216,-0.534522,-1.06066,-0.534522,-0.73641
6,0.774597,0.43629,1.717911,-0.534522,0.0,-0.534522,-0.338748
7,1.161895,0.59088,1.015129,-0.534522,-1.06066,-0.534522,-1.001517
8,1.549193,0.652717,1.015129,-0.534522,1.06066,1.870829,1.914665


Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,-1.549193,-2.779199,0.312348,-0.534522,1.06066,-0.534522,-0.206195
1,-1.161895,0.09619,-0.390434,1.870829,1.06066,-0.534522,0.589128
2,-0.774597,0.09619,-0.390434,1.870829,-1.06066,-0.534522,-0.603856
3,-0.387298,0.219862,-1.093216,-0.534522,-1.06066,-0.534522,-1.001517
4,0.0,0.281699,-1.093216,-0.534522,1.06066,1.870829,1.38445
5,0.387298,0.405371,-1.093216,-0.534522,-1.06066,-0.534522,-0.73641
6,0.774597,0.43629,1.717911,-0.534522,0.0,-0.534522,-0.338748
7,1.161895,0.59088,1.015129,-0.534522,-1.06066,-0.534522,-1.001517
8,1.549193,0.652717,1.015129,-0.534522,1.06066,1.870829,1.914665


In [0]:
from sklearn.preprocessing import RobustScaler
robust = RobustScaler()
robust_scaled_df = robust.fit_transform(df)
robust_scaled_df = pd.DataFrame(robust_scaled_df, columns=df.columns)
robust_scaled_df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,-1.0,-9.0,0.333333,0.0,0.5,0.0,0.1
1,-0.75,-0.545455,0.0,1.0,0.5,0.0,0.7
2,-0.5,-0.545455,0.0,1.0,-0.5,0.0,-0.2
3,-0.25,-0.181818,-0.333333,0.0,-0.5,0.0,-0.5
4,0.0,0.0,-0.333333,0.0,0.5,1.0,1.3
5,0.25,0.363636,-0.333333,0.0,-0.5,0.0,-0.3
6,0.5,0.454545,1.0,0.0,0.0,0.0,0.0
7,0.75,0.909091,0.666667,0.0,-0.5,0.0,-0.5
8,1.0,1.090909,0.666667,0.0,0.5,1.0,1.7


In [0]:
transposed_df=df.transpose()
transposed_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
PID,100001000,100002000,100003000,100004000,100005000,100006000,100007000,100008000,100009000
ST_NUM,104,197,197,201,203,207,208,213,215
ST_NAME,2,1,1,0,0,0,4,3,3
OWN_OCCUPIED,1,2,2,1,1,1,1,1,1
BEDROOMS,3,3,1,1,3,1,2,1,3
BATH,1,1,1,1,2,1,1,1,2
SQ_FT,1000,1300,850,700,1600,800,950,700,1800
