In [1]:
import pandas as pd
from sklearn import preprocessing
import plotly 
import plotly.graph_objs as go
import cufflinks as cf
import plotly.tools as tls
import plotly.figure_factory as ff

plotly.offline.init_notebook_mode(connected=True)

In [2]:
!head -n 5 "data/Carseats_org.csv"

"","Sales","CompPrice","Income","Advertising","Population","Price","ShelveLoc","Age","Education","Urban","US"
"1",9.5,138,73,11,276,120,"Bad",42,17,"Yes","Yes"
"2",11.22,111,48,16,260,83,"Good",65,10,"Yes","Yes"
"3",10.06,113,35,10,269,80,"Medium",59,12,"Yes","Yes"
"4",7.4,117,100,4,466,97,"Medium",55,14,"Yes","Yes"


In [3]:
df = pd.read_csv('data/Carseats_org.csv')
df = df.drop(['Unnamed: 0'], axis=1)

In [4]:
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [5]:
print('Total dimensions: ', len(df.columns), '\n')

print(df.dtypes)

Total dimensions:  11 

Sales          float64
CompPrice        int64
Income           int64
Advertising      int64
Population       int64
Price            int64
ShelveLoc       object
Age              int64
Education        int64
Urban           object
US              object
dtype: object


In [6]:
def scaleNumeric(df, inplace=True):
    scaler = preprocessing.StandardScaler()
    num_cols = df.select_dtypes(exclude=['object'])
    
    if inplace:
        for col in num_cols:
            df[[col]] = scaler.fit_transform(df[[col]])
        return df
    else:
        for col in num_cols:
            new_name = str(col) + '_scaled'
            df[new_name] = scaler.fit_transform(df[[col]])
        return df 

df = scaleNumeric(df, True)
df.head()


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to float64 by StandardScaler.


Data with input dtype int64 were all converted to f

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,0.710376,0.850455,0.155361,0.657177,0.075819,0.177823,Bad,-0.699782,1.184449,Yes,Yes
1,1.320179,-0.912484,-0.73906,1.409957,-0.032882,-1.386854,Good,0.721723,-1.490113,Yes,Yes
2,0.908917,-0.781896,-1.204159,0.506621,0.028262,-1.513719,Medium,0.350895,-0.725953,Yes,Yes
3,-0.034151,-0.52072,1.121336,-0.396715,1.366649,-0.794814,Medium,0.103677,0.038208,Yes,Yes
4,-1.186395,1.046337,-0.166631,-0.547271,0.510625,0.516132,Bad,-0.947,-0.343872,Yes,No


In [7]:
def encodeCategorical(df):
    le = preprocessing.LabelEncoder()
    cat_cols = df.select_dtypes(include=['object'])
    
    for col in cat_cols:
        new_name = str(col) + '_encoded'
        df[new_name] = le.fit_transform(cat_cols[col])
        
    return df

df = encodeCategorical(df)
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,ShelveLoc_encoded,Urban_encoded,US_encoded
0,0.710376,0.850455,0.155361,0.657177,0.075819,0.177823,Bad,-0.699782,1.184449,Yes,Yes,0,1,1
1,1.320179,-0.912484,-0.73906,1.409957,-0.032882,-1.386854,Good,0.721723,-1.490113,Yes,Yes,1,1,1
2,0.908917,-0.781896,-1.204159,0.506621,0.028262,-1.513719,Medium,0.350895,-0.725953,Yes,Yes,2,1,1
3,-0.034151,-0.52072,1.121336,-0.396715,1.366649,-0.794814,Medium,0.103677,0.038208,Yes,Yes,2,1,1
4,-1.186395,1.046337,-0.166631,-0.547271,0.510625,0.516132,Bad,-0.947,-0.343872,Yes,No,0,1,0


In [8]:
df.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_encoded,Urban_encoded,US_encoded
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,-1.953993e-16,3.819167e-16,3.5527140000000005e-17,2.664535e-17,1.598721e-16,-6.217249e-17,1.287859e-16,-1.332268e-16,1.3075,0.705,0.645
std,1.001252,1.001252,1.001252,1.001252,1.001252,1.001252,1.001252,1.001252,0.833475,0.456614,0.479113
min,-2.657722,-3.132482,-1.705035,-0.9989392,-1.731343,-3.88188,-1.75046,-1.490113,0.0,0.0,0.0
25%,-0.7467695,-0.6513081,-0.9268889,-0.9989392,-0.8549372,-0.667948,-0.8388425,-0.7259527,1.0,0.0,0.0
50%,-0.002242444,0.001632351,0.01225357,-0.2461591,0.04864391,0.05095773,0.07277488,0.03820804,2.0,1.0,1.0
75%,0.6465597,0.6545728,0.7993444,0.807733,0.908065,0.6429978,0.7835274,0.8023688,2.0,1.0,1.0
max,3.11059,3.266335,1.836873,3.367185,1.658785,3.180312,1.648791,1.566529,2.0,1.0,1.0


In [11]:
for col in df.columns:
    if col != 'Sales' and not str(col).__contains__('encoded'):

        data = [
            go.Histogram2dContour(
                x=df['Sales'],
                y=df[col]
            )
        ]

        layout = go.Layout(title="Average Earnings for Graduates",
                xaxis=dict(title='School'),
                yaxis=dict(title='Salary (in thousands)'))

        fig = go.Figure(data=data, layout=layout)

        plotly.plotly.iplot(fig, filename='jupyter-styled_bar')


Consider using IPython.display.IFrame instead

