# SAS with Python - saspy module

## Step 0 : Environment Setup

In [2]:
# Display the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to show several results (e.g. type(...) and .head()) at once.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [4]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
# Global pandas display settings

# Remove the column and row display limits so dataframes are never truncated
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Step 1 : Configure SAS Session

- Start SAS Session
- Enter Login Credentials

In [6]:
import saspy

# Start a SAS session over IOM against the listed hosts, using the local Java runtime.
# Credentials must never be written into the notebook: saspy prompts for the
# IOM user id and password when the session is created.
# NOTE(review): the Java path below is machine-specific; adjust it for your environment.
sas = saspy.SASsession(
    java='C:\\Program Files\\Java\\jdk-15.0.1\\bin\\java.exe',
    iomhost=['odaws01-apse1.oda.sas.com', 'odaws02-apse1.oda.sas.com'],
    iomport=8591,
    encoding='utf-8',
)
# Echo the session object to display the connection summary.
sas

Using SAS Config named: default
Please enter the IOM user id: <redacted>
Please enter the password for IOM user : ········
SAS Connection established. Subprocess id is 22728



Access Method         = IOM
SAS Config name       = default
SAS Config file       = C:\Users\abhi0\anaconda3\lib\site-packages\saspy\sascfg.py
WORK Path             = /saswork/SAS_work0D0F000176FE_odaws01-apse1.oda.sas.com/SAS_workB516000176FE_odaws01-apse1.oda.sas.com/
SAS Version           = 9.04.01M6P11072018
SASPy Version         = 3.6.4
Teach me SAS          = False
Batch                 = False
Results               = Pandas
SAS Session Encoding  = utf-8
Python Encoding value = utf-8
SAS process Pid value = 95998


## Step 2 : Run SAS Procedure

In [8]:
%%SAS sas
/* Print the first four observations of SASHELP.CARS using the `sas` session. */
proc print data=sashelp.cars (obs=4);
run;

Obs,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
1,Acura,MDX,SUV,Asia,All,"$36,945","$33,337",3.5,6,265,17,23,4451,106,189
2,Acura,RSX Type S 2dr,Sedan,Asia,Front,"$23,820","$21,761",2.0,4,200,24,31,2778,101,172
3,Acura,TSX 4dr,Sedan,Asia,Front,"$26,990","$24,647",2.4,4,200,22,29,3230,105,183
4,Acura,TL 4dr,Sedan,Asia,Front,"$33,195","$30,299",3.2,6,270,20,28,3575,108,186


In [9]:
# Submit a PROC PRINT of the first five rows of SASHELP.CARS from Python.
sc = (
    "proc print data=sashelp.cars (obs=5); "
    "run;"
)
scp = sas.submitLST(code=sc, method='listorlog')

Obs,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
1,Acura,MDX,SUV,Asia,All,"$36,945","$33,337",3.5,6,265,17,23,4451,106,189
2,Acura,RSX Type S 2dr,Sedan,Asia,Front,"$23,820","$21,761",2.0,4,200,24,31,2778,101,172
3,Acura,TSX 4dr,Sedan,Asia,Front,"$26,990","$24,647",2.4,4,200,22,29,3230,105,183
4,Acura,TL 4dr,Sedan,Asia,Front,"$33,195","$30,299",3.2,6,270,20,28,3575,108,186
5,Acura,3.5 RL 4dr,Sedan,Asia,Front,"$43,755","$39,014",3.5,6,225,18,24,3880,115,197


In [10]:
# Copy DICTIONARY.TABLES into WORK.DICT_TABLES with PROC SQL.
sc = (
    "proc sql; "
    "create table work.dict_tables as select * from dictionary.tables; "
    "quit;"
)
scp = sas.submitLST(code=sc, method='listorlog')

In [11]:
# Print the first five rows of the WORK.DICT_TABLES dataset.
sc = (
    "proc print data=work.dict_tables (obs=5); "
    "run;"
)
scp = sas.submitLST(code=sc, method='listorlog')

Obs,libname,memname,memtype,dbms_memtype,memlabel,typemem,crdate,modate,nobs,obslen,nvar,protect,compress,encrypt,npage,filesize,pcompress,reuse,bufsize,delobs,nlobs,maxvar,maxlabel,maxgen,gen,attr,indxtype,datarep,sortname,sorttype,sortchar,reqvector,datarepname,encoding,audit,audit_before,audit_admin,audit_error,audit_data,num_character,num_numeric
1,WORK,__JUPYTERSASKERNEL__,DATA,,,DATA,08JAN21:15:21:16,08JAN21:15:21:16,300,150,2,---,NO,NO,1,262144,0,no,131072,0,300,8,0,0,.,ON,,NATIVE,,,,181F10113322003333010231013301233300141400200301,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64",utf-8 Unicode (UTF-8),no,no,no,no,no,2,0
2,MAPS,AFGHANI2,DATA,,AFGHANISTAN: Copyright(C) 1998 SAS Institute Inc.,SFT,28MAY15:01:19:54,28MAY15:01:19:54,29,72,4,---,NO,NO,1,131072,0,no,65536,0,29,14,37,0,.,ON,,NATIVE,,,,181F101133220033330102310133012333001C0000200301,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64",us-ascii ASCII (ANSI),no,no,no,no,no,2,2
3,MAPS,AFGHANIS,DATA,,AFGHANISTAN: Copyright(C) SAS Institute Inc. - modified Jan2005,DATA,28MAY15:01:19:54,28MAY15:01:19:54,2644,34,6,---,NO,NO,2,196608,0,no,65536,0,2644,7,36,0,.,ON,,NATIVE,,,,181F101133220033330102310133012333001C0000200301,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64",us-ascii ASCII (ANSI),no,no,no,no,no,0,6
4,MAPS,AFRICA,DATA,,AFRICA: Source - Derived from CIA 2001 World Shapefile - Modified May 2008,DATA,28MAY15:01:19:54,28MAY15:01:19:54,52824,56,8,---,NO,NO,46,3080192,0,no,65536,0,52824,7,32,0,.,ON,,NATIVE,,S,ANSI,181F101133220033330102310133012333001C0000200301,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64",us-ascii ASCII (ANSI),no,no,no,no,no,0,8
5,MAPS,AFRICA2,DATA,,AFRICA-Country Names:GEOnet Names Server/CIA mapdata - modified May 2008 SAS Institute Inc.,SFT,28MAY15:01:15:01,28MAY15:01:15:01,61,202,10,---,NO,NO,1,131072,0,no,65536,0,61,10,44,0,.,ON,,NATIVE,,S,ANSI,181F101133220033330102310133012333001C0000200301,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64",us-ascii ASCII (ANSI),no,no,no,no,no,4,6


## Step 3 : Transfer Data between Pandas Dataframe and SAS

- _Function **df2sd** converts a pandas DataFrame to a SAS dataset._

In [12]:
# Read the CSV into a pandas DataFrame
pandasdf = pd.read_csv("./heart.csv")
type(pandasdf)

# Upload the DataFrame to the SAS session as the dataset `sasdf`
sasdf = sas.df2sd(df=pandasdf, table='sasdf')
type(sasdf)

# Print the first three rows of WORK.SASDF from the SAS side
sas.submitLST(code="proc print data=work.sasdf (obs=3);run;", method='listorlog')

pandas.core.frame.DataFrame

saspy.sasdata.SASdata

Obs,age,sex,BP,cholestrol,heart disease
1,70,1,130,322,1
2,67,0,115,564,0
3,57,1,124,261,1


- _Function **sd2df** converts a SAS dataset to a pandas DataFrame._

In [13]:
# Download the SAS dataset behind `sasdf` into a pandas DataFrame
pandasdf2 = sas.sd2df(table=sasdf.table)
type(pandasdf2)
pandasdf2.head(n=5)

pandas.core.frame.DataFrame

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


### Creating a saspy.sasdata.SASdata Object

In [42]:
# Create a SASdata handle for the SASHELP.CARS dataset and preview it
cars = sas.sasdata(table='cars', libref='sashelp')
type(cars)
cars.head(obs=5)

saspy.sasdata.SASdata

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945,33337,3.5,6,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820,21761,2.0,4,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,26990,24647,2.4,4,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,33195,30299,3.2,6,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755,39014,3.5,6,225,18,24,3880,115,197


In [43]:
# Create a SASdata handle for the SASHELP.VTABLE view and preview three rows
dict_tables = sas.sasdata(table='vtable', libref='sashelp')
type(dict_tables)
dict_tables.head(obs=3)

saspy.sasdata.SASdata

Unnamed: 0,libname,memname,memtype,dbms_memtype,memlabel,typemem,crdate,modate,nobs,obslen,nvar,protect,compress,encrypt,npage,filesize,pcompress,reuse,bufsize,delobs,nlobs,maxvar,maxlabel,maxgen,gen,attr,indxtype,datarep,sortname,sorttype,sortchar,reqvector,datarepname,encoding,audit,audit_before,audit_admin,audit_error,audit_data,num_character,num_numeric
0,WORK,DICT_TABLES,DATA,,,DATA,2021-01-08 13:52:26.528801,2021-01-08 13:52:26.528801,1304.0,1032,41,---,NO,NO,11,1572864,0,no,131072,0,1304.0,13,31,0,,ON,,NATIVE,,,,181F10113322003333202031203320233300141400200320,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LIN...",utf-8 Unicode (UTF-8),no,no,no,no,no,24,17
1,WORK,SASDATA2DATAFRAME,VIEW,,,VIEW,2021-01-08 14:16:40.455368,2021-01-08 14:16:40.455368,,152,15,---,NO,NO,0,0,0,no,0,0,,11,0,0,,O,,NATIVE,,,,000000000000000000000000000000000000000000000000,,Default,no,no,no,no,no,5,10
2,WORK,SASDF,DATA,,,DATA,2021-01-08 14:14:53.157033,2021-01-08 14:14:53.157033,270.0,40,5,---,NO,NO,1,262144,0,no,131072,0,270.0,13,0,0,,ON,,NATIVE,,,,181F10113322003333202031203320233300141400200320,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LIN...",utf-8 Unicode (UTF-8),no,no,no,no,no,0,5


In [47]:
# Binding another name to the handle does not copy it: `x` is the same SASdata object
x = dict_tables
type(x)
x.head(obs=2)

saspy.sasdata.SASdata

Unnamed: 0,libname,memname,memtype,dbms_memtype,memlabel,typemem,crdate,modate,nobs,obslen,nvar,protect,compress,encrypt,npage,filesize,pcompress,reuse,bufsize,delobs,nlobs,maxvar,maxlabel,maxgen,gen,attr,indxtype,datarep,sortname,sorttype,sortchar,reqvector,datarepname,encoding,audit,audit_before,audit_admin,audit_error,audit_data,num_character,num_numeric
0,WORK,DICT_TABLES,DATA,,,DATA,2021-01-08 13:52:26.528801,2021-01-08 13:52:26.528801,1304.0,1032,41,---,NO,NO,11,1572864,0,no,131072,0,1304.0,13,31,0,,ON,,NATIVE,,,,181F10113322003333202031203320233300141400200320,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LIN...",utf-8 Unicode (UTF-8),no,no,no,no,no,24,17
1,WORK,SASDATA2DATAFRAME,VIEW,,,VIEW,2021-01-08 14:17:27.564131,2021-01-08 14:17:27.564131,,1031,41,---,NO,NO,0,0,0,no,0,0,,13,0,0,,O,,NATIVE,,,,000000000000000000000000000000000000000000000000,,Default,no,no,no,no,no,24,17


### SAS DS to Pandas DF - Method 1
- Creating a dataset in sas
- Converting it to Pandas Dataframe using **sd2df**

In [58]:
# Create the dataset DICT_TABLES on the SAS side from the SASHELP.VTABLE view.
sas.submitLST("data dict_tables; set sashelp.vtable; run;", method='listonly') # method='listorlog'

# Name the dataset explicitly instead of going through `dict_tables.table`:
# when the notebook is run top to bottom, `dict_tables` still refers to the
# SASHELP.VTABLE handle here, so its `.table` would not point at the dataset just created.
pandasdf2 = sas.sd2df('dict_tables', libref='work')
type(pandasdf2)
pandasdf2.head(2)

pandas.core.frame.DataFrame

Unnamed: 0,libname,memname,memtype,dbms_memtype,memlabel,typemem,crdate,modate,nobs,obslen,nvar,protect,compress,encrypt,npage,filesize,pcompress,reuse,bufsize,delobs,nlobs,maxvar,maxlabel,maxgen,gen,attr,indxtype,datarep,sortname,sorttype,sortchar,reqvector,datarepname,encoding,audit,audit_before,audit_admin,audit_error,audit_data,num_character,num_numeric
0,WORK,DICT_TABLES,DATA,,,DATA,2021-01-08 14:35:35.493949,2021-01-08 14:35:35.493949,1304.0,1032,41,---,NO,NO,11,1572864,0,no,131072,0,1304.0,13,31,0,,ON,,NATIVE,,,,181F10113322003333202031203320233300141400200320,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LIN...",utf-8 Unicode (UTF-8),no,no,no,no,no,24,17
1,WORK,SASDATA2DATAFRAME,VIEW,,,VIEW,2021-01-08 14:35:39.483875,2021-01-08 14:35:39.483875,,1031,41,---,NO,NO,0,0,0,no,0,0,,13,0,0,,O,,NATIVE,,,,000000000000000000000000000000000000000000000000,,Default,no,no,no,no,no,24,17


### SAS DS to Pandas DF - Method 2

- Creating a SAS Data Object
- Using the SAS Data Object method **SAS_Data_Obj.to_df()** to convert it to a pandas DataFrame object
- [sas-data-object](https://sassoftware.github.io/saspy/api.html#sas-data-object)

In [49]:
# Point a SASdata handle at the WORK.DICT_TABLES dataset and preview three rows
dict_tables = sas.sasdata(table='dict_tables', libref='work')
type(dict_tables)
dict_tables.head(obs=3)

saspy.sasdata.SASdata

Unnamed: 0,libname,memname,memtype,dbms_memtype,memlabel,typemem,crdate,modate,nobs,obslen,nvar,protect,compress,encrypt,npage,filesize,pcompress,reuse,bufsize,delobs,nlobs,maxvar,maxlabel,maxgen,gen,attr,indxtype,datarep,sortname,sorttype,sortchar,reqvector,datarepname,encoding,audit,audit_before,audit_admin,audit_error,audit_data,num_character,num_numeric
0,WORK,DICT_TABLES,DATA,,,DATA,2021-01-08 13:52:26.528801,2021-01-08 13:52:26.528801,1304.0,1032,41,---,NO,NO,11,1572864,0,no,131072,0,1304.0,13,31,0,,ON,,NATIVE,,,,181F10113322003333202031203320233300141400200320,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LIN...",utf-8 Unicode (UTF-8),no,no,no,no,no,24,17
1,WORK,SASDATA2DATAFRAME,VIEW,,,VIEW,2021-01-08 14:18:59.737185,2021-01-08 14:18:59.737185,,1031,41,---,NO,NO,0,0,0,no,0,0,,13,0,0,,O,,NATIVE,,,,000000000000000000000000000000000000000000000000,,Default,no,no,no,no,no,24,17
2,WORK,SASDF,DATA,,,DATA,2021-01-08 14:14:53.157033,2021-01-08 14:14:53.157033,270.0,40,5,---,NO,NO,1,262144,0,no,131072,0,270.0,13,0,0,,ON,,NATIVE,,,,181F10113322003333202031203320233300141400200320,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LIN...",utf-8 Unicode (UTF-8),no,no,no,no,no,0,5


In [55]:
# Convert the SASdata handle into a pandas DataFrame and show its last two rows
s = dict_tables.to_df()
type(s)
s.tail(n=2)

pandas.core.frame.DataFrame

Unnamed: 0,libname,memname,memtype,dbms_memtype,memlabel,typemem,crdate,modate,nobs,obslen,nvar,protect,compress,encrypt,npage,filesize,pcompress,reuse,bufsize,delobs,nlobs,maxvar,maxlabel,maxgen,gen,attr,indxtype,datarep,sortname,sorttype,sortchar,reqvector,datarepname,encoding,audit,audit_before,audit_admin,audit_error,audit_data,num_character,num_numeric
1302,MAPSGFK,ZIMBABWE,DATA,,ZIMBABWE - Source: GfkGeoMarketing - 2014,GFK,2018-10-25 02:27:52.794136,2018-10-25 02:27:52.794136,28030.0,80,9,---,NO,NO,35,2359296,0,no,65536,0,28030.0,10,43,0,,ON,,NATIVE,,S,ANSI,181F101133220033332020312033202333001C0000200320,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LIN...",us-ascii ASCII (ANSI),no,no,no,no,no,2,7
1303,MAPSGFK,ZIMBABWE_ATTR,DATA,,ZIMBABWE - Source: GfkGeoMarketing - 2014,GFK,2018-10-25 02:27:45.941241,2018-10-25 02:27:45.941241,59.0,218,9,---,NO,NO,1,131072,0,no,65536,0,59.0,9,34,0,,ON,,NATIVE,,S,ANSI,181F101133220033332020312033202333001C0000200320,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LIN...",us-ascii ASCII (ANSI),no,no,no,no,no,8,1


In [61]:
# Disabled: uncomment the next line to display the DataFrame `s` transposed.
#s.T

#### Re-confirming Method 1: a SAS dataset is created in SAS, then converted to a pandas DataFrame using sd2df

In [62]:
# Rebuild WORK.DICT_TABLES from DICTIONARY.TABLES with PROC SQL.
sc = (
    "proc sql; "
    "create table work.dict_tables as select * from dictionary.tables; "
    "quit;"
)
scp = sas.submitLST(code=sc, method='listonly')

In [63]:
# Pull the dataset named by the `dict_tables` handle into pandas and preview two rows
dict_table_sql = sas.sd2df(table=dict_tables.table)
type(dict_table_sql)
dict_table_sql.head(n=2)

pandas.core.frame.DataFrame

Unnamed: 0,libname,memname,memtype,dbms_memtype,memlabel,typemem,crdate,modate,nobs,obslen,nvar,protect,compress,encrypt,npage,filesize,pcompress,reuse,bufsize,delobs,nlobs,maxvar,maxlabel,maxgen,gen,attr,indxtype,datarep,sortname,sorttype,sortchar,reqvector,datarepname,encoding,audit,audit_before,audit_admin,audit_error,audit_data,num_character,num_numeric
0,WORK,DICT_TABLES,DATA,,,DATA,2021-01-08 14:35:45.989680,2021-01-08 14:35:45.989680,1304.0,1032,41,---,NO,NO,11,1572864,0,no,131072,0,1304.0,13,31,0,,ON,,NATIVE,,,,181F10113322003333202031203320233300141400200320,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LIN...",utf-8 Unicode (UTF-8),no,no,no,no,no,24,17
1,WORK,SASDATA2DATAFRAME,VIEW,,,VIEW,2021-01-08 14:35:50.024022,2021-01-08 14:35:50.024022,,1031,41,---,NO,NO,0,0,0,no,0,0,,13,0,0,,O,,NATIVE,,,,000000000000000000000000000000000000000000000000,,Default,no,no,no,no,no,24,17


## 2. Reading the Input data (csv) file

In [5]:
 heart = pd.read_csv('./heart.csv')

In [6]:
# Preview the first rows of the heart dataset.
heart.head()

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


## 3. Data Analysis & Cleaning

In [7]:
# Number of rows and columns, as a (rows, columns) tuple
heart.shape

(270, 5)

In [8]:
# Overview of each column's dtype and non-null count, plus memory usage
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            270 non-null    int64
 1   sex            270 non-null    int64
 2   BP             270 non-null    int64
 3   cholestrol     270 non-null    int64
 4   heart disease  270 non-null    int64
dtypes: int64(5)
memory usage: 10.7 KB


### Checking Missing Values

In [9]:
# Does any column contain at least one missing value?
heart.isna().sum().any()

# Total number of rows, for reference
len(heart)

# Percentage of missing values per column; keep only the columns above 40%.
ser = heart.isna().sum().div(len(heart)).mul(100)
null_drps = ser.loc[ser > 40]
null_drps

False

270

Series([], dtype: float64)

In [12]:
# Checking the info of the remaining columns with NULLs
# `nulls` is computed in this cell so that it runs on a fresh kernel: it keeps the
# per-column null counts only for the columns that contain at least one null.
null_counts = heart.isnull().sum()
nulls = null_counts[null_counts > 0]
heart[nulls.index].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Empty DataFrame