# The smart data analysis assistant 

#### File system and data handling

In [26]:
import pathlib as path

In [27]:
data_dir = path.Path("datasets")
if not data_dir.exists:
    data_dir.mkdir(parents=True, exist_ok=True)
else :
    print("Path {} already exists".format(data_dir))

Path datasets already exists


##### Validate input. Returns boolean

In [28]:


# read in excel or csv file
def validate_file(source):
    if path.Path(source).is_file():
        if path.Path(source).suffix in [".csv",".xlsx"]:
            print("file is valid")
            return True
        else:
            print("unaccepted file format")
            print(path.Path(source).suffix)
            return False
    else:
        print("file is invalid")
        return False

##### read input. Returns tuple (dataframe/None,source)

In [29]:
#read dataset if valid
import pandas as pd
def read_input(source):
    is_valid = validate_file(source)
    if is_valid:
        if path.Path(source).suffix == ".csv":
            data = pd.read_csv(source)
            return (data,source)

        elif path.Path(source).suffix == ".xlsx":
            data = pd.read_excel(source)
            return (data,source)
    return (None,source)

##### Test read_input function

In [30]:
#read invalid file
res = read_input("")
if res[0] is None:
    print("Provide file ' {} ' in ivalid".format(res[1]))


file is invalid
Provide file '  ' in ivalid


In [31]:
#read valid file
res = read_input("../Datasets/pima-indians-diabetes.csv")
print("file '{}' is valid".format(res[1]))

file is valid
file '../Datasets/pima-indians-diabetes.csv' is valid


##### Grab dataframe as first item in tuple

In [32]:
data = res[0]

##### Trying to describe data

In [64]:
def describe_data(data):
    if len(data) > 0:
        # Take a peep at the data
        print("Take a peep ...")
        print(data.head())

        print("Datatypes in dataset ...")
        #Get data types
        print(data.dtypes)

        print("Columns in dataset ...")
        #Get columns
        print(data.columns)

        print("Shape of dataset ...")
        #Get shape of data
        shape = data.shape
        print("Dataset has {} rows and {} columns".format(shape[0],shape[1]))

        print("Describe dataset ...")
        #Descriptive summary
        print(data.describe())

In [65]:
# Lets describe the dataset 
describe_data(data)

Take a peep ...
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Datatypes in dataset ...
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age       

In [59]:
# Get numeric fields
numeric_data = data.select_dtypes(exclude=['O','bool'])


In [60]:
describe_data(numeric_data)

Take a peep ...
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Datatypes in dataset ...
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age       

In [61]:
# Describe numeric data
#numeric_data.describe()

##### Function to get null value statistics from dataset

In [39]:
#What fields have  null values

def get_null_stats(data):
    
    #What is the percentage of these null values 
    null_fields = get_null_fields(data)[0]
    null_fields_names = get_null_fields(data)[1]

    for col in null_fields_names:
        null_fields[col] = null_fields[col]*100/len(data[col])

    return null_fields


def get_null_fields(data):
    null_stats = data.isnull().sum()
    null_fields = null_stats[null_stats>0]
    null_fields
    #What is the percentage of these null values 
    null_fields_names = null_fields.index

    return (null_fields,null_fields_names)


##### Get summary and next step suggestions on any null values

In [40]:

def print_summary(null_stats):
    nulls_exist = len(null_stats) > 0
    if nulls_exist:
        print("dataset has null numeric values")
        print("........")
        print("summary")
        print("...................")
        print(null_stats)
        print("...................")
        print("...................")
        print("""Possible actions to take :
            1. Drop rows with any missing values
            2. Drop columns with miissing values
            3. Inpute missing data
            """)
    else:
        print("dataset has no null values")

##### Use functions to get null statistics

In [41]:
# Get null stats
numeric_data_null_stats = get_null_stats(numeric_data)

# Print summary based on null stats
print_summary(numeric_data_null_stats)

dataset has no null values


##### Define null remover

In [42]:
def imputer(columns):
    print('impute missing data in colums {}'.format(columns))

In [43]:
def null_remover(decision,df,columns=[]):
    if decision == "drop_rows":
       cleaned = df.dropna(axis=0)
       
       print("Dropped {} rows from dataset".format(len(df)-len(cleaned)))
       print("...................")

    elif decision == "drop_cols":
       if len(columns) == 0:
           print("Please provide a nonempty column list")
           print("...................")

       cleaned = df.drop(columns,axis=1)
       print("Dropped columns {} from dataset".format(columns))
       print("...................")


    elif decision == "inpute":
        imputer(columns)
    return cleaned


##### Use null remover

In [44]:
# now lets deal with null values for numeric data
cleaned = null_remover("drop_cols",numeric_data,get_null_fields(numeric_data)[1])
print(cleaned.isnull().any())
cleaned.shape


Please provide a nonempty column list
...................
Dropped columns Index([], dtype='object') from dataset
...................
Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool


(768, 9)

In [45]:
# now lets deal with null values
cleaned = null_remover("drop_rows",numeric_data)
print(cleaned.isnull().any())
cleaned.shape


Dropped 0 rows from dataset
...................
Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool


(768, 9)

In [46]:
categorical_data = data.select_dtypes(exclude=['int64','float64'])
categorical_data.head()

0
1
2
3
4


# Describe categorical data

In [47]:
categorical_data.describe()

ValueError: Cannot describe a DataFrame without columns

In [None]:
# Get null stats for categorical data
categorical_data_null_stats = get_null_stats(categorical_data)

print_summary(categorical_data_null_stats)


dataset has no null values


In [None]:
# Get null stats for whole dataset

data_null_stats = get_null_stats(data)

print_summary(data_null_stats)

dataset has no null values
