# **Session One Solution**

Task: Import, clean, and export data.



### Step 1: Import the Python libraries needed for the analysis: pandas for data handling and NumPy for numerical operations.

In [None]:
import pandas as pd
import numpy as np

### Step 2: Import the CSV file – NSMES1988.csv into a dataframe.

In [None]:
# Read NSMES1988.csv into the working dataframe `df`, then print a summary of
# its columns, non-null counts and dtypes.
df = pd.read_csv(filepath_or_buffer='NSMES1988.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4406 entries, 0 to 4405
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  4406 non-null   int64  
 1   visits      4406 non-null   int64  
 2   nvisits     4406 non-null   int64  
 3   ovisits     4406 non-null   int64  
 4   novisits    4406 non-null   int64  
 5   emergency   4406 non-null   int64  
 6   hospital    4406 non-null   int64  
 7   health      4406 non-null   object 
 8   chronic     4406 non-null   int64  
 9   adl         4406 non-null   object 
 10  region      4406 non-null   object 
 11  age         4406 non-null   float64
 12  gender      4406 non-null   object 
 13  married     4406 non-null   object 
 14  school      4406 non-null   int64  
 15  income      4406 non-null   float64
 16  employed    4406 non-null   object 
 17  insurance   4406 non-null   object 
 18  medicaid    4406 non-null   object 
dtypes: float64(2), int64(9), ob

### Step 3: Inspect the data and report the details from physical inspection – rows, columns, data types etc.

In [None]:
# Compute the summary statistics (count, mean, std, min, quartiles, max) for
# the dataframe, then print them as plain text.
summary_stats = df.describe()
print(summary_stats)

        Unnamed: 0       visits      nvisits      ovisits     novisits  \
count  4406.000000  4406.000000  4406.000000  4406.000000  4406.000000   
mean   2203.500000     5.774399     1.618021     0.750794     0.536087   
std    1272.046972     6.759225     5.317056     3.652759     3.879506   
min       1.000000     0.000000     0.000000     0.000000     0.000000   
25%    1102.250000     1.000000     0.000000     0.000000     0.000000   
50%    2203.500000     4.000000     0.000000     0.000000     0.000000   
75%    3304.750000     8.000000     1.000000     0.000000     0.000000   
max    4406.000000    89.000000   104.000000   141.000000   155.000000   

         emergency     hospital      chronic          age       school  \
count  4406.000000  4406.000000  4406.000000  4406.000000  4406.000000   
mean      0.263504     0.295960     1.541988     7.402406    10.290286   
std       0.703659     0.746398     1.349632     0.633405     3.738736   
min       0.000000     0.000000     0

### Step 4: Find out if the data is clean or if the data has missing values.

In [None]:
# Count the null entries in every column (isna is the same check as isnull)
# and print the per-column totals; all zeros means no missing values.
missing_values = df.isna().sum()
print(missing_values)

Unnamed: 0    0
visits        0
nvisits       0
ovisits       0
novisits      0
emergency     0
hospital      0
health        0
chronic       0
adl           0
region        0
age           0
gender        0
married       0
school        0
income        0
employed      0
insurance     0
medicaid      0
dtype: int64


### Step 5: Comment on the data types, their values, and their range specifically on age and income columns.

In [None]:
# Print the summary statistics of the two float columns, one after the other,
# so their value ranges can be compared against their dtypes.
for column in ('age', 'income'):
    print(df[column].describe())

count    4406.000000
mean        7.402406
std         0.633405
min         6.600000
25%         6.900000
50%         7.300000
75%         7.800000
max        10.900000
Name: age, dtype: float64
count    4406.000000
mean        2.527132
std         2.924648
min        -1.012500
25%         0.912150
50%         1.698150
75%         3.172850
max        54.835100
Name: income, dtype: float64


### Step 6: Export the data to a JSON file named NSMES1988.json, then view the file and enter your comments.

In [None]:
# Write the whole dataframe to NSMES1988.json using pandas' default JSON layout.
df.to_json(path_or_buf='NSMES1988.json')

### Step 7: Inspect the memory usage of the data and recommend which non-default data types would optimize memory usage for the dataframe.

In [None]:
# Measure the per-column memory footprint in bytes; deep=True also counts the
# Python string objects held by the object columns.
memory_by_column = df.memory_usage(deep=True)
print(memory_by_column)

Index            128
Unnamed: 0     35248
visits         35248
nvisits        35248
ovisits        35248
novisits       35248
emergency      35248
hospital       35248
health        281008
chronic        35248
adl           278477
region        278036
age            35248
gender        274022
married       262360
school         35248
income         35248
employed      260409
insurance     263375
medicaid      260356
dtype: int64


### Step 8: Optimizing data types

In [None]:
# Shrink the two float64 columns to float32, halving their memory footprint.
#
# 'age' must stay a float: its values are fractional (6.6 to 10.9 in the
# summary above), so casting to an integer type would truncate every value.
# float16 is avoided as well: it keeps only about three significant digits,
# which would visibly round 'income' (e.g. 54.8351 -> 54.84375).
#
# astype returns a new frame, so re-running this cell is harmless.
df = df.astype({'age': 'float32', 'income': 'float32'})

### Step 9: Create a sub-dataframe that contains only integers.

In [None]:
# Create a sub-dataframe with the integer count columns and downcast each one
# to the smallest signed integer type that can hold all of its values.
#
# A blanket astype('int8') is not safe here: int8 tops out at 127, while
# 'ovisits' reaches 141 and 'novisits' reaches 155 (see the describe() output),
# so those columns would overflow. pd.to_numeric(downcast='integer') checks
# the value range per column and only narrows when the data fits.
integer_columns = ['visits', 'nvisits', 'ovisits', 'novisits',
                   'emergency', 'hospital', 'chronic', 'school']
df2 = df[integer_columns].apply(pd.to_numeric, downcast='integer')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4406 entries, 0 to 4405
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   visits     4406 non-null   int8 
 1   nvisits    4406 non-null   int8 
 2   ovisits    4406 non-null   int8 
 3   novisits   4406 non-null   int8 
 4   emergency  4406 non-null   int8 
 5   hospital   4406 non-null   int8 
 6   chronic    4406 non-null   int8 
 7   school     4406 non-null   int8 
dtypes: int8(8)
memory usage: 34.5 KB


### Step 10: Create another sub-dataframe that contains floating point.

In [None]:
# Create a sub-dataframe with the 'age' and 'income' columns stored as float32.
#
# float32 halves the memory of the default float64 while keeping roughly seven
# significant digits. float16 is deliberately not used: it keeps only about
# three significant digits, so values such as an income of 54.8351 would be
# rounded to 54.84375.
df3 = df[['age', 'income']]
df3 = df3.astype('float32')
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4406 entries, 0 to 4405
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     4406 non-null   float16
 1   income  4406 non-null   float16
dtypes: float16(2)
memory usage: 17.3 KB


### Step 11: Export the data frame as a new CSV file NSMES1988new.csv and store it in the local space for possible use in other assignments.

In [None]:
# Export the dataframe to a new CSV file and read it back.
#
# index=False keeps the RangeIndex out of the file; without it, to_csv writes
# the index as an extra unnamed first column, and the re-read frame ends up
# with one more column than the frame that was saved.
#
# NOTE: CSV stores plain text only, so any narrowed dtypes are not preserved;
# read_csv infers its default dtypes again for df1.
df.to_csv('NSMES1988new.csv', index=False)
df1 = pd.read_csv('NSMES1988new.csv')

**Report**

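Visual inspection of the data reveals that the int64 columns hold small values and can be stored in much narrower integer types to save memory: int8 is sufficient for most of them, but `ovisits` (max 141) and `novisits` (max 155) exceed the int8 limit of 127 and need uint8 or int16. The float64 columns (`age`, `income`) can be reduced to float32; float16 would save more memory but keeps only about three significant digits, so it would round the values. The object (string) columns account for most of the memory usage and, if they contain only a few distinct values, are candidates for the `category` dtype.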