# Read Data Demo

### Demo

#### Get Database ID

1. import the sqlDataReading depot for data file:

In [1]:
import data.sqlDataFetch as sdf

2. Get database id by a valid year:

In [2]:
database_id = sdf.get_id_by_year(2024)
print("Database id: ", database_id)

Database id:  a9eb19ad-da79-4f7b-9e3b-6b13e66f8285


#### Describe Database

To describe a database and see what keys it contains (e.g., column or field names)

In [3]:
sdf.describe_database(database_id)

Column Name          | Type      
--------------------------------
_id                  | int       
PID                  | text      
CM_ID                | text      
GIS_ID               | text      
ST_NUM               | text      
ST_NAME              | text      
UNIT_NUM             | text      
CITY                 | text      
ZIP_CODE             | text      
BLDG_SEQ             | text      
NUM_BLDGS            | text      
LUC                  | text      
LU                   | text      
LU_DESC              | text      
BLDG_TYPE            | text      
OWN_OCC              | text      
OWNER                | text      
MAIL_ADDRESSEE       | text      
MAIL_STREET_ADDRESS  | text      
MAIL_CITY            | text      
MAIL_STATE           | text      
MAIL_ZIP_CODE        | text      
RES_FLOOR            | text      
CD_FLOOR             | text      
RES_UNITS            | text      
COM_UNITS            | text      
RC_UNITS             | text      
LAND_SF        

### Fetch Data

#### Fetch Data With Parameters And Conditions

1. Using fetch_data method with database id, parameters and conditions.

In [4]:
condition= "LIMIT 1000"
df1 = sdf.fetch_data(database_id, condition, _id=int, PID = str, CM_ID = str, GIS_ID = str, ST_NUM = int, ST_NAME = str, UNIT_NUM = str, CITY=str, ZIP_CODE = int, BLDG_SEQ = int, NUM_BLDGS = int, LUC = str)
df2 = sdf.fetch_data(database_id, condition, LU = str, LU_DESC = str, BLDG_TYPE = str, OWN_OCC = str, OWNER = str, MAIL_ADDRESSEE = str, MAIL_STREET_ADDRESS = str, MAIL_CITY = str, MAIL_STATE = str, MAIL_ZIP_CODE = int)
df3 = sdf.fetch_data(database_id, condition, RES_FLOOR = float, CD_FLOOR = int, RES_UNITS = int, COM_UNITS = int, RC_UNITS = int, LAND_SF = 'value', GROSS_AREA = float, LIVING_AREA = float, LAND_VALUE = 'value')
df4 = sdf.fetch_data(database_id, condition, BLDG_VALUE = 'value', SFYI_VALUE = 'value', TOTAL_VALUE = 'value', GROSS_TAX=str, YR_BUILT = int, YR_REMODEL = int, STRUCTURE_CLASS = str)
df5 = sdf.fetch_data(database_id, condition, ROOF_COVER = str, INT_WALL = str, EXT_FNISHED = str, INT_COND = str, EXT_COND = str, OVERALL_COND = str, BED_RMS = int, FULL_BTH = int, HLF_BTH = int, KITCHENS = int)
df6 = sdf.fetch_data(database_id, condition, TT_RMS = int, BDRM_COND = str, BTHRM_STYLE1 = str, BTHRM_STYLE2 = str, BTHRM_STYLE3 = str, KITCHEN_TYPE = str, KITCHEN_STYLE1 = str, KITCHEN_STYLE2 = str, KITCHEN_STYLE3 = str)
df7 = sdf.fetch_data(database_id, condition, HEAT_TYPE = str, HEAT_SYSTEM = str, AC_TYPE = str, FIREPLACES = int, ORIENTATION = str, NUM_PARKING = int, PROP_VIEW = str, CORNER_UNIT = str)
dfs = [df1, df2, df3, df4, df5, df6, df7]


for df in dfs:
    print(df)

     _id         PID       CM_ID      GIS_ID  ST_NUM        ST_NAME UNIT_NUM  \
0      1  0100001000        None  0100001000   104.0      PUTNAM ST     None   
1      2  0100002000        None  0100002000   197.0   Lexington ST     None   
2      3  0100003000        None  0100003000   199.0   Lexington ST     None   
3      4  0100004000        None  0100004000   201.0   Lexington ST     None   
4      5  0100005000        None  0100005000   203.0   Lexington ST     None   
..   ...         ...         ...         ...     ...            ...      ...   
995  960  0100817012  0100817010  0100817010   685.0    Saratoga ST        1   
996  961  0100817014  0100817010  0100817010   685.0    Saratoga ST        2   
997  976  0100835001  0100835001  0100835001   631.0    Saratoga ST     None   
998  963  0100819000        None  0100819000   691.0    Saratoga ST     None   
999  964  0100820000        None  0100820000   516.0  Bennington ST     None   

            CITY  ZIP_CODE  BLDG_SEQ  N

2. Delete Column With Too Much Null.

In [5]:
for df in dfs:

    none_count = df.isnull().sum()

    total_rows = len(df)

    cols_to_drop = none_count[none_count > total_rows / 2].index

    df.drop(cols_to_drop, axis=1, inplace=True)

    print(df)

     _id         PID      GIS_ID  ST_NUM        ST_NAME         CITY  \
0      1  0100001000  0100001000   104.0      PUTNAM ST  EAST BOSTON   
1      2  0100002000  0100002000   197.0   Lexington ST  EAST BOSTON   
2      3  0100003000  0100003000   199.0   Lexington ST  EAST BOSTON   
3      4  0100004000  0100004000   201.0   Lexington ST  EAST BOSTON   
4      5  0100005000  0100005000   203.0   Lexington ST  EAST BOSTON   
..   ...         ...         ...     ...            ...          ...   
995  960  0100817012  0100817010   685.0    Saratoga ST  EAST BOSTON   
996  961  0100817014  0100817010   685.0    Saratoga ST  EAST BOSTON   
997  976  0100835001  0100835001   631.0    Saratoga ST  EAST BOSTON   
998  963  0100819000  0100819000   691.0    Saratoga ST  EAST BOSTON   
999  964  0100820000  0100820000   516.0  Bennington ST  EAST BOSTON   

     ZIP_CODE  BLDG_SEQ  NUM_BLDGS  LUC  
0        2128         1          1  105  
1        2128         1          1  105  
2        

3. Save The DataFrame 'df' As A CSV File To Folder

In [6]:
for i in range(len(dfs)):
    csv_file_path = "./preliminarydata/506" + str(i) + ".csv"  

    dfs[i].to_csv(csv_file_path, index=False)
