# Goal

Teach all practical ways of loading data in ML workflows — local, remote, structured, unstructured.

# Data Loading in Machine Learning

## Objective
Learn how to load data from multiple sources reliably and efficiently for ML workflows.


In [1]:
import pandas as pd
import numpy as np


# If your data is in a CSV file, load it with `pd.read_csv`

In [2]:
# Read the Titanic passenger table straight from its raw GitHub URL;
# pandas accepts an http(s) URL wherever it accepts a local path.
df_csv = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
)
# Preview the first five rows as the cell's output.
df_csv.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# If your data is in an Excel file, load it with `pd.read_excel`

In [3]:
# Template only: nothing runs in this cell. Uncomment the line below and
# replace "data.xlsx" / "Sheet1" with your own workbook path and sheet name.
# df_excel = pd.read_excel("data.xlsx", sheet_name="Sheet1")


# Loading JSON

In [6]:
import requests

# Fetch the pandas repository metadata from the GitHub REST API.
# requests applies no timeout by default, so one is set explicitly to make
# the cell fail fast instead of hanging if the API is slow or unreachable.
resp = requests.get("https://api.github.com/repos/pandas-dev/pandas", timeout=10)
# Turn HTTP error responses into exceptions rather than parsing an error body.
resp.raise_for_status()
# Flatten the nested JSON object into a one-row DataFrame; nested keys become
# dot-separated column names (e.g. "organization.type").
json_data = pd.json_normalize(resp.json())
json_data.head()


Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,forks_url,...,organization.gists_url,organization.starred_url,organization.subscriptions_url,organization.organizations_url,organization.repos_url,organization.events_url,organization.received_events_url,organization.type,organization.user_view_type,organization.site_admin
0,858127,MDEwOlJlcG9zaXRvcnk4NTgxMjc=,pandas,pandas-dev/pandas,False,https://github.com/pandas-dev/pandas,Flexible and powerful data analysis / manipula...,False,https://api.github.com/repos/pandas-dev/pandas,https://api.github.com/repos/pandas-dev/pandas...,...,https://api.github.com/users/pandas-dev/gists{...,https://api.github.com/users/pandas-dev/starre...,https://api.github.com/users/pandas-dev/subscr...,https://api.github.com/users/pandas-dev/orgs,https://api.github.com/users/pandas-dev/repos,https://api.github.com/users/pandas-dev/events...,https://api.github.com/users/pandas-dev/receiv...,Organization,public,False


# Load from SQL (Conceptual)

In [8]:
# Load from SQL (Conceptual)
# Template only: nothing runs in this cell. Uncomment and adapt it to your database.
# from sqlalchemy import create_engine
# engine = create_engine("sqlite:///database.db")
# `my_table` is a placeholder; a table literally named `table` would need
# quoting because TABLE is a reserved SQL keyword.
# df_sql = pd.read_sql("SELECT * FROM my_table", engine)


In [9]:
# Load Large Files (Chunks)
# With `chunksize`, read_csv returns an iterator that yields DataFrames of up
# to 200 rows each, so the whole file never has to sit in memory at once.
# The reader is used as a context manager (supported since pandas 1.2) so its
# underlying handle is closed even though we stop after the first chunk.
with pd.read_csv(
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
    chunksize=200,
) as chunk_iter:
    for chunk in chunk_iter:
        print(chunk.shape)
        break  # one chunk is enough to demonstrate the pattern


(200, 12)


# Key takeaways

- Data can come from many sources
- Loading strategy depends on size and structure
- Efficient loading saves memory and time
