## Structured Data

In [1]:
import sqlite3

In [17]:
# connect to in-memory database
conn = sqlite3.connect(':memory:')
cur = conn.cursor()

In [19]:
# create a employee table
cur.execute('''
    CREATE TABLE employee (
        id INTEGER PRIMARY KEY,
        name TEXT,
        department TEXT,
        salary REAL
    )
''')

<sqlite3.Cursor at 0x7e9d49c4f540>

In [20]:
# insert few sample data
cur.execute("INSERT INTO employee VALUES (1, 'John', 'HR', 50000)")
cur.execute("INSERT INTO employee VALUES (2, 'Jane', 'IT', 60000)")
cur.execute("INSERT INTO employee VALUES (3, 'Bob', 'Finance', 55000)")

<sqlite3.Cursor at 0x7e9d49c4f540>

In [21]:
# read the data
cur.execute("SELECT salary FROM employee where department='IT';")
rows = cur.fetchall()
for row in rows:
    print(row)

(60000.0,)


In [6]:
# update the data
cur.execute("UPDATE employee SET salary = 65000 WHERE id = 2;")


<sqlite3.Cursor at 0x7e9d8f55e0c0>

In [8]:
# delete the data
cur.execute("DELETE FROM employee WHERE id = 3;")

<sqlite3.Cursor at 0x7e9d8f55e0c0>

In [9]:
cur.execute("SELECT * FROM employee;")
rows = cur.fetchall()
for row in rows:
    print(row)

(1, 'John', 'HR', 50000.0)
(2, 'Jane', 'IT', 65000.0)


In [10]:
conn.close()

## Semi structured Data

In [11]:
import json
import pandas as pd

In [12]:
# sample data in json format
data = [
    {
        "id": 1,
        "name": "Alice",
        "contact": {
            "email": "alice@example.com",
            "phone": "123-456-7890"
        },
        "skills": ["Python", "SQL"]
    },
    {
        "id": 2,
        "name": "Bob",
        "contact": {
            "email": "bob@example.com",
            "phone": "987-654-3210"
        },
              "skills": ["Java", "AWS"]
    }
]

In [15]:
data[0]['contact']['email']

'alice@example.com'

In [23]:
df = pd.json_normalize(data)

In [24]:
df

Unnamed: 0,id,name,skills,contact.email,contact.phone
0,1,Alice,"[Python, SQL]",alice@example.com,123-456-7890
1,2,Bob,"[Java, AWS]",bob@example.com,987-654-3210


In [25]:
df[df['name']=="Bob"]

Unnamed: 0,id,name,skills,contact.email,contact.phone
1,2,Bob,"[Java, AWS]",bob@example.com,987-654-3210


In [26]:
from lxml import etree

In [27]:
xml = """
<bookstore>
  <book>
    <title>Python Basics</title>
    <price>45</price>
  </book>
  <book>
    <title>XML Essentials</title>
    <price>25</price>
  </book>
</bookstore>
"""

In [28]:
tree = etree.fromstring(xml)

In [32]:
titles = tree.xpath('/bookstore/book/title/text()')
print(titles)

['Python Basics', 'XML Essentials']


In [33]:
titles = tree.xpath('//book[price>30]/title/text()')
print(titles)

['Python Basics']


## Unstructured Data

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [4]:
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [6]:
text = """
OpenAI released a new version of GPT, and it’s changing the way we work with text data in AI applications.
"""

In [7]:
# 1.lowercase + remove punctuation
text = text.lower()
text = text.translate(str.maketrans('', '', string.punctuation))

In [8]:
text

'\nopenai released a new version of gpt and it’s changing the way we work with text data in ai applications\n'

In [9]:
# 2. tokenization
tokens = word_tokenize(text)

In [10]:
tokens

['openai',
 'released',
 'a',
 'new',
 'version',
 'of',
 'gpt',
 'and',
 'it',
 '’',
 's',
 'changing',
 'the',
 'way',
 'we',
 'work',
 'with',
 'text',
 'data',
 'in',
 'ai',
 'applications']

In [11]:
# 3 remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token not in stop_words]

In [12]:
filtered_tokens

['openai',
 'released',
 'new',
 'version',
 'gpt',
 '’',
 'changing',
 'way',
 'work',
 'text',
 'data',
 'ai',
 'applications']

In [13]:
import spacy

In [14]:
nlp = spacy.load("en_core_web_sm")

In [15]:
text = """
OpenAI released a new version of GPT, and it’s changing the way we work with text data in AI applications.
"""

In [16]:
doc = nlp(text)

In [19]:
# tokenization and stopword removal
tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]

In [20]:
tokens

['\n',
 'openai',
 'released',
 'new',
 'version',
 'gpt',
 'changing',
 'way',
 'work',
 'text',
 'data',
 'ai',
 'applications',
 '\n']

In [21]:
# NER
for ent in doc.ents:
    print(ent.text, ent.label_)

OpenAI PERSON
GPT ORG
AI GPE


In [22]:
import pandas as pd

df = pd.DataFrame({
    "user_id": [1, 2],
    "event": ["view_product", "add_to_cart"],
    "timestamp": ["2024-07-01T12:00:00", "2024-07-01T12:05:00"]
})
df.to_parquet("clickstream.parquet")


In [23]:
import json
import spacy


In [24]:
nlp = spacy.load("en_core_web_sm")

In [26]:
# structured csv
print("[1] Orders CSV")
orders = pd.read_csv("orders.csv")
print("Classificaiton: Structured")
orders.head()

[1] Orders CSV
Classificaiton: Structured


Unnamed: 0,order_id,customer_id,order_date,amount
0,101,2001,2024-05-01,250.75
1,102,2002,2024-05-02,140.0


In [27]:
# 2. Semi-Structured – JSON
print("\n[2] Product Catalog (JSON):")
with open("products.json") as f:
    products = json.load(f)
print(products[0])
print("🔹 Classification: Semi-Structured")
print("🔸 Suggested Tool: MongoDB / Document DB")


[2] Product Catalog (JSON):
{'product_id': 'P001', 'name': 'Wireless Mouse', 'specs': {'color': 'black', 'connectivity': 'Bluetooth'}}
🔹 Classification: Semi-Structured
🔸 Suggested Tool: MongoDB / Document DB


In [28]:
# 3. Unstructured – Reviews Text
print("\n[3] User Reviews (Text):")
with open("reviews.txt") as f:
    reviews = f.read()
doc = nlp(reviews)
entities = [(ent.text, ent.label_) for ent in doc.ents]
print("Named Entities:", entities)
print("🔹 Classification: Unstructured")
print("🔸 Suggested Tool: Elasticsearch / NLP Pipeline")


[3] User Reviews (Text):
Named Entities: []
🔹 Classification: Unstructured
🔸 Suggested Tool: Elasticsearch / NLP Pipeline


In [29]:
# 4. Clickstream – Parquet
print("\n[4] Clickstream Events (Parquet):")
clicks = pd.read_parquet("clickstream.parquet")
print(clicks.head())
print("🔹 Classification: Semi-Structured")
print("🔸 Suggested Tool: S3 + Athena / Spark / BigQuery")


[4] Clickstream Events (Parquet):
   user_id         event            timestamp
0        1  view_product  2024-07-01T12:00:00
1        2   add_to_cart  2024-07-01T12:05:00
🔹 Classification: Semi-Structured
🔸 Suggested Tool: S3 + Athena / Spark / BigQuery
