<a href="https://colab.research.google.com/github/Bibhu7377/Bibhuxclnc/blob/main/Copy_of_Handling_Diff_DT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Handling Structured Data
import pandas as pd

# Example: Reading a CSV file . download from kaggle
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [10, 20, 55],
    'City': ['New York', 'Los Angeles', 'Chicago']
})
print("Structured Data:\n", data)

# Filtering data
print("\nFiltered Data (Age > 28):\n", data[data['Age'] > 28])


Structured Data:
       Name  Age         City
0    Alice   10     New York
1      Bob   20  Los Angeles
2  Charlie   55      Chicago

Filtered Data (Age > 28):
       Name  Age     City
2  Charlie   55  Chicago


In [2]:
# Handling Unstructured Data
# Example: Reading a text file
text_data = """This is an example of unstructured data.
It can be a document, a social media post, or even a raw log."""
print("Unstructured Data:\n", text_data)

# Word count as a simple analysis
word_count = len(text_data.split())
print("\nWord Count:", word_count)


Unstructured Data:
 This is an example of unstructured data.
It can be a document, a social media post, or even a raw log.

Word Count: 21


In [4]:
# Handling Semi-Structured Data
import json

# Example: JSON data
json_data = '{"Name": "Alice", "Age": 40, "Skills": ["Python", "Data Analysis"]}'

# Parsing JSON
parsed_data = json.loads(json_data)
print("Semi-Structured Data:\n", parsed_data)

# Accessing specific elements
print("\nName:", parsed_data["Name"])
print("Skills:", ", ".join(parsed_data["Skills"]))


Semi-Structured Data:
 {'Name': 'Alice', 'Age': 40, 'Skills': ['Python', 'Data Analysis']}

Name: Alice
Skills: Python, Data Analysis


In [None]:
# Collecting Data Using an API. API are provided by owner of data.
import requests

# Example: Fetching data from a public API
url = "https://api.open-meteo.com/v1/forecast?latitude=35&longitude=139&hourly=temperature_2m"
response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    print("Collected Data (API):\n", data['hourly']['temperature_2m'][:10])  # Print first 5 values
else:
    print("Failed to fetch data")


Collected Data (API):
 [8.7, 9.2, 9.4, 10.4, 10.3, 9.8, 8.9, 7.9, 6.7, 6.1]


In [None]:
# Example: Simple Data Processing
import pandas as pd
import numpy as np

# Load stored data
df = pd.read_csv("/content/customer_shopping_data.csv")

# Cleaning: Handle missing values
df['age'] = df['age'].fillna(df['age'].mean())

# Transformation: Add a new column
df['age Group'] = np.where(df['age'] < 30, 'Young', 'Adult')

# Analysis: Group by city
#city_group = df.groupby('city')['age'].mean()
#print("Average Age by City:\n", city_group)


In [None]:
df

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall,age Group
0,I138884,C241288,Female,28.0,Clothing,5,1500.40,Credit Card,05-08-2022,Kanyon,Young
1,I317333,C111565,Male,21.0,Shoes,3,1800.51,Debit Card,12-12-2021,Forum Istanbul,Young
2,I127801,C266599,Male,20.0,Clothing,1,300.08,Cash,09-11-2021,Metrocity,Young
3,I173702,C988172,Female,66.0,Shoes,5,3000.85,Credit Card,16-05-2021,Metropol AVM,Adult
4,I337046,C189076,Female,53.0,Books,4,60.60,Cash,24-10-2021,Kanyon,Adult
...,...,...,...,...,...,...,...,...,...,...,...
99452,I219422,C441542,Female,45.0,Souvenir,5,58.65,Credit Card,21-09-2022,Kanyon,Adult
99453,I325143,C569580,Male,27.0,Food & Beverage,2,10.46,Cash,22-09-2021,Forum Istanbul,Young
99454,I824010,C103292,Male,63.0,Food & Beverage,2,10.46,Debit Card,28-03-2021,Metrocity,Adult
99455,I702964,C800631,Male,56.0,Technology,4,4200.00,Cash,16-03-2021,Istinye Park,Adult


In [None]:
import pandas as pd

In [None]:
json_data = '{"name": ["John","kiran"], "age": [30, 75], "city": ["New York", "Bengaluru"]}'

In [None]:
print(json_data)

{"name": ["John","kiran"], "age": [30, 75], "city": ["New York", "Bengaluru"]}


In [None]:
df = pd.read_json(json_data)

  df = pd.read_json(json_data)


In [None]:
df

Unnamed: 0,name,age,city
0,John,30,New York
1,kiran,75,Bengaluru


In [None]:
df1 = pd.read_json('/content/Sample_Json.json')

In [None]:
df1.head()

Unnamed: 0,name,email,address,phone,website
0,Dorcas Nienow,waelchi.betty@rodriguez.org,"68520 Jordi Lakes\nPort Willowchester, OK 72555",1-857-654-1614,https://examplefile.com
1,Matilda Mayer DDS,khalid31@gmail.com,4667 Dickinson Crescent Suite 943\nPort Ansley...,861.536.2755 x6481,https://examplefile.com
2,Dr. Osbaldo Daugherty,percy.brakus@gmail.com,"33244 Kurtis Fields\nSouth Ana, NC 85943-0622",(996) 848-4549 x718,https://examplefile.com
3,Laurianne Toy,miller.lavina@yahoo.com,"18953 Jast Views\nWest Idell, MT 49914",(483) 471-0047,https://examplefile.com
4,Carmen Herzog III,jazmyne.harris@ebert.biz,"50688 Hilda Spur Suite 143\nLilianaland, MA 02...",(436) 215-7804,https://examplefile.com


Handling Nested JSON Data

In [None]:
import pandas as pd

In [None]:
import json
from pandas import json_normalize

In [None]:
# Sample JSON structure with nested data
data = {
    "employees": [
        {
            "id": 1,
            "name": "Alice",
            "department": {"name": "Engineering", "location": "Building A"},
            "projects": [
                {"name": "Project X", "duration_months": 6},
                {"name": "Project Y", "duration_months": 4}
            ]
        },
        {
            "id": 2,
            "name": "Bob",
            "department": {"name": "HR", "location": "Building B"},
            "projects": [
                {"name": "Project Z", "duration_months": 12}
            ]
        }
    ]
}

In [None]:
data

{'employees': [{'id': 1,
   'name': 'Alice',
   'department': {'name': 'Engineering', 'location': 'Building A'},
   'projects': [{'name': 'Project X', 'duration_months': 6},
    {'name': 'Project Y', 'duration_months': 4}]},
  {'id': 2,
   'name': 'Bob',
   'department': {'name': 'HR', 'location': 'Building B'},
   'projects': [{'name': 'Project Z', 'duration_months': 12}]}]}

In [None]:
df = json_normalize(data['employees'],
                    record_path = 'projects',
                    meta = ['id', ['department', 'name'], ['department', 'location']],
                    record_prefix='project_',
                    meta_prefix='employee_')

In [None]:
df.head()

Unnamed: 0,project_name,project_duration_months,employee_id,employee_department.name,employee_department.location
0,Project X,6,1,Engineering,Building A
1,Project Y,4,1,Engineering,Building A
2,Project Z,12,2,HR,Building B


XML

In [None]:
import pandas as pd

In [None]:
import xml.etree.ElementTree as ET

In [None]:
# Step 1: Create sample XML data and save it to a file
root = ET.Element("employees")

# Adding first employee
employee1 = ET.SubElement(root, "employee")
ET.SubElement(employee1, "id").text = "1"
ET.SubElement(employee1, "name").text = "Alice"
department1 = ET.SubElement(employee1, "department")
ET.SubElement(department1, "name").text = "Engineering"
ET.SubElement(department1, "location").text = "Building A"

# Adding second employee
employee2 = ET.SubElement(root, "employee")
ET.SubElement(employee2, "id").text = "2"
ET.SubElement(employee2, "name").text = "Bob"
department2 = ET.SubElement(employee2, "department")
ET.SubElement(department2, "name").text = "HR"
ET.SubElement(department2, "location").text = "Building B"

In [None]:
tree = ET.ElementTree(root)
with open("employees.xml", "wb") as f:
  tree.write(f)

In [None]:
with open("employees.xml", "rb") as f:
  data = f.read()
  print(data)

b'<employees><employee><id>1</id><name>Alice</name><department><name>Engineering</name><location>Building A</location></department></employee><employee><id>2</id><name>Bob</name><department><name>HR</name><location>Building B</location></department></employee></employees>'


In [None]:
df = pd.read_xml("employees.xml")

In [None]:
df.head()

Unnamed: 0,id,name,department
0,1,Alice,
1,2,Bob,


In [None]:
df = pd.read_xml('/content/XML_Sample.xml')

In [None]:
df.head()

Unnamed: 0,name,age,email,title,author,year
0,John Doe,30.0,john.doe@example.com,,,
1,Jane Smith,25.0,jane.smith@example.com,,,
2,,,,The Adventure Begins,Robert Johnson,2022.0
3,John Doe,30.0,john.doe@example.com,,,
4,Jane Smith,25.0,jane.smith@example.com,,,


#Un-Structured Data

Handle the Images

In [None]:
import cv2
import pandas as pd

In [None]:
def extract_features(image):
  hog = cv2.HOGDescriptor()
  features = hog.compute(image)
  updated_features = features.flatten()
  return updated_features

In [None]:
image_path = "/content/kgf.jpg"
image_title = "Rocky"
image = cv2.imread(image_path)

In [None]:
image = cv2.resize(image, (64, 128))

In [None]:
extract = extract_features(image)

In [None]:
extract

array([0.0277736 , 0.00101991, 0.00140209, ..., 0.10126806, 0.14180572,
       0.18403524], dtype=float32)

In [None]:
df = pd.DataFrame([extract])
df.columns = [f'feature_{i}' for i in range(df.shape[1])]
df['image_title'] = image_title




In [None]:
df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_3771,feature_3772,feature_3773,feature_3774,feature_3775,feature_3776,feature_3777,feature_3778,feature_3779,image_title
0,0.027774,0.00102,0.001402,0.0,0.010406,0.048951,0.112874,0.253651,0.211187,0.148199,...,0.219052,0.123067,0.134302,0.323953,0.323953,0.169584,0.101268,0.141806,0.184035,Rocky


Working with multiple images

In [None]:

image_data = [
    ("/content/th (1).jpg", "Rose"),
    ("/content/th (3).jpg", "Sunflower"),
    ("/content/th (4).jpg", "Lotus")
]

all_feature_names = []

for image_path, title in image_data:
  image = cv2.imread(image_path)
  if image is not None:
    image = cv2.resize(image, (64, 128))
    features = extract_features(image)
    features_with_title = list(features) + [title]
    all_feature_names.append(features_with_title)

df = pd.DataFrame(all_feature_names)
df.columns = [f'feature_{i}' for i in range(df.shape[1] - 1)] + ['image_title']

In [None]:
df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_3771,feature_3772,feature_3773,feature_3774,feature_3775,feature_3776,feature_3777,feature_3778,feature_3779,image_title
0,0.118218,0.259309,0.259309,0.254985,0.162486,0.23482,0.191519,0.244178,0.223024,0.159787,...,0.355566,0.355566,0.340603,0.206181,0.161945,0.045712,0.015458,0.025553,0.064313,Rose
1,0.103519,0.049621,0.017546,0.055888,0.077677,0.251558,0.333023,0.236442,0.16073,0.119135,...,0.229034,0.206586,0.12261,0.124932,0.229034,0.205363,0.118243,0.034726,0.229034,Sunflower
2,0.2534,0.209042,0.108855,0.083684,0.067237,0.011441,0.058928,0.155104,0.2534,0.2534,...,0.173314,0.098276,0.16696,0.072161,0.245416,0.101899,0.245416,0.245416,0.245416,Lotus


Working of the HOG Descriptor

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
image_path = '/content/kgf.jpg '

# Function to compute and visualize HOG features in a grid
def visualize_hog_grid(image_path, cell_size=8, block_size=2, nbins=9):
    # Read the image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    image = cv2.resize(image (64, 128))  # Resize for standard HOG dimensions

    # Initialize HOG descriptor with parameters
    hog = cv2.HOGDescriptor(
        _winSize=(image.shape[1] // cell_size * cell_size, image.shape[0] // cell_size * cell_size),
        _blockSize=(block_size * cell_size, block_size * cell_size),
        _blockStride=(cell_size, cell_size),
        _cellSize=(cell_size, cell_size),
        _nbins=nbins
    )

    # Compute HOG descriptors
    hog_features = hog.compute(image)

    # Calculate the number of cells along x and y
    num_cells_x = image.shape[1] // cell_size
    num_cells_y = image.shape[0] // cell_size

    # Reshape HOG features accordingly
    hog_features = hog_features.reshape((num_cells_y - block_size + 1), (num_cells_x - block_size + 1), block_size, block_size, nbins).transpose((1, 0, 2, 3, 4))

    # Initialize figure
    plt.figure(figsize=(12, 6))
    plt.imshow(image, cmap='gray')
    plt.title("HOG Feature Visualization as Grid")

    # Draw HOG features in a grid
    for y in range(num_cells_y - block_size + 1):
        for x in range(num_cells_x - block_size + 1):
            cell_hog = hog_features[x, y].ravel()
            angle_unit = 180 / nbins
            for bin_idx, magnitude in enumerate(cell_hog):
                angle = bin_idx * angle_unit
                angle_rad = np.deg2rad(angle)
                center_x = x * cell_size + cell_size // 2
                center_y = y * cell_size + cell_size // 2
                dx = magnitude * np.cos(angle_rad) * cell_size // 2
                dy = magnitude * np.sin(angle_rad) * cell_size // 2
                plt.arrow(center_x, center_y, dx, dy, color='red', head_width=1, head_length=1)

    plt.grid(visible=True, color='blue', linewidth=0.5)  # Draw grid lines
    plt.axis("off")
    plt.show()

# Path to the image
image_path = '/content/kgf.jpg '

# Call the function to visualize HOG as grid
visualize_hog_grid(image_path)

TypeError: 'NoneType' object is not callable

**Text** Feature Extraction

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
data = pd.read_csv('/content/sample_sentiment_data.csv')

In [None]:
count = CountVectorizer()
count_res = count.fit_transform(data['review'])

In [None]:
count_df = pd.DataFrame(count_res.toarray(), columns=count.get_feature_names_out())

In [None]:
count_df['sentiment'] = data['sentiment']

In [None]:
count_df.head(10)

Unnamed: 0,absolutely,acting,all,almost,amazing,an,and,asleep,at,avoid,...,watch,watching,weak,were,work,worth,written,year,you,sentiment
0,0,1,0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,negative
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,negative
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,positive
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,positive
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,negative
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,negative
6,0,0,0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,positive
7,0,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,negative
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,positive
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,positive


In [None]:
# Handling CSV Files
import pandas as pd

# Reading a CSV file
df = pd.read_csv("sample.csv")
print("CSV Data:\n", df.head())

# Writing to a CSV file
df.to_csv("output.csv", index=False)
print("\nData saved to 'output.csv'")


In [None]:
#  Handling JSON Files
import json

# JSON data
json_data = '{"Name": "Alice", "Age": 25, "City": "New York"}'

# Reading JSON
parsed_data = json.loads(json_data)
print("JSON Data:\n", parsed_data)

# Writing JSON
with open("output.json", "w") as f:
    json.dump(parsed_data, f, indent=4)
print("\nJSON saved to 'output.json'")


In [None]:
# Handling XML Files
import xml.etree.ElementTree as ET

# XML data
xml_data = """
<data>
    <person>
        <name>Alice</name>
        <age>25</age>
        <city>New York</city>
    </person>
</data>
"""

# Parsing XML
root = ET.fromstring(xml_data)
print("XML Data:")
for person in root.findall("person"):
    name = person.find("name").text
    age = person.find("age").text
    city = person.find("city").text
    print(f"Name: {name}, Age: {age}, City: {city}")


In [None]:
# Handling Excel Files

import pandas as pd

# Reading an Excel file
df = pd.read_excel("sample.xlsx")
print("Excel Data:\n", df.head())

# Writing to an Excel file
df.to_excel("output.xlsx", index=False)
print("\nData saved to 'output.xlsx'")


In [None]:
Handling Pickle Files
import pickle

# Data to serialize
data = {"Name": "Alice", "Age": 25, "City": "New York"}

# Writing Pickle
with open("data.pkl", "wb") as f:
    pickle.dump(data, f)
print("Pickle file saved as 'data.pkl'")

# Reading Pickle
with open("data.pkl", "rb") as f:
    loaded_data = pickle.load(f)
print("\nPickle Data:\n", loaded_data)


In [None]:
# Handling SQL Databases
import sqlite3

# Create a database connection
conn = sqlite3.connect("sample.db")
cursor = conn.cursor()

# Create a table
cursor.execute("""
CREATE TABLE IF NOT EXISTS people (
    id INTEGER PRIMARY KEY,
    name TEXT,
    age INTEGER,
    city TEXT
)
""")

# Insert data
cursor.execute("INSERT INTO people (name, age, city) VALUES (?, ?, ?)", ("Alice", 25, "New York"))
conn.commit()

# Query data
cursor.execute("SELECT * FROM people")
rows = cursor.fetchall()
print("\nSQL Data:\n", rows)

# Close connection
conn.close()


Python Examples for Databases

In [None]:
# 1. Relational Database (SQLite)
import sqlite3

# Connect to database
conn = sqlite3.connect("example.db")
cursor = conn.cursor()

# Create table
cursor.execute("""
CREATE TABLE IF NOT EXISTS users (
    id INTEGER PRIMARY KEY,
    name TEXT,
    age INTEGER
)
""")

# Insert data
cursor.execute("INSERT INTO users (name, age) VALUES (?, ?)", ("Alice", 30))
conn.commit()

# Query data
cursor.execute("SELECT * FROM users")
rows = cursor.fetchall()
print("Relational Database Data:", rows)

# Close connection
conn.close()


In [None]:
# 2. NoSQL Database (MongoDB)
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["example_db"]

# Insert document
db.users.insert_one({"name": "Alice", "age": 30})

# Query data
for user in db.users.find():
    print("NoSQL Database Data:", user)


In [None]:
# 3. Cloud Database (Azure Cosmos DB Example)
from azure.cosmos import CosmosClient

# Connect to Azure Cosmos DB
url = "https://<your-account>.documents.azure.com:443/"
key = "<your-key>"
client = CosmosClient(url, credential=key)
database = client.create_database_if_not_exists("example_db")
container = database.create_container_if_not_exists(
    id="users",
    partition_key=("/name",),
)

# Insert document
container.upsert_item({"id": "1", "name": "Alice", "age": 30})

# Query data
for item in container.query_items(query="SELECT * FROM users", enable_cross_partition_query=True):
    print("Cloud Database Data:", item)


In [None]:
# 4. API as Data Source
import requests

# Fetch data from API
url = "https://api.open-meteo.com/v1/forecast?latitude=35&longitude=139&hourly=temperature_2m"
response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    print("API Data:", data['hourly']['temperature_2m'][:5])  # Display first 5 records
else:
    print("Failed to fetch data")
