#### CSV

In [8]:
import pandas as pd

# Download and load the Iris dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df = pd.read_csv(url, names=columns,sep=',')

# Display the first few rows
print(df.head())


   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [9]:
# Get basic information about the dataset
print(df.info())

# Summary statistics for numerical columns
print(df.describe())

# Check for missing values
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.50000

In [10]:
# Select a single column
sepal_length = df['sepal_length']
print(sepal_length.head())

# Select multiple columns
subset = df[['sepal_length', 'sepal_width']]
print(subset.head())

# Select a specific row by index
print(df.iloc[0])

# Select a specific subset of rows and columns
subset = df.loc[0:4, ['sepal_length', 'class']]
print(subset)


0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal_length, dtype: float64
   sepal_length  sepal_width
0           5.1          3.5
1           4.9          3.0
2           4.7          3.2
3           4.6          3.1
4           5.0          3.6
sepal_length            5.1
sepal_width             3.5
petal_length            1.4
petal_width             0.2
class           Iris-setosa
Name: 0, dtype: object
   sepal_length        class
0           5.1  Iris-setosa
1           4.9  Iris-setosa
2           4.7  Iris-setosa
3           4.6  Iris-setosa
4           5.0  Iris-setosa


In [11]:
# Filter rows where sepal_length > 5.0
filtered_df = df[df['sepal_length'] > 5.0]
print(filtered_df.head())

# Filter rows where class is 'Iris-setosa'
setosa_df = df[df['class'] == 'Iris-setosa']
print(setosa_df.head())


    sepal_length  sepal_width  petal_length  petal_width        class
0            5.1          3.5           1.4          0.2  Iris-setosa
5            5.4          3.9           1.7          0.4  Iris-setosa
10           5.4          3.7           1.5          0.2  Iris-setosa
14           5.8          4.0           1.2          0.2  Iris-setosa
15           5.7          4.4           1.5          0.4  Iris-setosa
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [12]:
# Sort by sepal_length in ascending order
sorted_df = df.sort_values(by='sepal_length')
print(sorted_df.head())

# Sort by sepal_length in descending order
sorted_df_desc = df.sort_values(by='sepal_length', ascending=False)
print(sorted_df_desc.head())


    sepal_length  sepal_width  petal_length  petal_width        class
13           4.3          3.0           1.1          0.1  Iris-setosa
8            4.4          2.9           1.4          0.2  Iris-setosa
42           4.4          3.2           1.3          0.2  Iris-setosa
38           4.4          3.0           1.3          0.2  Iris-setosa
41           4.5          2.3           1.3          0.3  Iris-setosa
     sepal_length  sepal_width  petal_length  petal_width           class
131           7.9          3.8           6.4          2.0  Iris-virginica
122           7.7          2.8           6.7          2.0  Iris-virginica
118           7.7          2.6           6.9          2.3  Iris-virginica
117           7.7          3.8           6.7          2.2  Iris-virginica
135           7.7          3.0           6.1          2.3  Iris-virginica


In [13]:
# Group by 'class' and calculate the mean of each numeric column
grouped_df = df.groupby('class').mean()
print(grouped_df)


                 sepal_length  sepal_width  petal_length  petal_width
class                                                                
Iris-setosa             5.006        3.418         1.464        0.244
Iris-versicolor         5.936        2.770         4.260        1.326
Iris-virginica          6.588        2.974         5.552        2.026


In [14]:
# Fill missing values with a specific value (e.g., 0)
df_filled = df.fillna(0)

# Drop rows with missing values
df_dropped = df.dropna()


In [15]:
# Create another DataFrame to merge
additional_data = pd.DataFrame({
    'class': ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
    'color': ['red', 'blue', 'green']
})

# Merge based on the 'class' column
merged_df = pd.merge(df, additional_data, on='class')
print(merged_df.head())


   sepal_length  sepal_width  petal_length  petal_width        class color
0           5.1          3.5           1.4          0.2  Iris-setosa   red
1           4.9          3.0           1.4          0.2  Iris-setosa   red
2           4.7          3.2           1.3          0.2  Iris-setosa   red
3           4.6          3.1           1.5          0.2  Iris-setosa   red
4           5.0          3.6           1.4          0.2  Iris-setosa   red


In [16]:
# Create a pivot table summarizing the average sepal length by class
pivot_table = pd.pivot_table(df, values='sepal_length', index='class', aggfunc='mean')
print(pivot_table)


                 sepal_length
class                        
Iris-setosa             5.006
Iris-versicolor         5.936
Iris-virginica          6.588


#### JSON

In [2]:
import requests

# Download and load JSON data
url = "https://jsonplaceholder.typicode.com/todos"
response = requests.get(url)
todos = response.json()

# Display the first few items
for item in todos[:5]:
    print(item)


{'userId': 1, 'id': 1, 'title': 'delectus aut autem', 'completed': False}
{'userId': 1, 'id': 2, 'title': 'quis ut nam facilis et officia qui', 'completed': False}
{'userId': 1, 'id': 3, 'title': 'fugiat veniam minus', 'completed': False}
{'userId': 1, 'id': 4, 'title': 'et porro tempora', 'completed': True}
{'userId': 1, 'id': 5, 'title': 'laboriosam mollitia et enim quasi adipisci quia provident illum', 'completed': False}


#### XML

In [12]:
import requests
import xml.etree.ElementTree as ET

# Download and parse XML data
url = "https://filesampleshub.com/download/code/xml/sample1.xml"
response = requests.get(url)
root = ET.fromstring(response.content,parser=)

# # Display the first few entries
# for cd in root.findall('CD')[:5]:
#     title = cd.find('TITLE').text
#     artist = cd.find('ARTIST').text
#     print(f"Title: {title}, Artist: {artist}")



AttributeError: 'str' object has no attribute 'feed'

#### Mongo DB  

In [15]:
!pip install pymongo



In [4]:
from pymongo import MongoClient

# Connect to MongoDB (Assuming MongoDB is running locally)
client = MongoClient("mongodb://localhost:27017/")

# Create or connect to a database
db = client['sample_db']

dblist = client.list_database_names()
if "mydatabase" in dblist:
  print("The database exists.")

# Insert sample documents into the collection
# sample_data = [
#     {"name": "John", "age": 30, "city": "New York"},
#     {"name": "Jane", "age": 25, "city": "Los Angeles"},
#     {"name": "Mike", "age": 32, "city": "Chicago"}
# ]
# collection.insert_many(sample_data)

# # Retrieve and print all documents from the collection
# for document in collection.find():
#     print(document)

# # Query a specific document
# query = {"name": "John"}
# john_data = collection.find_one(query)
# print("\nQueried Data:", john_data)

# # Cleanup: Dropping the collection after use (optional)
# # db.drop_collection("sample_collection")


ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 66d04ae4909f78c1e3002223, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>