<a href="https://colab.research.google.com/github/Devopriya-Tirtho/Data-Analytics-Lab-Session/blob/main/Lab_Session_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

File Input/Output (I/O)

In [None]:
# Opening a file for reading
# The `with` block closes the file even if read() raises, which the manual
# open()/close() pair did not guarantee.
with open("input.txt", "r") as infile:
    content = infile.read()
print(content)

In [None]:
# Opening a file for writing ("w" creates output.txt or truncates an existing one)
# The `with` block flushes and closes the file even if write() raises.
with open("output.txt", "w") as outfile:
    outfile.write("Hello, World!\nThis is a demonstration of file I/O.")

In [None]:
# Opening a file for read/write ("r+" requires data.txt to exist already)
# The `with` block closes the file even if reading or writing raises.
with open("data.txt", "r+") as rwfile:
    data = rwfile.read()
    print(data)
    # read() left the position at end-of-file, so this write appends.
    rwfile.write("\nAdding a new line to the file.")

In [None]:
# Reading a file line by line
# Iterating the file object yields one line at a time; str.strip removes the
# trailing newline and any surrounding whitespace before printing.
with open("input.txt", "r") as source:
    for stripped_line in map(str.strip, source):
        print(stripped_line)

In [None]:
# Writing multiple lines to a file
# writelines() adds no separators, so every entry carries its own "\n".
with open("output.txt", "w") as sink:
    sink.writelines(f"Line {number}\n" for number in range(1, 6))

In [None]:
# Processing a CSV file with split
# Each line is trimmed and cut on every comma; quoting is not interpreted.
with open("data.csv", "r") as source:
    split_rows = (raw_line.strip().split(",") for raw_line in source)
    for fields in split_rows:
        print(fields)

In [None]:
# Processing a CSV file with the csv module
import csv

# newline="" hands newline handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open("data.csv", "r", newline="") as infile:
    csv_reader = csv.reader(infile)
    for row in csv_reader:
        print(row)

In [None]:
# Listing files in a directory
import os

# "." names the current working directory.
directory_entries = os.listdir(".")
print(directory_entries)

Reading Website Content Using urllib.request

In [None]:
import urllib.request

# Page to fetch
url = "https://www.example.com"

# The response body arrives as bytes; decode it as UTF-8 to get text
with urllib.request.urlopen(url) as site:
    raw_body = site.read()
    content = raw_body.decode('utf-8')
    print(content)

Simulating a Web Browser Request

In [None]:
import urllib.request

# URL for the Google search query (HTTPS so the query is not sent in clear text)
url = "https://www.google.com/search?hl=en&q=data+analysis"

# Setting user-agent to simulate a browser
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# Creating a request object that carries the custom header
request = urllib.request.Request(url, headers=headers)

# Open the URL and read the response
with urllib.request.urlopen(request) as response:
    # Decode with the charset the server declares in Content-Type, falling
    # back to UTF-8. errors='replace' keeps stray bytes from raising.
    charset = response.headers.get_content_charset() or 'utf-8'
    html_content = response.read().decode(charset, errors='replace')
    print(html_content)

Downloading a CSV File from a URL

In [None]:
import urllib.request

# Remote source and local destination for the download
csv_url = "https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv"
file_name = "downloaded_data.csv"

# urlretrieve copies the remote resource to the given local path
urllib.request.urlretrieve(url=csv_url, filename=file_name)

print(f"CSV file downloaded and saved as {file_name}")

Parsing and Printing CSV Data from a URL

In [None]:
import urllib.request
import csv

# URL of the CSV file
csv_url = "https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv"

# Download the whole body first so the connection is released before parsing
with urllib.request.urlopen(csv_url) as site:
    csv_text = site.read().decode('utf-8')

# This file puts spaces after its commas. skipinitialspace=True drops them,
# so quoted headers parse as '1958' rather than ' "1958"' and numbers lose
# their leading padding.
csv_data = csv.reader(csv_text.splitlines(), skipinitialspace=True)
for row in csv_data:
    # Blank lines (the file ends with one) parse to [], so skip them.
    if row:
        print(row)

['Month', ' "1958"', ' "1959"', ' "1960"']
['JAN', '  340', '  360', '  417']
['FEB', '  318', '  342', '  391']
['MAR', '  362', '  406', '  419']
['APR', '  348', '  396', '  461']
['MAY', '  363', '  420', '  472']
['JUN', '  435', '  472', '  535']
['JUL', '  491', '  548', '  622']
['AUG', '  505', '  559', '  606']
['SEP', '  404', '  463', '  508']
['OCT', '  359', '  407', '  461']
['NOV', '  310', '  362', '  390']
['DEC', '  337', '  405', '  432']
[]


Error Handling While Accessing a Web Page

In [None]:
import urllib.request
import urllib.error

# A path that does not exist, used to exercise the error branches
url = "https://www.example.com/nonexistent"

try:
    with urllib.request.urlopen(url) as response:
        content = response.read().decode('utf-8')
        print(content)
# HTTPError is a subclass of URLError, so it has to be handled first.
except urllib.error.HTTPError as http_error:
    print(f"HTTP Error: {http_error.code}")
except urllib.error.URLError as url_error:
    print(f"URL Error: {url_error.reason}")

HTTP Error: 404


Importing Commonly Used Modules

In [None]:
# Importing standard Python libraries
import math
import os
import random
import csv
import urllib.request

# Importing third-party libraries for data analysis and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import sklearn

Using NumPy for Scientific Computing

In [None]:
import numpy as np

# Build a one-dimensional array from a Python list
array_1d = np.array([1, 2, 3, 4, 5])
print("1D Array:", array_1d)

# Aggregate through the array's own methods
array_total = array_1d.sum()
array_average = array_1d.mean()
print("Array Sum:", array_total)
print("Array Mean:", array_average)


Using pandas for Data Manipulation

In [None]:
import pandas as pd

# Column-oriented source data: each key becomes a DataFrame column
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
}
df = pd.DataFrame(data)

# Show the table
print(df)

# Summary statistic over one column
mean_age = df["Age"].mean()
print("Mean Age:", mean_age)

Visualizing Data with matplotlib

In [None]:
import matplotlib.pyplot as plt

# Bar labels and their heights
categories = ["A", "B", "C"]
values = [10, 20, 15]

# Draw on an explicit Axes and label it in one call
fig, ax = plt.subplots()
ax.bar(categories, values)
ax.set(title="Bar Chart Example", xlabel="Categories", ylabel="Values")
plt.show()

Creating Statistical Visualizations with Seaborn

In [None]:
# This cell uses np and plt as well as sns and pd, so it imports all four
# rather than relying on an earlier cell having been run.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Create sample data: 100 points with independent standard-normal x and y
data = pd.DataFrame({"x": np.random.randn(100), "y": np.random.randn(100)})

# Create a scatterplot
sns.scatterplot(data=data, x="x", y="y")
plt.title("Scatterplot Example")
plt.show()

Performing Linear Regression with SciPy

In [None]:
from scipy.stats import linregress

# Observations
x = [1, 2, 3, 4, 5]
y = [2, 4, 5, 4, 5]

# Fit once, then pull the individual statistics off the result object
fit = linregress(x, y)
slope, intercept = fit.slope, fit.intercept
r_value, p_value, std_err = fit.rvalue, fit.pvalue, fit.stderr
print("Slope:", slope)
print("Intercept:", intercept)

Using scikit-learn for Machine Learning

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Data: one feature column with five samples, and the matching targets
X = np.arange(1, 6).reshape(-1, 1)  # Features
y = np.array([2, 4, 5, 4, 5])  # Target

# fit() returns the estimator itself, so construction and training chain
model = LinearRegression().fit(X, y)

# Predict on the training inputs
predictions = model.predict(X)
print("Predictions:", predictions)

Data Visualization (Using matplotlib and seaborn)

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Sample data
categories = ["Category A", "Category B", "Category C"]
values = [20, 35, 30]

# Bar Chart, drawn on its own explicit Axes
bar_fig, bar_ax = plt.subplots()
bar_ax.bar(categories, values, color='skyblue')
bar_ax.set(title="Bar Chart Example", xlabel="Categories", ylabel="Values")
plt.show()

# Histogram of 1000 standard-normal samples
data = np.random.randn(1000)
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(data, bins=20, color='green', alpha=0.7)
hist_ax.set(title="Histogram Example", xlabel="Value", ylabel="Frequency")
plt.show()

# Scatterplot using Seaborn, pointed at an explicit Axes
df = pd.DataFrame({
    "x": np.random.randn(100),
    "y": np.random.randn(100)
})
scatter_fig, scatter_ax = plt.subplots()
sns.scatterplot(data=df, x="x", y="y", ax=scatter_ax)
scatter_ax.set_title("Scatterplot Example")
plt.show()

Scientific Computing: Linear Regression Using SciPy

In [None]:
from scipy.stats import linregress
import numpy as np
import matplotlib.pyplot as plt

# Sample data
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 5, 4, 5])

# Fit the least-squares line
slope, intercept, r_value, p_value, std_err = linregress(x, y)

# Report the fit; squaring r gives the coefficient of determination
r_squared = r_value**2
print(f"Slope: {slope}, Intercept: {intercept}")
print(f"R-squared: {r_squared}")

# Plot the observations and overlay the fitted line
fitted_y = intercept + slope * x
fig, ax = plt.subplots()
ax.scatter(x, y, label="Data points")
ax.plot(x, fitted_y, color="red", label="Regression line")
ax.set(title="Linear Regression Example", xlabel="X", ylabel="Y")
ax.legend()
plt.show()

k-Means Clustering Using SciPy

In [None]:
from scipy.cluster.vq import kmeans, vq
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global generator so the sample points are identical on every
# run. kmeans also draws from it by default, so its starting centroids repeat.
np.random.seed(42)

# Generate random data: 100 points in the unit square
data = np.random.rand(100, 2)

# Perform k-means clustering
num_clusters = 3
centroids, _ = kmeans(data, num_clusters)
# vq assigns each point the index of its nearest centroid
cluster_labels, _ = vq(data, centroids)

# Plot data with clusters
# kmeans drops centroids that end up with no points, so it can return fewer
# than num_clusters; loop over what it actually returned.
for i in range(len(centroids)):
    cluster_points = data[cluster_labels == i]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {i+1}")

plt.scatter(centroids[:, 0], centroids[:, 1], color="red", marker="x", s=100, label="Centroids")
plt.title("k-Means Clustering Example")
plt.legend()
plt.show()

Database Connectivity (Using mysql.connector)

In [None]:
pip install mysql.connector

Collecting mysql.connector
  Downloading mysql-connector-2.2.9.tar.gz (11.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: mysql.connector
  Building wheel for mysql.connector (setup.py) ... [?25l[?25hdone
  Created wheel for mysql.connector: filename=mysql_connector-2.2.9-cp311-cp311-linux_x86_64.whl size=247949 sha256=cf10e442586fe6a84c0ef1b1baed22a6ba6a541362d7c631c28e7ffc692754a8
  Stored in directory: /root/.cache/pip/wheels/17/cd/ed/2d49e9bac69cf09382e4c7cc20a2511202b48324b87db26019
Successfully built mysql.connector
Installing collected packages: mysql.connector
Successfully installed mysql.connector-2.2.9


In [None]:
import mysql.connector

# Bind both handles up front so the `finally` clause can inspect them even
# when connect() itself raises. Without this, a failed connection ended in
# "NameError: name 'connection' is not defined".
connection = None
cursor = None

# Connect to a MySQL database
# The values below are placeholders; replace them before running.
try:
    connection = mysql.connector.connect(
        host="localhost",
        user="your_username",
        password="your_password",
        database="your_database"
    )

    # Create a cursor to execute queries
    cursor = connection.cursor()

    # Example query
    query = "SELECT * FROM your_table LIMIT 5"
    cursor.execute(query)

    # Fetch and print the results
    for row in cursor.fetchall():
        print(row)

except mysql.connector.Error as err:
    print(f"Error: {err}")
finally:
    # Only release what was actually acquired.
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()

Error: 2003: Can't connect to MySQL server on 'localhost:3306' (111 Connection refused)


NameError: name 'connection' is not defined

Basic Map-Reduce Example

In [None]:
# Import the functools library for reduce
import functools


def _double(value):
    """Map function: scale a single element by 2."""
    return value * 2


def _add(running_total, value):
    """Reduce function: fold the next element into the running sum."""
    return running_total + value


# Sample data
data = [1, 2, 3, 4, 5]

# Map step: apply _double to every element
mapped_data = list(map(_double, data))
print("Mapped Data:", mapped_data)

# Reduce step: collapse the mapped values into their sum
reduced_result = functools.reduce(_add, mapped_data)
print("Reduced Result:", reduced_result)

Advanced Map-Reduce Example (Filter and Product Calculation)

In [None]:
# Import the functools library for reduce
import functools

# Sample data
data = [1, 2, 3, 4, 5, 6]

# Map step: Subtract 2 from each element
mapped_data = list(map(lambda x: x - 2, data))
print("Mapped Data:", mapped_data)

# Filter step: Keep only elements greater than 0
filtered_data = list(filter(lambda x: x > 0, mapped_data))
print("Filtered Data:", filtered_data)

# Reduce step: Calculate the product of all remaining elements
# The initial value 1 is the multiplicative identity. It leaves non-empty
# products unchanged and makes an empty filtered_data yield 1 instead of
# raising TypeError.
reduced_result = functools.reduce(lambda x, y: x * y, filtered_data, 1)
print("Reduced Result:", reduced_result)

Question 1: Linear Regression

In [None]:
# Question 1: Linear Regression
# Write a function `perform_linear_regression(x, y)` that:
# 1. Accepts two lists of numbers `x` and `y` as inputs.
# 2. Calculates the slope and intercept of the linear regression line using the formulas:
#    - Slope: m = (mean(x * y) - mean(x) * mean(y)) / (mean(x^2) - (mean(x))^2)
#    - Intercept: b = mean(y) - m * mean(x)
# 3. Returns the slope and intercept as a tuple.
# Example Input:
# x = [1, 2, 3, 4, 5]
# y = [2, 4, 5, 4, 5]
# Expected Output (up to floating-point rounding):
# Slope: 0.6, Intercept: 2.2

# Closed-form least-squares fit built from the means given in the task
def perform_linear_regression(x, y):
    """Fit the line y = m * x + b to paired observations by least squares.

    Uses the closed-form expressions
        m = (mean(x*y) - mean(x) * mean(y)) / (mean(x^2) - mean(x)^2)
        b = mean(y) - m * mean(x)

    Args:
        x: Sequence of numbers (independent variable).
        y: Sequence of numbers with the same length as ``x``.

    Returns:
        A ``(slope, intercept)`` tuple.

    Raises:
        ValueError: If the inputs are empty, differ in length, or every
            ``x`` value is the same (the slope is then undefined).
    """
    n = len(x)
    if n == 0 or n != len(y):
        raise ValueError("x and y must be non-empty and of equal length")

    mean_x = sum(x) / n
    mean_y = sum(y) / n
    mean_xy = sum(xi * yi for xi, yi in zip(x, y)) / n
    mean_x_squared = sum(xi * xi for xi in x) / n

    # The denominator is the population variance of x.
    variance_x = mean_x_squared - mean_x ** 2
    if variance_x == 0:
        raise ValueError("all x values are identical; slope is undefined")

    slope = (mean_xy - mean_x * mean_y) / variance_x
    intercept = mean_y - slope * mean_x
    return slope, intercept


# Function call
x = [1, 2, 3, 4, 5]
y = [2, 4, 5, 4, 5]
slope, intercept = perform_linear_regression(x, y)
print(f"Slope: {slope}, Intercept: {intercept}")

Question 2: Data Visualization

In [None]:
# Question 2: Data Visualization
# Write a function `plot_bar_chart(categories, values)` that:
# 1. Accepts two lists as inputs:
#    - `categories`: A list of category names (strings).
#    - `values`: A list of corresponding values (numbers).
# 2. Creates a bar chart using `matplotlib` with:
#    - Categories on the x-axis.
#    - Values on the y-axis.
#    - A title "Bar Chart Example."
# 3. Displays the chart.
# Example Input:
# categories = ["A", "B", "C"]
# values = [10, 20, 15]
# Expected Output:
# A bar chart with "A", "B", "C" on the x-axis and bars representing values 10, 20, and 15.

# Bar chart helper built to the specification above
def plot_bar_chart(categories, values):
    """Draw and display a bar chart titled "Bar Chart Example".

    Args:
        categories: List of category names (strings) for the x-axis.
        values: List of numbers, one bar height per category.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    # Imported here so the cell works without relying on an earlier cell.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.bar(categories, values)
    ax.set_title("Bar Chart Example")
    ax.set_xlabel("Categories")
    ax.set_ylabel("Values")
    plt.show()


# Function call
categories = ["A", "B", "C"]
values = [10, 20, 15]
plot_bar_chart(categories, values)