ANS1

In [4]:
pip install psycopg2


Collecting psycopg2
  Downloading psycopg2-2.9.6-cp310-cp310-win_amd64.whl (1.2 MB)
     ---------------------------------------- 1.2/1.2 MB 995.1 kB/s eta 0:00:00
Installing collected packages: psycopg2
Successfully installed psycopg2-2.9.6
Note: you may need to restart the kernel to use updated packages.


In [2]:
    ## data ingestion pipeline that collects and stores data from various sources such as databases, APIs, and streaming platforms.


In [5]:
# Required libraries
import requests
import json
import csv
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine

# Database connection details
db_username = "your_db_username"
db_password = "your_db_password"
db_host = "your_db_host"
db_name = "your_db_name"

# API endpoint
api_endpoint = "https://api.example.com/data"

# Streaming platform details
streaming_platform_host = "your_streaming_platform_host"
streaming_platform_topic = "your_streaming_platform_topic"

# Connect to the database
db_engine = create_engine(f"postgresql://{db_username}:{db_password}@{db_host}/{db_name}")

# Function to collect and store data from APIs
def ingest_data_from_api():
    response = requests.get(api_endpoint)
    if response.status_code == 200:
        data = response.json()
        # Store data in the database
        df = pd.DataFrame(data)
        df.to_sql("api_data", con=db_engine, if_exists="append", index=False)
    else:
        print("Failed to fetch data from API.")

# Function to collect and store data from streaming platforms
def ingest_data_from_streaming_platform():
    # Connect to the streaming platform
    streaming_platform = connect_to_streaming_platform(streaming_platform_host)
    # Subscribe to the topic
    streaming_platform.subscribe(streaming_platform_topic)
    
    # Start consuming messages
    for message in streaming_platform.consume_messages():
        data = json.loads(message)
        # Store data in the database
        df = pd.DataFrame(data)
        df.to_sql("streaming_data", con=db_engine, if_exists="append", index=False)

# Function to collect and store data from databases
def ingest_data_from_database():
    # Connect to the source database
    source_db_engine = create_engine(f"postgresql://{db_username}:{db_password}@{db_host}/{db_name}_source")
    
    # Fetch data from the source database
    query = "SELECT * FROM source_table"
    data = pd.read_sql(query, con=source_db_engine)
    
    # Transform data if required
    data["timestamp"] = datetime.now()
    
    # Store data in the target database
    data.to_sql("target_table", con=db_engine, if_exists="append", index=False)

# Call the functions to ingest data
ingest_data_from_api()
ingest_data_from_streaming_platform()
ingest_data_from_database()



## Implement a real-time data ingestion pipeline for processing sensor data from IoT devices.

# Required libraries
import requests
import json
from datetime import datetime
from kafka import KafkaProducer

# Kafka broker details
kafka_broker = "your_kafka_broker"

# Function to collect and process sensor data from IoT devices
def ingest_sensor_data(device_id, sensor_data):
    # Perform data processing operations
    processed_data = process_sensor_data(sensor_data)
    
    # Add timestamp
    processed_data["timestamp"] = datetime.now().isoformat()
    
    # Convert data to JSON
    data_json = json.dumps(processed_data)
    
    # Send data to Kafka topic
    producer = KafkaProducer(bootstrap_servers=kafka_broker)
    producer.send("sensor_data_topic", value=data_json.encode("utf-8"))

# Call the function to ingest sensor data
ingest_sensor_data("device_id_1", {"temperature": 25, "humidity": 60})



## Develop a data ingestion pipeline that handles data from different file formats (CSV, JSON, etc.) and performs data validation and cleansing


# Required libraries
import pandas as pd

# Function to ingest data from CSV files
def ingest_data_from_csv(file_path):
    # Read CSV file
    df = pd.read_csv(file_path)
    
    # Perform data validation and cleansing
    df = validate_and_cleanse_data(df)
    
    # Store data in the database
    df.to_sql("csv_data", con=db_engine, if_exists="append", index=False)

# Function to ingest data from JSON files
def ingest_data_from_json(file_path):
    # Read JSON file
    with open(file_path) as f:
        data = json.load(f)
    
    # Perform data validation and cleansing
    data = validate_and_cleanse_data(data)
    
    # Store data in the database
    df = pd.DataFrame(data)
    df.to_sql("json_data", con=db_engine, if_exists="append", index=False)

# Call the functions to ingest data
ingest_data_from_csv("data.csv")
ingest_data_from_json("data.json")



ConnectionError: HTTPSConnectionPool(host='api.example.com', port=443): Max retries exceeded with url: /data (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000019A1806CE50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

ANS2

In [6]:
 ##  Build a machine learning model to predict customer churn based on a given dataset

In [8]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


dataset = pd.read_csv("customer_churn.csv")


X = dataset.drop("Churn", axis=1)
y = dataset["Churn"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = RandomForestClassifier()

model.fit(X_train, y_train)


y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)



 ## Develop a model training pipeline that incorporates feature engineering techniques such as one-hot encoding, feature scaling

    


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA












encoder = OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded.toarray())
X_test_scaled = scaler.transform(X_test_encoded.toarray())


pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)


model = RandomForestClassifier()

model.fit(X_train_pca, y_train)


y_pred = model.predict(X_test_pca)


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)



## 

# Required libraries
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator

tf.random.set_seed(42)


base_model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))


for layer in base_model.layers:
    layer.trainable = False


model = Sequential()
model.add(base_model)
model.add(GlobalAveragePooling2D())
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))


model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

train_datagen = ImageDataGenerator(
    rescale=1.0/255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)
test_datagen = ImageDataGenerator(rescale=1.0/255)


train_generator = train_datagen.flow_from_directory(
    "train_data",
    target_size=(224, 224),
    batch_size=32,
    class_mode="binary"
)
test_generator = test_datagen.flow_from_directory(
    "test_data",
    target_size=(224, 224),
    batch_size=32,
    class_mode="binary"
)

# Train the model
model.fit(
    train_generator,
    epochs=10,
    validation_data=test_generator
)


FileNotFoundError: [Errno 2] No such file or directory: 'customer_churn.csv'

ANS3

In [9]:
## cross-validation to evaluate the performance of a regression model for predicting housing prices.

# Required libraries
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Load the dataset
dataset = pd.read_csv("housing_data.csv")

# Split the dataset into features and target variable
X = dataset.drop("Price", axis=1)
y = dataset["Price"]

# Initialize the regression model
model = LinearRegression()

# Perform cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error")

# Print the mean squared error scores
print("Mean Squared Error Scores:", -scores)
print("Mean Squared Error Mean:", -scores.mean())

## Perform model validation using different evaluation metrics such as accuracy, precision, recall, and F1 score for a binary classification problem.
 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# Initialize the classification model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


## . Design a model validation strategy that incorporates stratified sampling to handle imbalanced datasets.


from sklearn.model_selection import train_test_split, StratifiedKFold


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_index, val_index in skf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    y_val_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_val_pred)
    scores.append(score)

mean_score = sum(scores) / len(scores)
print("Mean Cross-Validation Accuracy:", mean_score)


FileNotFoundError: [Errno 2] No such file or directory: 'housing_data.csv'

ANS4

In [10]:
# Required libraries
from flask import Flask, request, jsonify
import your_model_module  # Import your machine learning model module

# Create Flask application
app = Flask(__name__)

# Endpoint for real-time recommendations
@app.route('/recommend', methods=['POST'])
def recommend():
    data = request.json  # Get user interaction data from the request
    # Perform preprocessing on the data if required
    result = your_model_module.generate_recommendations(data)
    return jsonify(result)

# Run the Flask application
if __name__ == '__main__':
    app.run(debug=True)

    
    
    
# Required libraries
import boto3

# Create an Elastic Beanstalk application
eb_client = boto3.client('elasticbeanstalk')
eb_client.create_application(
    ApplicationName='your-application-name',
    PlatformArn='arn:aws:elasticbeanstalk:us-west-2::platform/Docker running on 64bit Amazon Linux/2.17.1'
)

# Create an Elastic Beanstalk environment
eb_client.create_environment(
    ApplicationName='your-application-name',
    EnvironmentName='your-environment-name',
    SolutionStackName='64bit Amazon Linux 2 v3.4.0 running Docker'
)

# Configure CodePipeline
codepipeline_client = boto3.client('codepipeline')
codepipeline_client.create_pipeline(
    pipeline={
        'name': 'your-pipeline-name',
        'roleArn': 'your-role-arn',
        'artifactStore': {
            'type': 'S3',
            'location': 'your-s3-bucket-name'
        },
        'stages': [
            {
                'name': 'Source',
                'actions': [
                    {
                        'name': 'SourceAction',
                        'actionTypeId': {
                            'category': 'Source',
                            'owner': 'AWS',
                            'provider': 'CodeCommit',
                            'version': '1'
                        },
                        'configuration': {
                            'BranchName': 'your-branch-name',
                            'RepositoryName': 'your-repository-name'
                        },
                        'outputArtifacts': [
                            {
                                'name': 'SourceOutput'
                            }
                        ]
                    }
                ]
            },
            {
                'name': 'Build',
                'actions': [
                    {
                        'name': 'BuildAction',
                        'actionTypeId': {
                            'category': 'Build',
                            'owner': 'AWS',
                            'provider': 'CodeBuild',
                            'version': '1'
                        },
                        'configuration': {
                            'ProjectName': 'your-codebuild-project-name'
                        },
                        'inputArtifacts': [
                            {
                                'name': 'SourceOutput'
                            }
                        ],
                        'outputArtifacts': [
                            {
                                'name': 'BuildOutput'
                            }
                        ]
                    }
                ]
            },
            {
                'name': 'Deploy',
                'actions': [
                    {
                        'name': 'DeployAction',
                        'actionTypeId': {
                            'category': 'Deploy',
                            'owner': 'AWS',
                            'provider': 'ElasticBeanstalk',
                            'version': '1'
                        },
                        'configuration': {
                            'ApplicationName': 'your-application-name',
                            'EnvironmentName': 'your-environment-name'
                        },
                        'inputArtifacts': [
                            {
                                'name': 'BuildOutput'
                            }
                        ]
                    }
                ]
            }
        ]
    }
)


ModuleNotFoundError: No module named 'your_model_module'