In [1]:
#Assignment 69

In [3]:
#ans 1:
# End-to-end pipeline for the diabetes data:
#   preprocessing (impute + scale / impute + one-hot) -> univariate feature
#   selection -> random forest, evaluated by accuracy on a 20% hold-out split.

# Import necessary libraries (all of this cell's dependencies, in one place)
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# One seed shared by the split and the forest so the reported accuracy is
# reproducible on Restart & Run All.
RANDOM_STATE = 42

# Load the dataset (path is relative to the notebook's working directory)
diabetes_df = pd.read_csv('diabetes.csv')

# Separate the target variable from the features
X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

# Route columns to the matching sub-pipeline by dtype.
numerical_columns = X.select_dtypes(include='number').columns
categorical_columns = X.select_dtypes(include='object').columns

# Create a feature selection pipeline using SelectKBest and f_classif.
# k=5 keeps the five features with the highest ANOVA F-score; it is applied to
# the preprocessed matrix, i.e. after scaling / one-hot encoding.
feature_selection_pipeline = Pipeline([
    ('select', SelectKBest(f_classif, k=5))
])

# Create a numerical pipeline using SimpleImputer and StandardScaler.
# NOTE(review): mean imputation only fills NaN values. If this dataset encodes
# missing measurements as 0 (verify against the data), they pass through
# unchanged.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Create a categorical pipeline using SimpleImputer and OneHotEncoder.
# handle_unknown='ignore' encodes categories first seen at predict time as an
# all-zero row instead of raising, so scoring the test split cannot fail on a
# rare level that landed only in the hold-out set.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the numerical and categorical pipelines using ColumnTransformer
preprocessing_pipeline = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_columns),
    ('categorical', categorical_pipeline, categorical_columns)
])

# Combine the feature selection pipeline and preprocessing pipeline.
# Keeping every step inside one Pipeline means imputation, scaling and feature
# scoring are fit on the training split only.
full_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('feature_selection', feature_selection_pipeline),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the pipeline on the training data
full_pipeline.fit(X_train, y_train)

# Predict on the test data and evaluate the accuracy
y_pred = full_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 76.62%


In [4]:
#ans 2:
# Hard-voting ensemble of a random forest and a logistic regression on iris,
# evaluated by accuracy on a 30% hold-out split.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# One seed shared by the split and the forest so the reported accuracy is
# reproducible on Restart & Run All.
RANDOM_STATE = 42

# Load the iris dataset
iris = load_iris()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=RANDOM_STATE
)

# Define the pipelines for the random forest classifier and logistic regression classifier.
# The forest is scale-invariant, so it needs no preprocessing; it is seeded so
# repeated runs build the same trees.
rf_pipeline = Pipeline([('rf', RandomForestClassifier(random_state=RANDOM_STATE))])
# Logistic regression is standardized first: the default solver is iterative
# and may hit its iteration cap (with a convergence warning) on raw features.
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression())
])

# Combine the two pipelines into a voting classifier.
# voting='hard' is the library default (majority vote on predicted labels); it
# is spelled out here so the ensembling rule is visible.
voting_clf = VotingClassifier(
    estimators=[('rf', rf_pipeline), ('lr', lr_pipeline)],
    voting='hard'
)

# Train the voting classifier on the training set
voting_clf.fit(X_train, y_train)

# Evaluate the accuracy of the voting classifier on the testing set
accuracy = voting_clf.score(X_test, y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 100.00%
