In [None]:
# Import necessary libraries
import torch
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import evaluate
import gradio as gr

# Load the AG News dataset by its identifier; the "train" and "test" splits
# of the returned object are used further down in this cell.
dataset = load_dataset("ag_news")

# Load the BERT tokenizer that matches the "bert-base-uncased" checkpoint
# used for the classification model below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level BERT tokenizer.

    Args:
        examples: A batch mapping as passed by ``dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # Padding is intentionally not applied here: the DataCollatorWithPadding
    # given to the Trainer pads each batch to its own longest sequence, which
    # is far cheaper than padding every headline to the maximum length.
    return tokenizer(examples["text"], truncation=True)

# Apply the tokenizer to every split, one batch of examples at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Prepare datasets for PyTorch: only the listed columns are returned, as torch tensors.
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

# Load the BERT model for classification; num_labels=4 matches the four
# topic names listed in classify_news.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

# Load evaluation metrics (used by compute_metrics below)
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

# Define evaluation metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a Trainer evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the index of its largest logit.

    Returns:
        A flat dict ``{"accuracy": <scalar>, "f1": <scalar>}``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)
    # Each metric's compute() returns a dict keyed by the metric name; unwrap
    # it so the returned values are plain scalars rather than nested dicts.
    accuracy_result = accuracy.compute(predictions=predictions, references=labels)
    f1_result = f1.compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written under this directory
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",  # evaluate once per epoch ...
    save_strategy="epoch",  # ... and save on the same schedule
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create data collator (pads the examples of each batch using the tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create the Trainer; the test split doubles as the evaluation set here,
# so no separate validation split is held out.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model. The return value is not assigned or printed, and this
# is not the last expression of the cell, so it is not displayed.
trainer.evaluate()

# Define inference function for Gradio
def classify_news(text):
    """Classify a news headline and return per-topic confidences.

    Args:
        text: The headline to classify.

    Returns:
        A dict mapping each of the four topic names to its softmax
        probability, which is the label-to-confidence form ``gr.Label`` displays.
    """
    labels = ["World", "Sports", "Business", "Sci/Tech"]
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # The model may live on a different device than freshly tokenized
    # tensors (e.g. after training on a GPU), so move the inputs to it.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    model.eval()  # make sure dropout is disabled for inference
    with torch.no_grad():
        logits = model(**inputs).logits
    # Convert raw logits of the single input row into probabilities.
    probabilities = torch.softmax(logits[0], dim=-1).tolist()
    return dict(zip(labels, probabilities))

# Create Gradio interface: a two-line text box feeds classify_news, and the
# returned label-to-score dict is rendered by a Label showing all four classes.
iface = gr.Interface(
    fn=classify_news,
    inputs=gr.Textbox(lines=2, placeholder="Enter a news headline..."),
    outputs=gr.Label(num_top_classes=4),
    title="News Topic Classifier",
    description="Classify a news headline into World, Sports, Business, or Sci/Tech."
)

# Launch the Gradio app
iface.launch()



# Task 2


In [None]:

!pip install seaborn scikit-learn pandas matplotlib -q


In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib


In [None]:

# Load the Telco churn CSV and clean it in one chain, so that no column is
# ever assigned on a filtered view (which raises SettingWithCopyWarning).
url = "https://raw.githubusercontent.com/blastchar/telco-customer-churn/master/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = (
    pd.read_csv(url)
    # TotalCharges contains blank strings; coerce anything unparseable to NaN
    # and drop those rows instead of matching one exact blank value.
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame["TotalCharges"], errors="coerce"))
    .dropna(subset=["TotalCharges"])
    # customerID is removed so it is not picked up as a feature column.
    .drop(columns=["customerID"])
    # Encode the target as 1 for "Yes" and 0 for "No".
    .assign(Churn=lambda frame: frame["Churn"].map({"Yes": 1, "No": 0}))
)
df.head()


In [None]:

# Column groups: object-typed columns are one-hot encoded, int/float columns
# are standardized.
categorical_cols = list(df.select_dtypes(include="object").columns)
numerical_cols = list(df.select_dtypes(include=["int64", "float64"]).columns)
numerical_cols.remove("Churn")  # the target is numeric but is not a feature

# One single-step pipeline per column group.
numeric_transformer = Pipeline([("scaler", StandardScaler())])
categorical_transformer = Pipeline([("encoder", OneHotEncoder(handle_unknown="ignore"))])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)


In [None]:

# Create full pipeline: preprocessing followed by a logistic-regression
# classifier, fitted together as one estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Split dataset: 80/20 train/test with a fixed seed. X/y and the resulting
# splits are reused by the Random Forest grid-search cell.
X = df.drop("Churn", axis=1)
y = df["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train on the training split and predict the held-out split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Per-class report on the held-out split
print(classification_report(y_test, y_pred))


In [None]:

# GridSearch for Random Forest
# The forest is seeded (same seed as the train/test split) so that the
# selected parameters and the reported scores are reproducible across runs.
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Keys use the "<step name>__<parameter>" form to reach into the pipeline.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [5, 10]
}

# 3-fold cross-validated search over the 2 x 2 grid, on the training split only.
grid = GridSearchCV(pipeline_rf, param_grid, cv=3)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print(classification_report(y_test, grid.predict(X_test)))


In [None]:

# Save best model: the best estimator found by the grid search is the whole
# pipeline (preprocessor + classifier), written to the working directory.
joblib.dump(grid.best_estimator_, "telco_churn_pipeline.joblib")


# Task 3

In [None]:
!pip install opendatasets scikit-learn tensorflow pandas matplotlib seaborn -q


In [None]:
import opendatasets as od
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Concatenate, Input
from tensorflow.keras.optimizers import Adam


In [None]:
# Download housing dataset from Kaggle
od.download("https://www.kaggle.com/datasets/kumarkalyan/houses-dataset")
data_dir = "houses-dataset"
# NOTE(review): read with pandas' default CSV settings; later cells expect the
# columns "Image", "Bedrooms", "Bathrooms", "Area", "Stories" and "Price" —
# confirm that HousesInfo.txt really has a comma-separated header row.
# This rebinds `df`, which previously held the Telco churn frame.
df = pd.read_csv(os.path.join(data_dir, "HousesInfo.txt"))
df.head()


In [None]:
# Filter entries with existing images
image_root = os.path.join(data_dir, "Houses Dataset")
# .copy() yields an independent frame, so the column assignment below does
# not write into a filtered view (which raises SettingWithCopyWarning).
df = df[df["Image"] != " "].copy()
# Turn each image file name into a path under the dataset's image folder.
# Re-running this cell would prefix the paths a second time; re-run the
# loading cell first if this one needs to be executed again.
df["Image"] = df["Image"].apply(lambda name: os.path.join(image_root, name))
# Keep only rows whose image file is actually present on disk.
df = df[df["Image"].apply(os.path.exists)].reset_index(drop=True)


In [None]:
# Load VGG16 for image feature extraction
IMG_SIZE = (224, 224)
# ImageNet weights without the classification head; the input shape is the
# image size plus three colour channels.
base_model = VGG16(
    weights="imagenet",
    include_top=False,
    input_shape=(*IMG_SIZE, 3),
)
# The extractor maps an input image batch to the base model's final output.
feature_extractor = Model(inputs=base_model.input, outputs=base_model.output)

def extract_image_features(path):
    """Return the flattened feature-extractor output for one image file.

    The image at ``path`` is loaded at ``IMG_SIZE``, turned into a one-image
    batch, run through ``preprocess_input`` and then through
    ``feature_extractor``; the prediction is flattened to a 1-D array.
    """
    pixels = img_to_array(load_img(path, target_size=IMG_SIZE))
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    return feature_extractor.predict(batch, verbose=0).flatten()

# Extract for first 100 samples for speed (one predict call per image).
# The same 100-row limit is repeated for the tabular features and prices in
# the next cell; the three slices must stay in sync.
image_features = np.array([extract_image_features(p) for p in df["Image"][:100]])


In [None]:
# Tabular data and price labels, limited to the first 100 rows so they line
# up row-for-row with image_features. Values are used as-is (no scaling).
tabular_features = df[["Bedrooms", "Bathrooms", "Area", "Stories"]].iloc[:100].values
prices = df["Price"].iloc[:100].values


In [None]:
# Split tabular features, image features and prices in one call so all three
# share the same 80/20 row assignment (seeded for reproducibility).
# This rebinds y_train / y_test, which previously held the churn labels.
X_tab_train, X_tab_test, X_img_train, X_img_test, y_train, y_test = train_test_split(
    tabular_features, image_features, prices, test_size=0.2, random_state=42
)


In [None]:
# Two-input regression network: one input per modality, sized from the
# training arrays' feature dimension.
tab_input = Input(shape=(X_tab_train.shape[1],))
img_input = Input(shape=(X_img_train.shape[1],))

# Separate dense branch for each modality
x1 = Dense(64, activation='relu')(tab_input)
x2 = Dense(128, activation='relu')(img_input)

# Merge both branches, then reduce to a single unit with no activation
# (the predicted price).
combined = Concatenate()([x1, x2])
z = Dense(64, activation='relu')(combined)
z = Dense(1)(z)

# NOTE: this rebinds `model`, which earlier held the BERT classifier that
# classify_news reads as a global.
model = Model(inputs=[tab_input, img_input], outputs=z)
model.compile(optimizer=Adam(1e-3), loss='mse')
model.summary()


In [None]:
# Train on both inputs (same order as the model's inputs list); no validation
# data is passed, so only the training loss is reported per epoch.
model.fit([X_tab_train, X_img_train], y_train, epochs=10, batch_size=8, verbose=1)


In [None]:
# Predict prices for the held-out rows and report MAE and RMSE
# (RMSE is the square root of the mean squared error).
preds = model.predict([X_tab_test, X_img_test])
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}")
