In [None]:
import os 
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from transformers import DistilBertTokenizer, DistilBertModel
from PIL import Image
import pandas as pd  # For reading Excel file
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

# Device Configuration: run on the GPU when CUDA is usable, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Dataset Class
class ImageTextDataset(Dataset):
    """Dataset of (image, tokenized text, label) triples paired by position.

    Sample ``idx`` is assembled from ``image_paths[idx]``, ``texts[idx]`` and
    ``labels[idx]``. The reported length is ``len(image_paths)``.

    Args:
        image_paths: Indexable collection of image file paths.
        texts: Indexable collection of texts handed to ``tokenizer``.
        labels: Indexable collection of class ids (converted to ``torch.long``).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            truncation=True, padding="max_length", max_length=max_length)``;
            its result must expose ``.items()`` yielding tensors.
        transform: Optional callable applied to the RGB ``PIL`` image. When
            omitted, the image is resized to 224x224, converted to a tensor and
            normalised per channel (the constants look like the usual
            ImageNet statistics -- confirm if the backbone changes).
        max_length: Padded/truncated token length passed to the tokenizer.
    """

    def __init__(self, image_paths, texts, labels, tokenizer, transform=None, max_length=512):
        self.image_paths = image_paths
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Build the preprocessing pipeline once instead of on every
        # __getitem__ call; it is identical for all samples.
        if transform is None:
            transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ])
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image_tensor, text_tokens, label)`` for sample ``idx``."""
        # Load and preprocess image. The context manager releases the file
        # handle; convert() returns an independent in-memory copy, so the
        # image stays usable after the file is closed.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = self.transform(image)

        # Tokenize text
        text_tokens = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )
        # Drop the leading size-1 dimension of every returned tensor so the
        # DataLoader can stack samples into a batch.
        text_tokens = {key: value.squeeze(0) for key, value in text_tokens.items()}

        # Label
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return image_tensor, text_tokens, label


# Model Definition
class ImageTextClassifier(nn.Module):
    """Two-branch classifier that fuses image and text features.

    The image branch is a ResNet-101 (ImageNet weights) whose final ``fc``
    layer is replaced by a projection to ``feature_dim``. The text branch is
    ``distilbert-base-uncased`` followed by a linear projection to
    ``feature_dim``. Both projections are concatenated and fed to a single
    linear layer producing ``num_classes`` logits.

    Args:
        num_classes: Number of output logits (default 2, the previous
            hard-coded value).
        feature_dim: Size of each per-modality projection (default 512, the
            previous hard-coded value).

    Attribute names (``image_model``, ``text_model``, ``text_fc``,
    ``classifier``) are unchanged, so state dicts saved with the default
    sizes remain loadable.
    """

    def __init__(self, num_classes=2, feature_dim=512):
        super().__init__()

        # CNN for image analysis (using ResNet-101)
        self.image_model = models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V1)
        self.image_model.fc = nn.Linear(self.image_model.fc.in_features, feature_dim)

        # DistilBERT for text analysis. The hidden width is read from the
        # loaded config instead of assuming 768, so the projection stays
        # correct if the checkpoint name is ever changed.
        self.text_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text_fc = nn.Linear(self.text_model.config.dim, feature_dim)

        # Classifier combining both modalities
        self.classifier = nn.Linear(feature_dim + feature_dim, num_classes)

    def forward(self, image, text):
        """Return logits for a batch.

        Args:
            image: Batch tensor passed directly to the image backbone.
            text: Mapping of keyword arguments forwarded to the text model
                (it is unpacked with ``**text``).
        """
        # Image features
        image_features = self.image_model(image)

        # Text features: keep only sequence position 0 of the last hidden
        # state (presumably the [CLS] token -- depends on the tokenizer).
        text_features = self.text_model(**text).last_hidden_state[:, 0, :]
        text_features = self.text_fc(text_features)

        # Combine features along the feature axis and classify
        combined_features = torch.cat((image_features, text_features), dim=1)
        output = self.classifier(combined_features)

        return output


# Train Function
def train_model(model, train_loader, val_loader, epochs=5, lr=1e-5,
                checkpoint_path="image_text_classifier.pth"):
    """Train ``model`` and report validation accuracy after every epoch.

    The model and every batch are moved to the module-level ``device``.
    Training uses cross-entropy loss with the Adam optimiser. After each
    epoch the summed training loss and the validation accuracy are printed
    and the model's ``state_dict`` is written to ``checkpoint_path``
    (overwriting the previous epoch's file).

    Args:
        model: Module called as ``model(images, texts)`` returning logits.
        train_loader: Iterable of ``(images, texts, labels)`` batches where
            ``texts`` is a dict of tensors.
        val_loader: Iterable with the same batch structure, used for accuracy.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate (default is the previous hard-coded value).
        checkpoint_path: Destination of the per-epoch checkpoint (default is
            the previous hard-coded file name).
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for images, texts, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)
            texts = {key: value.to(device) for key, value in texts.items()}

            # Forward pass
            outputs = model(images, texts)
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for images, texts, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)
                texts = {key: value.to(device) for key, value in texts.items()}

                outputs = model(images, texts)
                predictions = torch.argmax(outputs, dim=1)

                correct += (predictions == labels).sum().item()
                total += labels.size(0)

        # An empty validation loader used to raise ZeroDivisionError here;
        # report NaN instead so training can still finish and checkpoint.
        accuracy = correct / total if total else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}")
        # Save model checkpoint
        torch.save(model.state_dict(), checkpoint_path)


def prepare_dataset(excel_file, images_folder, test_size=0.2, batch_size=4, random_state=42):
    """Build the training and validation ``DataLoader`` objects.

    Texts and labels come from the ``'Text'`` and ``'Label'`` columns of the
    spreadsheet; images come from ``images_folder``. Row ``i`` of the
    spreadsheet is paired with the ``i``-th image file in sorted filename
    order.
    NOTE(review): confirm the spreadsheet rows really are ordered to match
    the sorted filenames -- nothing in the data itself links a row to a file.

    Args:
        excel_file: Path of the Excel workbook read with ``pd.read_excel``.
        images_folder: Directory whose image files are used.
        test_size: Fraction of samples held out for validation.
        batch_size: Batch size of both loaders.
        random_state: Seed for the train/validation split.

    Returns:
        ``(train_loader, val_loader)``; only the training loader shuffles.

    Raises:
        ValueError: If the number of image files differs from the number of
            spreadsheet rows, since samples could not be paired one-to-one.
    """
    # Read the Excel file
    df = pd.read_excel(excel_file)

    # Collect image files only (sub-directories and stray files such as
    # Thumbs.db would fail in Image.open) and sort them: os.listdir returns
    # entries in arbitrary order, which made the pairing non-reproducible.
    image_extensions = (".jpg", ".jpeg", ".jfif", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp")
    image_files = sorted(
        os.path.join(images_folder, name)
        for name in os.listdir(images_folder)
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(images_folder, name))
    )

    # Extract 'Text' and 'Label' columns
    texts = df['Text'].tolist()
    labels = df['Label'].tolist()

    if len(image_files) != len(texts):
        raise ValueError(
            f"Found {len(image_files)} image files in {images_folder!r} but "
            f"{len(texts)} rows in {excel_file!r}; each row needs exactly one image."
        )

    # Convert labels to integer ids. Sorting the distinct labels keeps the
    # mapping identical across runs (iteration order of a set of strings is
    # not stable between interpreter sessions); key=str tolerates mixed types.
    label_map = {label: idx for idx, label in enumerate(sorted(set(labels), key=str))}
    labels = [label_map[label] for label in labels]

    # Split images, texts and labels in ONE call so every sample keeps its
    # image/text/label together in the same partition.
    (train_images, val_images,
     train_texts, val_texts,
     train_labels, val_labels) = train_test_split(
        image_files, texts, labels, test_size=test_size, random_state=random_state
    )

    # Tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # Create datasets and loaders
    train_dataset = ImageTextDataset(train_images, train_texts, train_labels, tokenizer)
    val_dataset = ImageTextDataset(val_images, val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader


# Main Execution
if __name__ == "__main__":
    # Input locations: the spreadsheet holding the texts/labels and the
    # directory holding the images.
    excel_file = r"C:\Users\Lenovo\Desktop\model train\balanced_text_classification_dataset.xlsx"
    images_folder = r"C:\Users\Lenovo\Downloads\archive\natural_images\real images"

    # Build both data loaders from those inputs
    train_loader, val_loader = prepare_dataset(
        excel_file=excel_file,
        images_folder=images_folder,
    )

    # Create a fresh classifier and fit it for five epochs
    model = ImageTextClassifier()
    train_model(
        model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=5,
    )

    print("Model training complete and saved.")


In [1]:
import pandas as pd

# Sample data for student scores (with some missing values)
data = {
    'Student': ['A', 'B', 'C', 'D', 'E'],
    'Math': [85, 90, None, 95, 92],
    'Science': [88, None, 90, 85, 91],
    'English': [None, 85, 78, 88, 90]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Forward fill: each missing value is replaced by the last valid value above
# it in the same column (this is propagation, not interpolation). A missing
# value in the first row has nothing above it and therefore stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill'), and
# rebinding the result avoids in-place mutation.
df = df.ffill()

# Display the DataFrame after forward fill
print("\nDataFrame after forward fill:")
print(df)


Original DataFrame:
  Student  Math  Science  English
0       A  85.0     88.0      NaN
1       B  90.0      NaN     85.0
2       C   NaN     90.0     78.0
3       D  95.0     85.0     88.0
4       E  92.0     91.0     90.0

DataFrame after forward fill:
  Student  Math  Science  English
0       A  85.0     88.0      NaN
1       B  90.0     88.0     85.0
2       C  90.0     90.0     78.0
3       D  95.0     85.0     88.0
4       E  92.0     91.0     90.0


  df.fillna(method='ffill', inplace=True)


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Toy academic records: three subject marks per student plus the outcome label
data = {
    'Math': [85, 90, 75, 95, 88, 72, 92, 80],
    'Science': [88, 85, 92, 95, 89, 78, 91, 82],
    'English': [78, 85, 70, 88, 90, 74, 92, 80],
    'Performance': ['Pass', 'Pass', 'Fail', 'Pass', 'Pass', 'Fail', 'Pass', 'Fail']  # Target variable
}
df = pd.DataFrame(data)

# Everything except the outcome column is a feature; the outcome is the target
X = df.drop(columns=['Performance'])
y = df['Performance']

# Hold out 30% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the decision tree (fit returns the estimator itself) and predict
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation metrics on the held-out rows
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report the metrics
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")

# Per-sample comparison of predicted and true labels
print("\nTest Predictions:")
for idx, prediction in enumerate(y_pred):
    print(f"Test Sample {idx+1}: Predicted - {prediction}, Actual - {y_test.values[idx]}")


Model Accuracy: 66.67%

Confusion Matrix:
[[1 0]
 [1 1]]

Classification Report:
              precision    recall  f1-score   support

        Fail       0.50      1.00      0.67         1
        Pass       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3


Test Predictions:
Test Sample 1: Predicted - Pass, Actual - Pass
Test Sample 2: Predicted - Fail, Actual - Fail
Test Sample 3: Predicted - Fail, Actual - Pass


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Sample dataset for loan risk classification
data_loan = {
    'Income': [4000, 5000, 3000, 8000, 7000, 2000, 6000, 4500],
    'Credit_Score': [700, 750, 650, 800, 720, 580, 690, 710],
    'Loan Amount': [2000, 2500, 1500, 5000, 3000, 1000, 4000, 2700],
    'Risk': ['Low', 'Low', 'High', 'Low', 'Low', 'High', 'Low', 'High']
}

# Create DataFrame
df_loan = pd.DataFrame(data_loan)

# Split data into features and target
X = df_loan[['Income', 'Credit_Score', 'Loan Amount']]
y = df_loan['Risk']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
model_loan = LogisticRegression(random_state=42)
model_loan.fit(X_train, y_train)

# Predict on test data
y_pred_loan = model_loan.predict(X_test)

# Evaluate model
print(f"Accuracy: {accuracy_score(y_test, y_pred_loan) * 100:.2f}%")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_loan))
print("\nClassification Report:")
# With only three test rows a class may never be predicted, which makes its
# precision 0/0. zero_division=0 reports that case as 0.00 (the same number
# as before) without emitting UndefinedMetricWarning for every average.
print(classification_report(y_test, y_pred_loan, zero_division=0))


Accuracy: 33.33%

Confusion Matrix:
[[1 0]
 [2 0]]

Classification Report:
              precision    recall  f1-score   support

        High       0.33      1.00      0.50         1
         Low       0.00      0.00      0.00         2

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
from sklearn.naive_bayes import MultinomialNB

# Sample email dataset (features: email length, number of spam keywords)
data_spam = {
    'Email Length': [100, 50, 300, 20, 150, 250, 90, 400],
    'Spam Keywords': [3, 1, 5, 0, 2, 6, 1, 7],
    'Spam': ['Not Spam', 'Spam', 'Spam', 'Not Spam', 'Not Spam', 'Spam', 'Not Spam', 'Spam']
}

# Create DataFrame
df_spam = pd.DataFrame(data_spam)

# Split dataset
X_spam = df_spam[['Email Length', 'Spam Keywords']]
y_spam = df_spam['Spam']

X_train_spam, X_test_spam, y_train_spam, y_test_spam = train_test_split(X_spam, y_spam, test_size=0.3, random_state=42)

# Train Naive Bayes model
model_spam = MultinomialNB()
model_spam.fit(X_train_spam, y_train_spam)

# Predict and evaluate
y_pred_spam = model_spam.predict(X_test_spam)

print(f"Accuracy: {accuracy_score(y_test_spam, y_pred_spam) * 100:.2f}%")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_spam, y_pred_spam))
print("\nClassification Report:")
# With only three test rows a class may never be predicted, which makes its
# precision 0/0. zero_division=0 reports that case as 0.00 (the same number
# as before) without emitting UndefinedMetricWarning for every average.
print(classification_report(y_test_spam, y_pred_spam, zero_division=0))


Accuracy: 33.33%

Confusion Matrix:
[[1 0]
 [2 0]]

Classification Report:
              precision    recall  f1-score   support

    Not Spam       0.33      1.00      0.50         1
        Spam       0.00      0.00      0.00         2

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Toy symptom table: three binary symptom flags per patient plus the diagnosis
data = dict(
    Fever=[1, 0, 1, 0, 1, 0],
    Cough=[1, 0, 1, 1, 0, 1],
    Fatigue=[0, 1, 1, 0, 1, 1],
    Disease=['Disease A', 'Disease B', 'Disease A', 'Disease B', 'Disease A', 'Disease B'],  # Target variable
)
df = pd.DataFrame(data)

# Symptom columns are the features; the diagnosis column is the target
X = df.drop(columns=['Disease'])
y = df['Disease']

# Hold out 30% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the decision tree (fit returns the estimator itself), then predict
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

   Disease A       1.00      1.00      1.00         1
   Disease B       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [23]:
from sklearn.linear_model import LinearRegression

# Toy housing table: size and bedroom count as predictors, sale price as target
data_houses = {
    'Square Footage': [1500, 2000, 1200, 2500, 1800, 1600, 2200, 2100],
    'Num Bedrooms': [3, 4, 2, 5, 3, 2, 4, 4],
    'Price': [300000, 400000, 200000, 500000, 350000, 250000, 450000, 420000]
}
df_houses = pd.DataFrame(data_houses)

# Predictors are every column except the price; the price is the target
X_houses = df_houses.drop(columns=['Price'])
y_houses = df_houses['Price']

# Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train_house, X_test_house, y_train_house, y_test_house = train_test_split(
    X_houses, y_houses, test_size=0.2, random_state=42
)

# Fit ordinary least squares (fit returns the estimator itself)
model_house = LinearRegression().fit(X_train_house, y_train_house)

# Predicted prices for the held-out houses
y_pred_house = model_house.predict(X_test_house)

# Show the fitted parameters, then predictions next to the true prices
print("\nCoefficients:", model_house.coef_)
print("Intercept:", model_house.intercept_)
print("\nPredicted Prices:", y_pred_house)
print("\nActual Prices:", y_test_house.to_numpy())



Coefficients: [  202.65957447 11542.55319149]
Intercept: -52074.46808510646

Predicted Prices: [399414.89361702 295265.95744681]

Actual Prices: [400000 250000]
