Upload the Dataset

In [None]:
from google.colab import files
# Ask Colab for a file upload; whatever files.upload() returns is kept in `uploaded`.
uploaded = files.upload()

Saving cybersecurity-ai-in-2022-2023.ipynb to cybersecurity-ai-in-2022-2023.ipynb


Load the Dataset

In [None]:
import pandas as pd

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame and preview its first rows.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when anything goes wrong while
        loading (the error is printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)

        # Confirm the load and show a preview for a quick visual check
        print("Dataset loaded successfully!")
        print(frame.head())
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# Example usage
# NOTE: the CSV must exist in the working directory under this exact name; the
# recorded run of this cell ended with "No such file or directory".
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
dataset = load_dataset(file_path)  # None when loading failed


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


 Data Exploration


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk import FreqDist
from wordcloud import WordCloud

# Load the dataset (CSV)
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# Basic Data Exploration
def basic_exploration(dataset):
    """Print a structural overview of ``dataset``.

    The report consists of, in order: the ``info()`` summary, descriptive
    statistics from ``describe()``, the per-column count of missing values,
    and the first rows.

    Parameters:
        dataset: pandas DataFrame to describe.

    Returns:
        None. Everything is written to stdout.
    """
    # Display basic information about the dataset
    print("Dataset Info:")
    # DataFrame.info() writes its own report and returns None, so wrapping it
    # in print() would add a stray "None" line after the summary.
    dataset.info()

    # Descriptive statistics (for numerical columns)
    print("\nDescriptive Statistics:")
    print(dataset.describe())

    # Check for missing values
    print("\nMissing values per column:")
    print(dataset.isnull().sum())

    # Show the first few rows to understand the structure
    print("\nFirst few rows of the dataset:")
    print(dataset.head())

# Text Exploration: Word Frequency & WordCloud
def text_exploration(dataset):
    """Show word frequencies and a word cloud for the 'User_Query' column.

    Queries are lower-cased and tokenized with NLTK's ``word_tokenize``; the
    ten most common tokens are printed and a word cloud of all tokens is drawn.

    Parameters:
        dataset: pandas DataFrame with a 'User_Query' column.

    Returns:
        None. Output goes to stdout and to a matplotlib figure.
    """
    from nltk.tokenize import word_tokenize

    # Tokenizer data: 'punkt' serves older NLTK releases; newer releases look
    # for 'punkt_tab' instead, so both are requested.
    nltk.download('punkt')
    nltk.download('punkt_tab')

    # Drop missing queries and coerce the rest to text so that non-string
    # cells (e.g. numbers) do not break the .lower() call below
    user_queries = dataset['User_Query'].dropna().astype(str)

    # Flatten all tokens into a single list (ignoring case)
    all_words = [
        token
        for query in user_queries
        for token in word_tokenize(query.lower())
    ]

    # Frequency Distribution of Words
    fdist = FreqDist(all_words)
    print("\nTop 10 most frequent words in user queries:")
    print(fdist.most_common(10))

    # A word cloud needs at least one word; stop here for empty input
    if not fdist:
        print("\nNo words found in user queries; skipping word cloud.")
        return

    # Create a word cloud for better visualization
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(fdist)

    # Display word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# Intent Distribution Analysis
def intent_analysis(dataset):
    """Draw a bar chart of how often each value occurs in the 'Intent' column.

    Parameters:
        dataset: pandas DataFrame with an 'Intent' column.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    counts_by_intent = dataset['Intent'].value_counts()
    labels, heights = counts_by_intent.index, counts_by_intent.values

    plt.figure(figsize=(10, 6))
    sns.barplot(x=labels, y=heights, palette='viridis')
    plt.xlabel('Intent')
    plt.ylabel('Number of Queries')
    plt.title('Intent Distribution in Customer Queries')
    plt.xticks(rotation=45)  # tilt the category labels
    plt.show()

# Response Length Analysis
def response_length_analysis(dataset):
    """Plot the distribution of bot response lengths, measured in words.

    A 'Response_Length' column holding the word count of each 'Bot_Response'
    is added to ``dataset`` (the caller's frame is modified). Missing
    responses count as 0 words rather than as the single word "nan".

    Parameters:
        dataset: pandas DataFrame with a 'Bot_Response' column.

    Returns:
        None. The histogram is shown with ``plt.show()``.
    """
    # Word count per response; str() keeps non-text cells from crashing, and
    # the isna() guard keeps NaN from being counted as the word "nan"
    dataset['Response_Length'] = dataset['Bot_Response'].apply(
        lambda x: 0 if pd.isna(x) else len(str(x).split())
    )

    # Plot distribution of response lengths
    plt.figure(figsize=(10, 6))
    sns.histplot(dataset['Response_Length'], kde=True, color='blue', bins=20)
    plt.title('Distribution of Bot Response Lengths')
    plt.xlabel('Response Length (in words)')
    plt.ylabel('Frequency')
    plt.show()

# Main program for exploration
def explore_customer_support_chatbot(file_path):
    """Load the CSV at ``file_path`` and run every exploration step on it.

    The steps run in this order: basic exploration, text exploration, intent
    analysis, response length analysis. Nothing runs if loading fails.

    Parameters:
        file_path: Path of the CSV file to explore.

    Returns:
        None.
    """
    dataset = load_dataset(file_path)
    if dataset is not None:
        exploration_steps = (
            basic_exploration,         # structure, statistics, missing values
            text_exploration,          # word frequency and word cloud
            intent_analysis,           # distribution of intents
            response_length_analysis,  # bot response length distribution
        )
        for run_step in exploration_steps:
            run_step(dataset)

# Example Usage
# NOTE: the exploration silently stops after the load step when the file is
# missing; the recorded run of this cell ended with "No such file or directory".
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
explore_customer_support_chatbot(file_path)


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


Check for Missing Values and Duplicates


In [None]:
import pandas as pd

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# Check for Missing Values
def check_missing_values(dataset):
    """Report missing values per column, as counts and as percentages.

    Columns where more than 50% of the rows are missing are listed separately.

    Parameters:
        dataset: pandas DataFrame to inspect.

    Returns:
        None. The report is written to stdout.
    """
    null_counts = dataset.isnull().sum()
    print("\nMissing values per column:")
    print(null_counts)

    # Share of rows that are missing, per column
    null_share = (null_counts / len(dataset)) * 100
    print("\nPercentage of missing values per column:")
    print(null_share)

    # Strictly more than half missing
    mostly_missing = null_share[null_share > 50]
    if mostly_missing.empty:
        print("\nNo columns with more than 50% missing values.")
        return
    print("\nColumns with more than 50% missing values:")
    print(mostly_missing)

# Check for Duplicates
def check_duplicates(dataset):
    """Report duplicate rows, both overall and by query/response pair.

    Parameters:
        dataset: pandas DataFrame with 'User_Query' and 'Bot_Response' columns.

    Returns:
        None. The report is written to stdout.
    """
    # Rows that repeat an earlier row in every column
    print(f"\nNumber of duplicate rows: {dataset.duplicated().sum()}")

    # Rows that repeat an earlier (User_Query, Bot_Response) pair
    pair_columns = ['User_Query', 'Bot_Response']
    repeated_mask = dataset.duplicated(subset=pair_columns)
    repeated_total = repeated_mask.sum()
    print(f"\nNumber of duplicate entries based on 'User_Query' and 'Bot_Response': {repeated_total}")

    # Preview a few of the repeated pairs, if there are any
    if repeated_total > 0:
        print("\nSome duplicate entries:")
        print(dataset[repeated_mask].head())

# Handle Missing Values (Optional)
def handle_missing_values(dataset):
    """Drop every row that has at least one missing value.

    The number of dropped rows and the resulting shape are printed. Filling
    gaps with placeholder values via ``fillna`` would be an alternative to
    dropping; this function only drops.

    Parameters:
        dataset: pandas DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame holding only the complete rows.
    """
    complete_rows = dataset.dropna()
    dropped_count = len(dataset) - len(complete_rows)
    print(f"\nRows dropped with missing values: {dropped_count}")
    print(f"Shape of dataset after dropping missing values: {complete_rows.shape}")
    return complete_rows

# Main function to check for missing values and duplicates
def check_missing_and_duplicates(file_path):
    """Load a CSV, report missing values and duplicates, then drop incomplete rows.

    Parameters:
        file_path: Path of the CSV file to check.

    Returns:
        The dataset after ``handle_missing_values`` has been applied, or
        ``None`` when the file could not be loaded.
    """
    dataset = load_dataset(file_path)
    if dataset is None:
        return

    # Reporting only: neither call changes the data
    for report in (check_missing_values, check_duplicates):
        report(dataset)

    # Cleaning: hand back the rows that survive the missing-value handling
    return handle_missing_values(dataset)

# Example Usage
# NOTE: cleaned_dataset is None when the file cannot be loaded; the recorded
# run of this cell ended with "No such file or directory".
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
cleaned_dataset = check_missing_and_duplicates(file_path)


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


Visualize a Few Features


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# 1. Visualize User Query Length Distribution
def visualize_user_query_length(dataset):
    """Plot the distribution of user query lengths, measured in words.

    A 'User_Query_Length' column holding the word count of each 'User_Query'
    is added to ``dataset`` (the caller's frame is modified). Missing queries
    count as 0 words rather than as the single word "nan".

    Parameters:
        dataset: pandas DataFrame with a 'User_Query' column.

    Returns:
        None. The histogram is shown with ``plt.show()``.
    """
    # Word count per query; str() keeps non-text cells from crashing, and the
    # isna() guard keeps NaN from being counted as the word "nan"
    dataset['User_Query_Length'] = dataset['User_Query'].apply(
        lambda x: 0 if pd.isna(x) else len(str(x).split())
    )

    # Plot the distribution of user query lengths
    plt.figure(figsize=(10, 6))
    sns.histplot(dataset['User_Query_Length'], kde=True, color='blue', bins=30)
    plt.title('Distribution of User Query Lengths')
    plt.xlabel('Length of User Query (in words)')
    plt.ylabel('Frequency')
    plt.show()

# 2. Visualize Bot Response Length Distribution
def visualize_bot_response_length(dataset):
    """Plot the distribution of bot response lengths, measured in words.

    A 'Bot_Response_Length' column holding the word count of each
    'Bot_Response' is added to ``dataset`` (the caller's frame is modified).
    Missing responses count as 0 words rather than as the single word "nan".

    Parameters:
        dataset: pandas DataFrame with a 'Bot_Response' column.

    Returns:
        None. The histogram is shown with ``plt.show()``.
    """
    # Word count per response; str() keeps non-text cells from crashing, and
    # the isna() guard keeps NaN from being counted as the word "nan"
    dataset['Bot_Response_Length'] = dataset['Bot_Response'].apply(
        lambda x: 0 if pd.isna(x) else len(str(x).split())
    )

    # Plot the distribution of bot response lengths
    plt.figure(figsize=(10, 6))
    sns.histplot(dataset['Bot_Response_Length'], kde=True, color='green', bins=30)
    plt.title('Distribution of Bot Response Lengths')
    plt.xlabel('Length of Bot Response (in words)')
    plt.ylabel('Frequency')
    plt.show()

# 3. Visualize Intent Distribution
def visualize_intent_distribution(dataset):
    """Draw a bar chart of how often each value occurs in the 'Intent' column.

    Parameters:
        dataset: pandas DataFrame with an 'Intent' column.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    counts_by_intent = dataset['Intent'].value_counts()
    labels, heights = counts_by_intent.index, counts_by_intent.values

    plt.figure(figsize=(10, 6))
    sns.barplot(x=labels, y=heights, palette='viridis')
    plt.xlabel('Intent')
    plt.ylabel('Number of Queries')
    plt.title('Intent Distribution in Customer Queries')
    plt.xticks(rotation=45)  # tilt the category labels
    plt.show()

# 4. Visualize Response Time Distribution (if Timestamp is available)
def visualize_response_time(dataset):
    """Plot the distribution of the 'Response_Time' column, if it exists.

    Both columns this function touches are optional:
    - 'Timestamp', when present, is converted in place to datetime, with
      unparseable values coerced to NaT. It is not used for the plot.
    - 'Response_Time', when present, is drawn as a histogram; otherwise a
      notice is printed and nothing is plotted.

    Parameters:
        dataset: pandas DataFrame; may or may not contain the columns above.

    Returns:
        None.
    """
    # Only parse the timestamp when the column is there; indexing it
    # unconditionally would raise KeyError on datasets without it
    if 'Timestamp' in dataset.columns:
        dataset['Timestamp'] = pd.to_datetime(dataset['Timestamp'], errors='coerce')

    # Assumes 'Response_Time' is expressed in seconds, as the axis label
    # states - TODO confirm against the data source
    if 'Response_Time' not in dataset.columns:
        # Report the column name exactly as it is looked up
        print("\n'Response_Time' column not found in the dataset.")
        return

    plt.figure(figsize=(10, 6))
    sns.histplot(dataset['Response_Time'], kde=True, color='orange', bins=30)
    plt.title('Distribution of Bot Response Times')
    plt.xlabel('Response Time (in seconds)')
    plt.ylabel('Frequency')
    plt.show()

# Main Function to Run All Visualizations
def visualize_features(file_path):
    """Load the CSV at ``file_path`` and draw every feature visualization.

    The plots are drawn in this order: user query lengths, bot response
    lengths, intent distribution, response times. Nothing is drawn if
    loading fails.

    Parameters:
        file_path: Path of the CSV file to visualize.

    Returns:
        None.
    """
    dataset = load_dataset(file_path)
    if dataset is not None:
        plotters = (
            visualize_user_query_length,
            visualize_bot_response_length,
            visualize_intent_distribution,
            visualize_response_time,  # only plots if the column is available
        )
        for draw in plotters:
            draw(dataset)

# Example Usage
# NOTE: no plot is drawn when the file cannot be loaded; the recorded run of
# this cell ended with "No such file or directory".
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
visualize_features(file_path)


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


 Identify Target and Features


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# Identify Target and Features
def identify_target_and_features(dataset, target_column='Intent'):
    """Separate ``dataset`` into a feature frame and a target column.

    No encoding of text columns happens here; that is left to later steps.

    Parameters:
        dataset: pandas DataFrame holding features and target together.
        target_column: Name of the column to predict.

    Returns:
        ``(features, target)``: every column except the target, and the target
        column itself. ``(None, None)`` when ``target_column`` is not a column
        of ``dataset`` (an error message is printed in that case).
    """
    if target_column in dataset.columns:
        return dataset.drop(columns=[target_column]), dataset[target_column]

    print(f"Error: Target column '{target_column}' not found in the dataset!")
    return None, None

# Split Dataset into Training and Testing
def split_dataset(features, target):
    """Split features and target into an 80/20 train/test partition.

    The split uses ``test_size=0.2`` and ``random_state=42``, so it is
    repeatable. The size of each partition is printed.

    Parameters:
        features: Feature frame.
        target: Target values aligned with ``features``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    print(f"Training set size: {len(train_features)}")
    print(f"Test set size: {len(test_features)}")

    return train_features, test_features, train_target, test_target

# Main Function
def prepare_data_for_model(file_path, target_column='Intent'):
    """Load a CSV, separate the target column, and split into train/test sets.

    Parameters:
        file_path: Path of the CSV file.
        target_column: Name of the column to predict.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, or four ``None`` values when
        the file cannot be loaded or the target column cannot be identified.
    """
    dataset = load_dataset(file_path)
    if dataset is None:
        return None, None, None, None

    features, target = identify_target_and_features(dataset, target_column)
    if features is None or target is None:
        return None, None, None, None

    train_features, test_features, train_target, test_target = split_dataset(features, target)
    return train_features, test_features, train_target, test_target

# Example Usage
# NOTE: all four results are None when loading or target identification fails;
# the recorded run of this cell ended with "No such file or directory".
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
X_train, X_test, y_train, y_test = prepare_data_for_model(file_path, target_column='Intent')

# After this, you can start building and training a model


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


 Convert Categorical Columns to Numerical

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# 1. Label Encoding (for Ordinal/Target Columns)
def label_encode_column(dataset, column_name):
    """Replace one column of ``dataset`` with LabelEncoder output.

    The column is overwritten on the frame that was passed in, so the
    caller's DataFrame is modified.

    Parameters:
        dataset: pandas DataFrame containing ``column_name``.
        column_name: Name of the column to encode.

    Returns:
        The same DataFrame object, with the column encoded.
    """
    encoded_values = LabelEncoder().fit_transform(dataset[column_name])
    dataset[column_name] = encoded_values

    print(f"Label Encoding applied to {column_name}")
    return dataset

# 2. One-Hot Encoding (for Nominal Columns)
def one_hot_encode_column(dataset, column_name):
    """Replace one column with its one-hot indicator columns.

    The indicator columns are named ``<column_name>_<value>`` and are placed
    after the remaining original columns. The input frame is not modified.

    Parameters:
        dataset: pandas DataFrame containing ``column_name``.
        column_name: Name of the column to expand.

    Returns:
        A new DataFrame without ``column_name`` and with the indicator columns
        appended.
    """
    indicator_columns = pd.get_dummies(dataset[column_name], prefix=column_name)

    # Everything except the source column, followed by the indicators
    remaining_columns = dataset.drop(columns=[column_name])
    dataset = pd.concat([remaining_columns, indicator_columns], axis=1)

    print(f"One-Hot Encoding applied to {column_name}")
    return dataset

# Function to apply encoding to necessary columns
def encode_categorical_columns(dataset):
    """Encode the known categorical columns that are present in ``dataset``.

    'User_Satisfaction' is label-encoded; 'Intent' and 'Bot_Response' are
    one-hot encoded, in that order. Columns that are absent are skipped.

    Parameters:
        dataset: pandas DataFrame to encode.

    Returns:
        The encoded DataFrame.
    """
    # Target column: label encoding
    if 'User_Satisfaction' in dataset.columns:
        dataset = label_encode_column(dataset, 'User_Satisfaction')

    # Nominal columns: one indicator column per distinct value
    for nominal_column in ('Intent', 'Bot_Response'):
        if nominal_column in dataset.columns:
            dataset = one_hot_encode_column(dataset, nominal_column)

    return dataset

# Main function to process the dataset
def process_categorical_columns(file_path):
    """Load a CSV and encode its categorical columns.

    Parameters:
        file_path: Path of the CSV file.

    Returns:
        The encoded DataFrame, or ``None`` when the file cannot be loaded.
    """
    dataset = load_dataset(file_path)
    return None if dataset is None else encode_categorical_columns(dataset)

# Example Usage
# NOTE: the recorded run of this cell ended with "No such file or directory".
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
processed_dataset = process_categorical_columns(file_path)

# View the processed dataset (skipped when loading failed and None came back)
if processed_dataset is not None:
    print(processed_dataset.head())


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


 One-Hot Encoding


In [None]:
import pandas as pd

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# Function for One-Hot Encoding
def one_hot_encode_columns(dataset, columns_to_encode):
    """One-hot encode several columns of ``dataset``.

    Each listed column is replaced by indicator columns named
    ``<column>_<value>``, appended after the remaining columns. Columns that
    are listed but absent from ``dataset`` are reported and skipped instead of
    raising ``KeyError``. The input frame is not modified.

    Parameters:
        dataset: pandas DataFrame to encode.
        columns_to_encode: Iterable of column names to expand.

    Returns:
        The encoded DataFrame.
    """
    for column in columns_to_encode:
        # Tolerate columns the dataset does not have
        if column not in dataset.columns:
            print(f"Column '{column}' not found in the dataset; skipping One-Hot Encoding")
            continue

        one_hot_encoded = pd.get_dummies(dataset[column], prefix=column)
        # Build a new frame instead of dropping in place
        dataset = pd.concat([dataset.drop(columns=[column]), one_hot_encoded], axis=1)
        print(f"One-Hot Encoding applied to {column}")

    return dataset

# Main Function to process the dataset
def process_one_hot_encoding(file_path, columns_to_encode):
    """Load a CSV and one-hot encode the requested columns.

    Parameters:
        file_path: Path of the CSV file.
        columns_to_encode: Names of the columns to expand.

    Returns:
        The encoded DataFrame, or ``None`` when the file cannot be loaded.
    """
    dataset = load_dataset(file_path)
    if dataset is not None:
        return one_hot_encode_columns(dataset, columns_to_encode)
    return None

# Example Usage
# NOTE: the recorded run of this cell ended with "No such file or directory".
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
columns_to_encode = ['Intent', 'User_Satisfaction']  # Example: columns to apply One-Hot Encoding

# Process the dataset with One-Hot Encoding
processed_dataset = process_one_hot_encoding(file_path, columns_to_encode)

# View the processed dataset (skipped when loading failed and None came back)
if processed_dataset is not None:
    print(processed_dataset.head())


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


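Feature Scaling (note: the cell below repeats the one-hot encoding step from the previous section; it does not scale any features)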

In [None]:
import pandas as pd

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# Function for One-Hot Encoding
def one_hot_encode_columns(dataset, columns_to_encode):
    """One-hot encode several columns of ``dataset``.

    Each listed column is replaced by indicator columns named
    ``<column>_<value>``, appended after the remaining columns. Columns that
    are listed but absent from ``dataset`` are reported and skipped instead of
    raising ``KeyError``. The input frame is not modified.

    Parameters:
        dataset: pandas DataFrame to encode.
        columns_to_encode: Iterable of column names to expand.

    Returns:
        The encoded DataFrame.
    """
    for column in columns_to_encode:
        # Tolerate columns the dataset does not have
        if column not in dataset.columns:
            print(f"Column '{column}' not found in the dataset; skipping One-Hot Encoding")
            continue

        one_hot_encoded = pd.get_dummies(dataset[column], prefix=column)
        # Build a new frame instead of dropping in place
        dataset = pd.concat([dataset.drop(columns=[column]), one_hot_encoded], axis=1)
        print(f"One-Hot Encoding applied to {column}")

    return dataset

# Main Function to process the dataset
def process_one_hot_encoding(file_path, columns_to_encode):
    """Load a CSV and one-hot encode the requested columns.

    Parameters:
        file_path: Path of the CSV file.
        columns_to_encode: Names of the columns to expand.

    Returns:
        The encoded DataFrame, or ``None`` when the file cannot be loaded.
    """
    dataset = load_dataset(file_path)
    if dataset is not None:
        return one_hot_encode_columns(dataset, columns_to_encode)
    return None

# Example Usage
# NOTE: this cell only performs one-hot encoding; no scaling step is called.
# The recorded run ended with "No such file or directory".
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
columns_to_encode = ['Intent', 'User_Satisfaction']  # Example: columns to apply One-Hot Encoding

# Process the dataset with One-Hot Encoding
processed_dataset = process_one_hot_encoding(file_path, columns_to_encode)

# View the processed dataset (skipped when loading failed and None came back)
if processed_dataset is not None:
    print(processed_dataset.head())


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


Train-Test Split


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# Function to split the dataset into Train and Test sets
def split_dataset(dataset, target_column, test_size=0.2, random_state=42):
    """Separate the target column and split the data into train and test sets.

    Parameters:
        dataset: pandas DataFrame holding features and target together.
        target_column: Name of the column to predict.
        test_size: Passed to ``train_test_split`` as the test-set size.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    predictors = dataset.drop(columns=[target_column])  # everything but the target
    outcome = dataset[target_column]                    # the column being predicted

    train_predictors, test_predictors, train_outcome, test_outcome = train_test_split(
        predictors, outcome, test_size=test_size, random_state=random_state
    )

    print(f"Dataset split into {len(train_predictors)} training samples and {len(test_predictors)} test samples.")
    return train_predictors, test_predictors, train_outcome, test_outcome

# Main function to process and split the dataset
def process_train_test_split(file_path, target_column, test_size=0.2):
    """Load a CSV and split it into train and test sets.

    Parameters:
        file_path: Path of the CSV file.
        target_column: Name of the column to predict.
        test_size: Test-set size forwarded to ``split_dataset``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, or four ``None`` values when
        the file cannot be loaded.
    """
    dataset = load_dataset(file_path)
    if dataset is None:
        return None, None, None, None

    train_predictors, test_predictors, train_outcome, test_outcome = split_dataset(
        dataset, target_column, test_size
    )
    return train_predictors, test_predictors, train_outcome, test_outcome

# Example Usage
# NOTE: the recorded run of this cell ended with "No such file or directory".
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
target_column = "User_Satisfaction"  # Replace with your actual target column (e.g., "User_Satisfaction")
test_size = 0.2  # 20% of the data will be used for testing

# Process the dataset and split it into Train and Test sets
# (all four results are None when the CSV could not be loaded)
X_train, X_test, y_train, y_test = process_train_test_split(file_path, target_column, test_size)

# Display the first few rows of the train and test sets
# The four results are None together, so checking X_train covers them all
if X_train is not None:
    print("Training Features (X_train):")
    print(X_train.head())
    print("\nTraining Target (y_train):")
    print(y_train.head())

    print("\nTest Features (X_test):")
    print(X_test.head())
    print("\nTest Target (y_test):")
    print(y_test.head())


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


 Model Building


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# Function to preprocess the data (One-Hot Encoding and Feature Scaling)
def preprocess_data(dataset, target_column):
    """Turn a raw dataset into a numeric feature matrix and a target column.

    Steps:
    1. Separate the target column from the features.
    2. One-hot encode the features with ``pd.get_dummies``.
    3. Standardize the float64/int64 feature columns, if there are any.

    NOTE(review): the scaler is fitted on the full feature matrix, i.e.
    before any train/test split done by the caller, so test-set statistics
    influence the scaling. Confirm whether that is acceptable.

    Parameters:
        dataset: pandas DataFrame holding features and target together.
        target_column: Name of the column to predict.

    Returns:
        ``(X, y)``: the encoded, scaled feature frame and the target column.
    """
    # Separate features and target
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]

    # Apply one-hot encoding on categorical columns (e.g., 'Intent', 'Bot_Response')
    X = pd.get_dummies(X)

    # Feature Scaling: Standardize numeric columns
    numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    # Fitting a scaler on an empty column selection fails, so only scale when
    # there is at least one numeric column
    if numeric_columns:
        scaler = StandardScaler()
        X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

    return X, y

# Function to split the dataset into train and test sets
def split_dataset(X, y, test_size=0.2, random_state=42):
    """Split features and target via ``train_test_split``.

    Parameters:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Passed to ``train_test_split`` as the test-set size.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Whatever ``train_test_split`` returns for the two inputs, which the
        caller unpacks as X_train, X_test, y_train, y_test.
    """
    partitions = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return partitions

# Function to build and train the model
def build_and_train_model(X_train, y_train, model_type='logistic_regression'):
    """Create the requested classifier and fit it on the training data.

    Parameters:
        X_train: Training feature matrix.
        y_train: Training target values.
        model_type: ``'logistic_regression'`` (LogisticRegression with
            ``max_iter=1000``) or ``'random_forest'`` (RandomForestClassifier
            with 100 trees and ``random_state=42``).

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``model_type`` is neither of the two supported names.
    """
    # Reject unknown model names before building anything
    if model_type not in ('logistic_regression', 'random_forest'):
        raise ValueError("Unsupported model type. Use 'logistic_regression' or 'random_forest'.")

    if model_type == 'logistic_regression':
        classifier = LogisticRegression(max_iter=1000)
    else:
        classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    classifier.fit(X_train, y_train)
    return classifier

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Print accuracy, classification report and confusion matrix for ``model``.

    Parameters:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        None. All metrics are written to stdout.
    """
    predictions = model.predict(X_test)

    # Overall share of correct predictions
    print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")

    # Per-class metrics
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    # Counts of true versus predicted labels
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))

# Main function to load, process, build, train, and evaluate the model
def model_building_and_evaluation(file_path, target_column, model_type='logistic_regression'):
    """Run the whole pipeline: load, preprocess, split, train, evaluate.

    Parameters:
        file_path: Path of the CSV file.
        target_column: Name of the column to predict.
        model_type: Model name forwarded to ``build_and_train_model``.

    Returns:
        None. Nothing past the load step runs when the file cannot be loaded.
    """
    dataset = load_dataset(file_path)
    if dataset is None:
        return

    # Encode and scale, then hold out a test partition
    features, labels = preprocess_data(dataset, target_column)
    train_X, test_X, train_y, test_y = split_dataset(features, labels)

    # Fit on the training partition, report on the held-out one
    fitted_model = build_and_train_model(train_X, train_y, model_type)
    evaluate_model(fitted_model, test_X, test_y)

# Example Usage
# NOTE: the recorded run of this cell ended with "No such file or directory",
# so no model was trained.
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
target_column = "User_Satisfaction"  # Replace with your actual target column
model_type = 'random_forest'  # Choose either 'logistic_regression' or 'random_forest'

# Build, train, and evaluate the model
model_building_and_evaluation(file_path, target_column, model_type)


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


 Evaluation


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Function to load the dataset
def load_dataset(file_path):
    """Read a CSV file into a DataFrame.

    Parameters:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading fails (the error is
        printed instead of being raised).
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
    except Exception as err:
        print(f"Error loading dataset: {err}")
        frame = None
    return frame

# Function to preprocess the data (One-Hot Encoding and Feature Scaling)
def preprocess_data(dataset, target_column):
    """Turn a raw dataset into a numeric feature matrix and a target column.

    Steps:
    1. Separate the target column from the features.
    2. One-hot encode the features with ``pd.get_dummies``.
    3. Standardize the float64/int64 feature columns, if there are any.

    NOTE(review): the scaler is fitted on the full feature matrix, i.e.
    before any train/test split done by the caller, so test-set statistics
    influence the scaling. Confirm whether that is acceptable.

    Parameters:
        dataset: pandas DataFrame holding features and target together.
        target_column: Name of the column to predict.

    Returns:
        ``(X, y)``: the encoded, scaled feature frame and the target column.
    """
    # Separate features and target
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]

    # Apply one-hot encoding on categorical columns (e.g., 'Intent', 'Bot_Response')
    X = pd.get_dummies(X)

    # Feature Scaling: Standardize numeric columns
    numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    # Fitting a scaler on an empty column selection fails, so only scale when
    # there is at least one numeric column
    if numeric_columns:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

    return X, y

# Function to split the dataset into train and test sets
def split_dataset(X, y, test_size=0.2, random_state=42):
    """Split features and target via scikit-learn's ``train_test_split``.

    Parameters:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Passed to ``train_test_split`` as the test-set size.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Whatever ``train_test_split`` returns for the two inputs, which the
        caller unpacks as X_train, X_test, y_train, y_test.
    """
    from sklearn.model_selection import train_test_split

    partitions = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return partitions

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Report and plot evaluation metrics for a fitted classifier.

    Prints accuracy, the classification report and the confusion matrix, and
    draws the confusion matrix as a heatmap. For a two-class target it also
    prints ROC-AUC and draws the ROC curve, provided the model exposes
    ``predict_proba``.

    Parameters:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        None.
    """
    # Predict the target variable using the test set
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    # Print classification report (precision, recall, f1-score)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Print confusion matrix
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # With no explicit labels, confusion_matrix orders its rows and columns by
    # the sorted labels seen in y_test or y_pred; the ticks must match that.
    class_labels = sorted(set(y_test) | set(y_pred))
    if len(class_labels) == 2:
        # Two classes: keep the satisfaction naming used for the binary target
        tick_labels = ['Dissatisfied', 'Satisfied']
    else:
        # Any other class count: label each row/column with its actual class
        tick_labels = [str(label) for label in class_labels]

    # Plot the confusion matrix on its own figure
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

    # If it's binary classification, plot ROC Curve and calculate AUC
    if len(set(y_test)) == 2:
        # Not every estimator provides class probabilities
        if not hasattr(model, 'predict_proba'):
            print("ROC-AUC skipped: model does not provide predict_proba.")
            return

        # Compute ROC AUC
        y_prob = model.predict_proba(X_test)[:, 1]  # Probabilities of positive class
        auc = roc_auc_score(y_test, y_prob)
        print(f"ROC-AUC: {auc:.4f}")

        # Plot ROC curve
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure(figsize=(6, 6))
        plt.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {auc:.4f})')
        plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc='lower right')
        plt.show()

# Main function to load, process, build, train, and evaluate the model
def model_building_and_evaluation(file_path, target_column, model_type='logistic_regression'):
    """Run the whole pipeline: load, preprocess, split, train, evaluate.

    NOTE: ``build_and_train_model`` is not defined in this cell; it has to
    exist in the kernel already (the Model Building cell defines one).

    Parameters:
        file_path: Path of the CSV file.
        target_column: Name of the column to predict.
        model_type: Model name forwarded to ``build_and_train_model``.

    Returns:
        None. Nothing past the load step runs when the file cannot be loaded.
    """
    dataset = load_dataset(file_path)
    if dataset is None:
        return

    # Encode and scale, then hold out a test partition
    features, labels = preprocess_data(dataset, target_column)
    train_X, test_X, train_y, test_y = split_dataset(features, labels)

    # Fit on the training partition, report on the held-out one
    fitted_model = build_and_train_model(train_X, train_y, model_type)
    evaluate_model(fitted_model, test_X, test_y)

# Example Usage
# NOTE: the recorded run of this cell ended with "No such file or directory",
# so no evaluation output was produced.
file_path = "customer_support_chatbot.csv"  # Replace with your actual file path
target_column = "User_Satisfaction"  # Replace with your actual target column
model_type = 'random_forest'  # Choose either 'logistic_regression' or 'random_forest'

# Build, train, and evaluate the model
model_building_and_evaluation(file_path, target_column, model_type)


Error loading dataset: [Errno 2] No such file or directory: 'customer_support_chatbot.csv'


Make Predictions from New Input

In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# Function to load the trained model
def load_trained_model(model_file_path):
    """Load a persisted model with ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    code; only load files from a trusted source.

    Parameters:
        model_file_path: Path of the saved model file.

    Returns:
        The loaded model, or ``None`` when loading fails (the error is printed
        instead of being raised).
    """
    loaded_model = None
    try:
        loaded_model = joblib.load(model_file_path)
        print("Model loaded successfully!")
    except Exception as err:
        print(f"Error loading model: {err}")
        loaded_model = None
    return loaded_model

# Function to preprocess the new input data (same steps as in training)
def preprocess_new_input(new_data, model, scaler, encoder):
    """Bring new input rows into the feature layout the model expects.

    Steps:
    1. One-hot encode ``new_data`` with ``pd.get_dummies``.
    2. Align the columns with ``model.feature_names_in_``: expected columns
       that are absent are added as 0, unexpected ones are dropped, and the
       order follows the model.
    3. Scale with ``scaler``. If the scaler recorded the column names it was
       fitted on (``feature_names_in_``) and they differ from the model's
       feature list, only those columns are scaled. That matches a scaler
       fitted on the numeric columns alone.

    Parameters:
        new_data: pandas DataFrame of raw input rows. It is not modified.
        model: Fitted estimator exposing ``feature_names_in_``.
        scaler: Fitted scaler exposing ``transform``.
        encoder: Not used by this function; kept for interface compatibility.

    Returns:
        The output of ``scaler.transform`` when the scaler covers every model
        feature (or recorded no column names); otherwise the aligned
        DataFrame with the scaler's columns replaced by their scaled values.
    """
    # Apply one-hot encoding to the categorical columns
    encoded = pd.get_dummies(new_data)

    # Align with the model's expected input format in one step
    expected_columns = list(model.feature_names_in_)
    aligned = encoded.reindex(columns=expected_columns, fill_value=0)

    scaler_columns = getattr(scaler, "feature_names_in_", None)
    if scaler_columns is None or list(scaler_columns) == expected_columns:
        # The scaler covers the full feature set: scale everything
        return scaler.transform(aligned)

    # The scaler was fitted on a subset of the features: scale only those
    # columns and leave the others untouched
    scaler_columns = list(scaler_columns)
    aligned[scaler_columns] = scaler.transform(aligned[scaler_columns])
    return aligned

# Function to make predictions on new input
def make_prediction(new_data, model, scaler, encoder):
    """Preprocess ``new_data``, predict with ``model`` and print the outcome.

    Only the first prediction is reported: a value equal to 1 is printed as
    "Satisfied", anything else as "Dissatisfied".

    Parameters:
        new_data: pandas DataFrame of raw input rows.
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted scaler, forwarded to ``preprocess_new_input``.
        encoder: Forwarded to ``preprocess_new_input``.

    Returns:
        None. The outcome is written to stdout.
    """
    model_input = preprocess_new_input(new_data, model, scaler, encoder)
    first_prediction = model.predict(model_input)[0]

    if first_prediction == 1:
        message = "Predicted Output: Satisfied"
    else:
        message = "Predicted Output: Dissatisfied"
    print(message)

# Main function to load the model and make predictions
def predict_from_new_input(model_file_path, scaler_file_path, encoder_file_path, new_input):
    """
    Load the persisted model, scaler and encoder, then print a prediction.

    Every artifact is loaded with the same report-and-stop policy: if one
    cannot be loaded, the error is printed and the function returns without
    predicting, instead of the scaler/encoder load raising while the model
    load is handled quietly.

    Parameters:
        model_file_path (str): Path passed to ``load_trained_model``.
        scaler_file_path (str): Path of the joblib file holding the scaler.
        encoder_file_path (str): Path of the joblib file holding the encoder.
        new_input: Raw input forwarded to ``make_prediction``.

    Returns:
        None
    """
    model = load_trained_model(model_file_path)
    if model is None:
        return

    loaded = {}
    for label, path in (("scaler", scaler_file_path), ("encoder", encoder_file_path)):
        try:
            loaded[label] = joblib.load(path)
        except Exception as e:
            # Mirror load_trained_model: report the failure and stop.
            print(f"Error loading {label}: {e}")
            return

    make_prediction(new_input, model, loaded["scaler"], loaded["encoder"])

# Example usage: a single-row DataFrame describing one new interaction.
# preprocess_new_input one-hot encodes the string columns ('Intent',
# 'User_Query') with pd.get_dummies and then keeps only the columns listed in
# the model's feature_names_in_, so any indicator column the model does not
# list is dropped during alignment.
new_input_data = pd.DataFrame({
    'Message_Length': [150],       # New query's message length
    'Response_Time': [30],         # Response time
    'Intent': ['Query'],           # User intent (ensure this matches with how the model was trained)
    'User_Query': ['How can I track my order?']  # New user query (or any other feature used in the model)
})

# File paths to the saved model, scaler, and encoder
# NOTE(review): no cell visible in this part of the notebook writes these three
# files; the recorded run of this cell stopped at "Error loading model".
model_file = 'chatbot_model.joblib'  # Replace with your model file path
scaler_file = 'scaler.joblib'        # Replace with your scaler file path
encoder_file = 'encoder.joblib'      # Replace with your encoder file path

# Make predictions from new input
predict_from_new_input(model_file, scaler_file, encoder_file, new_input_data)


Error loading model: [Errno 2] No such file or directory: 'chatbot_model.joblib'


Convert to DataFrame and Encode


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Function to convert raw data to DataFrame
def convert_to_dataframe(raw_data):
    """
    Build a pandas DataFrame from raw records.

    Parameters:
        raw_data (list or dict): A list (e.g. of row dictionaries) or a
            dictionary (e.g. of column lists).

    Returns:
        pd.DataFrame: ``pd.DataFrame(raw_data)`` for a list,
        ``pd.DataFrame.from_dict(raw_data)`` for a dict.

    Raises:
        ValueError: If ``raw_data`` is neither a list nor a dict.
    """
    # Reject unsupported containers up front, then dispatch on the two accepted types.
    if not isinstance(raw_data, (list, dict)):
        raise ValueError("Raw data should be either a list of dictionaries or a dictionary of lists.")
    if isinstance(raw_data, dict):
        return pd.DataFrame.from_dict(raw_data)
    return pd.DataFrame(raw_data)

# Function to encode categorical features
def encode_data(df, categorical_columns):
    """
    Label-encode the requested text columns of a DataFrame.

    Each listed column that holds text (object dtype or a pandas string dtype)
    is replaced by the integer codes produced by ``LabelEncoder``. Listed
    columns of any other dtype are left as they are. No one-hot encoding is
    performed.

    Parameters:
        df (pd.DataFrame): DataFrame containing the data; it is not modified.
        categorical_columns (list): Names of the columns to encode.

    Returns:
        pd.DataFrame: A copy of ``df`` with the text columns label-encoded.

    Raises:
        KeyError: If a name in ``categorical_columns`` is not a column of ``df``.
    """
    encoded_df = df.copy()

    for column in categorical_columns:
        values = encoded_df[column]
        # Comparing the dtype with 'object' alone skips columns stored with a
        # pandas string dtype, which would silently stay unencoded.
        is_text = pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)
        if is_text:
            # A fresh encoder per column keeps each column's classes separate
            # instead of refitting one shared instance.
            encoded_df[column] = LabelEncoder().fit_transform(values)

    return encoded_df

# Sample raw data (e.g., user queries, intents, satisfaction levels):
# one dictionary per interaction, all with the same four keys.
raw_data = [
    {"User_Query": "How can I reset my password?", "Intent": "Password Reset", "Response_Time": 10, "User_Satisfaction": "Satisfied"},
    {"User_Query": "Where is my order?", "Intent": "Order Inquiry", "Response_Time": 5, "User_Satisfaction": "Dissatisfied"},
    {"User_Query": "How to change my email address?", "Intent": "Account Update", "Response_Time": 7, "User_Satisfaction": "Satisfied"},
    {"User_Query": "I need help with shipping", "Intent": "Shipping Inquiry", "Response_Time": 12, "User_Satisfaction": "Dissatisfied"},
]

# Convert raw data to DataFrame
df = convert_to_dataframe(raw_data)

# Print the original DataFrame
print("Original DataFrame:")
print(df)

# List of categorical columns to encode (e.g., 'Intent', 'User_Satisfaction').
# 'User_Query' is not listed, so it stays as free text in the encoded frame.
categorical_columns = ['Intent', 'User_Satisfaction']

# Encode the categorical data. encode_data works on a copy, so `df` keeps its
# text labels. In the recorded output the integer codes follow the alphabetical
# order of the labels (e.g. 'Dissatisfied' -> 0, 'Satisfied' -> 1).
encoded_df = encode_data(df, categorical_columns)

# Print the encoded DataFrame
print("\nEncoded DataFrame:")
print(encoded_df)


Original DataFrame:
                        User_Query            Intent  Response_Time  \
0     How can I reset my password?    Password Reset             10   
1               Where is my order?     Order Inquiry              5   
2  How to change my email address?    Account Update              7   
3        I need help with shipping  Shipping Inquiry             12   

  User_Satisfaction  
0         Satisfied  
1      Dissatisfied  
2         Satisfied  
3      Dissatisfied  

Encoded DataFrame:
                        User_Query  Intent  Response_Time  User_Satisfaction
0     How can I reset my password?       2             10                  1
1               Where is my order?       1              5                  0
2  How to change my email address?       0              7                  1
3        I need help with shipping       3             12                  0


 Predict the Final Grade

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import joblib

# Function to create raw data in a DataFrame
def create_dataframe():
    """
    Build the five-row in-memory demo dataset (no CSV required).

    Returns:
        pd.DataFrame: Columns 'Response_Time', 'Message_Length',
        'User_Satisfaction' (text labels) and 'Final_Grade' (the target).
    """
    column_names = ['Response_Time', 'Message_Length', 'User_Satisfaction', 'Final_Grade']
    # One tuple per interaction, in column_names order.
    records = [
        (10, 150, 'Satisfied', 90),
        (5, 200, 'Dissatisfied', 60),
        (7, 180, 'Satisfied', 85),
        (12, 120, 'Dissatisfied', 50),
        (8, 160, 'Satisfied', 80),
    ]
    return pd.DataFrame(records, columns=column_names)

# Function to preprocess data (encode categorical columns)
def preprocess_data(df):
    """
    Label-encode the 'User_Satisfaction' column.

    The encoding is applied to a copy, so the caller's frame keeps its text
    labels and running this step again on the same input gives the same
    result.

    Parameters:
        df (pd.DataFrame): Data with a 'User_Satisfaction' column.

    Returns:
        tuple: (encoded copy of ``df``, the fitted ``LabelEncoder``). The
        encoder is returned so new inputs can be transformed the same way.
    """
    label_encoder = LabelEncoder()

    encoded = df.copy()
    encoded['User_Satisfaction'] = label_encoder.fit_transform(encoded['User_Satisfaction'])

    return encoded, label_encoder

# Function to train the model
def train_model(df):
    """
    Fit a linear regression for 'Final_Grade' and print its hold-out error.

    Parameters:
        df (pd.DataFrame): Data with 'Response_Time', 'Message_Length',
            'User_Satisfaction' and 'Final_Grade' columns.

    Returns:
        LinearRegression: The model fitted on the training split.
    """
    feature_names = ['Response_Time', 'Message_Length', 'User_Satisfaction']

    # 80/20 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_names],
        df['Final_Grade'],
        test_size=0.2,
        random_state=42,
    )

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # Mean squared error on the held-out rows.
    mse = mean_squared_error(y_test, regressor.predict(X_test))
    print(f'Mean Squared Error: {mse}')

    return regressor

# Function to predict the final grade for new data (unseen data)
def predict_final_grade(model, label_encoder, new_data):
    """
    Predict and print the final grade for the first row of ``new_data``.

    The satisfaction label is encoded on a copy, so ``new_data`` keeps its
    text labels and can be passed in again.

    Parameters:
        model: Estimator whose ``predict`` accepts the three feature columns.
        label_encoder: Object whose ``transform`` maps the
            'User_Satisfaction' labels to the values the model expects.
        new_data (pd.DataFrame): Rows with 'Response_Time', 'Message_Length'
            and 'User_Satisfaction' columns; not modified.

    Returns:
        The first element of the model's prediction (also printed).
    """
    features = new_data.copy()
    features['User_Satisfaction'] = label_encoder.transform(new_data['User_Satisfaction'])

    prediction = model.predict(features[['Response_Time', 'Message_Length', 'User_Satisfaction']])
    print(f'Predicted Final Grade: {prediction[0]}')  # Print the predicted grade
    return prediction[0]

# Main function to execute the program
def main():
    """
    Run the demo end to end: build data, train, predict once, save the model.

    Returns:
        None. The trained model is written to
        'chatbot_final_grade_predictor.joblib' in the working directory.
    """
    # Build the demo data and encode it; the fitted encoder is kept for the new input.
    training_frame, label_encoder = preprocess_data(create_dataframe())

    # Fit the regressor (train_model prints the hold-out error).
    model = train_model(training_frame)

    # A single unseen interaction to score.
    new_input_data = pd.DataFrame({
        'Response_Time': [9],
        'Message_Length': [160],
        'User_Satisfaction': ['Satisfied'],
    })
    predict_final_grade(model, label_encoder, new_input_data)

    # Persist the trained model for later use.
    joblib.dump(model, 'chatbot_final_grade_predictor.joblib')

# Run the main function
if __name__ == '__main__':
    main()


Mean Squared Error: 224.99999999999915
Predicted Final Grade: 88.33333333333334


Deployment: Building an Interactive App


In [None]:
import streamlit as st
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

# Step 1: Create a sample dataset using Pandas
def create_sample_data():
    """
    Build the ten-row sample dataset used by the app.

    Returns:
        pd.DataFrame: Columns 'Response_Time', 'Message_Length',
        'User_Satisfaction' (text labels) and 'Final_Grade'.
    """
    column_names = ['Response_Time', 'Message_Length', 'User_Satisfaction', 'Final_Grade']
    # One tuple per interaction, in column_names order.
    records = [
        (10, 150, 'Satisfied', 90),
        (5, 200, 'Dissatisfied', 60),
        (7, 180, 'Satisfied', 85),
        (12, 120, 'Dissatisfied', 50),
        (8, 160, 'Satisfied', 80),
        (6, 175, 'Dissatisfied', 65),
        (9, 190, 'Satisfied', 88),
        (11, 140, 'Satisfied', 92),
        (14, 130, 'Dissatisfied', 55),
        (6, 160, 'Satisfied', 77),
    ]
    return pd.DataFrame(records, columns=column_names)

# Step 2: Preprocess the data
def preprocess_data(df):
    """
    Label-encode the 'User_Satisfaction' column.

    The encoding is applied to a copy, so the caller's frame keeps its text
    labels and the step can be repeated on the same input.

    Parameters:
        df (pd.DataFrame): Data with a 'User_Satisfaction' column.

    Returns:
        tuple: (encoded copy of ``df``, the fitted ``LabelEncoder``).
    """
    label_encoder = LabelEncoder()
    encoded = df.copy()
    encoded['User_Satisfaction'] = label_encoder.fit_transform(encoded['User_Satisfaction'])
    return encoded, label_encoder

# Step 3: Train the model (Linear Regression)
def train_model(df):
    """
    Fit a linear regression for 'Final_Grade' and print its hold-out error.

    Parameters:
        df (pd.DataFrame): Data with 'Response_Time', 'Message_Length',
            'User_Satisfaction' and 'Final_Grade' columns.

    Returns:
        LinearRegression: The model fitted on the training split.
    """
    feature_names = ['Response_Time', 'Message_Length', 'User_Satisfaction']
    features = df[feature_names]
    target = df['Final_Grade']

    # 80/20 split with a fixed seed.
    split = train_test_split(features, target, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    fitted = LinearRegression()
    fitted.fit(X_train, y_train)

    # Score the held-out rows.
    predictions = fitted.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f'Mean Squared Error: {mse}')

    return fitted

# Step 4: Predict the final grade based on user input
def predict_final_grade(model, label_encoder, response_time, message_length, user_satisfaction):
    """
    Predict the final grade for one interaction described by scalar inputs.

    Parameters:
        model: Estimator whose ``predict`` accepts the three feature columns.
        label_encoder: Object whose ``transform`` encodes the satisfaction label.
        response_time: Value for the 'Response_Time' feature.
        message_length: Value for the 'Message_Length' feature.
        user_satisfaction: Label for the 'User_Satisfaction' feature.

    Returns:
        The first element of the model's prediction.
    """
    feature_order = ['Response_Time', 'Message_Length', 'User_Satisfaction']

    # Single-row frame holding the raw inputs.
    row = pd.DataFrame({
        'Response_Time': [response_time],
        'Message_Length': [message_length],
        'User_Satisfaction': [user_satisfaction],
    })

    # Swap the text label for the encoder's value before predicting.
    row['User_Satisfaction'] = label_encoder.transform(row['User_Satisfaction'])

    return model.predict(row[feature_order])[0]

# Step 5: Build Streamlit interface
def run_app():
    """
    Render the Streamlit page: input widgets plus an on-demand prediction.

    Streamlit re-executes the script on every widget interaction, so the data
    preparation and model fit are wrapped in ``st.cache_resource`` and run once
    instead of on every rerun.

    Returns:
        None
    """
    st.title("Customer Support Chatbot - Predict Final Grade")

    # Steps 1-2: load, preprocess and train (cached across reruns).
    model, label_encoder = st.cache_resource(_train_artifacts)()

    # Step 3: User input fields
    st.header("Enter Customer Interaction Details")

    response_time = st.number_input("Response Time (in minutes)", min_value=0, value=5)
    message_length = st.number_input("Message Length (in characters)", min_value=0, value=150)
    user_satisfaction = st.selectbox("User Satisfaction", ['Satisfied', 'Dissatisfied'])

    # Step 4: When the "Predict Final Grade" button is clicked, predict the grade
    if st.button('Predict Final Grade'):
        # Step 5: Predict the grade based on user input
        predicted_grade = predict_final_grade(model, label_encoder, response_time, message_length, user_satisfaction)
        st.subheader(f"Predicted Final Grade: {predicted_grade:.2f}")


def _train_artifacts():
    """Build the sample data, encode it and fit the model; returns (model, label_encoder)."""
    df, label_encoder = preprocess_data(create_sample_data())
    return train_model(df), label_encoder

# Run the Streamlit app
if __name__ == '__main__':
    run_app()


2025-05-07 09:15:21.763 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-05-07 09:15:21.787 Session state does not function when running a script without `streamlit run`


Mean Squared Error: 25.36353745059855


In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.45.0-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m87.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInst

Create a Prediction Function


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
import pickle

# Example DataFrame (this would be your actual data)
data = {
    'query': ['How can I reset my password?', 'I need a refund', 'Where is my order?', 'Help with billing issue'],
    'category': ['Account', 'Refund', 'Order', 'Billing'],
    'satisfaction': ['Positive', 'Negative', 'Neutral', 'Negative']
}

df = pd.DataFrame(data)

# Step 1: Preprocessing (vectorize text)
X = df['query']
y = df['category']  # Or 'satisfaction' depending on the task

# Train-test split.
# NOTE: each category appears exactly once in this toy data, so the held-out
# query's category is never seen during training; the accuracy recorded below
# (0.0) reflects the sample size, not the pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use TF-IDF for text feature extraction. The forest is seeded (like the split)
# so that re-running the cell reproduces the same model and predictions.
model = make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=42))

# Step 2: Train the model
model.fit(X_train, y_train)

# Step 3: Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Save the model for deployment
with open('chatbot_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Prediction function
def predict_category(query, model_path='chatbot_model.pkl'):
    """
    Predict the category of a single query with a pickled model.

    Parameters:
        query (str): The text to classify.
        model_path (str): Pickle file holding the model. Defaults to
            'chatbot_model.pkl'.

    Returns:
        The first element of ``model.predict([query])``.

    Raises:
        OSError: If ``model_path`` cannot be opened.
    """
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only point model_path at files you created or otherwise trust.
    with open(model_path, 'rb') as f:
        loaded_model = pickle.load(f)
    return loaded_model.predict([query])[0]

# Example usage: classify one free-text query. predict_category reloads the
# pickled pipeline from 'chatbot_model.pkl' on each call, then the predicted
# label is printed.
query = 'I want to track my order'
predicted_category = predict_category(query)
print(f"Predicted Category: {predicted_category}")


Accuracy: 0.0
Predicted Category: Order


Create the Gradio Interface

In [None]:
from sklearn.preprocessing import LabelEncoder

# Let's assume we have a model that predicts sentiment as well
sentiment_encoder = LabelEncoder()
sentiment_encoder.fit(['Positive', 'Negative', 'Neutral'])

def predict(query):
    """
    Return the predicted (category, sentiment) pair for a query.

    Relies on the module-level names ``model``, ``model_sentiment`` and
    ``sentiment_encoder``.
    NOTE(review): no cell visible in this part of the notebook defines
    ``model_sentiment``; it must exist before a non-blank query is submitted.

    Parameters:
        query (str): Text typed by the user.

    Returns:
        tuple: (category, sentiment). A blank or missing query yields
        ("", "") without calling either model, so an empty text box is not
        classified.
    """
    if not query or not query.strip():
        return "", ""
    category = model.predict([query])[0]
    # The sentiment model's output is mapped back to its text label.
    sentiment = sentiment_encoder.inverse_transform(model_sentiment.predict([query]))[0]
    return category, sentiment

# Modify the Gradio interface to return two outputs.
# predict returns a (category, sentiment) tuple; the two output textboxes
# receive those values in that order.
# NOTE(review): per Gradio's Interface documentation, live=True re-runs `fn`
# whenever the input changes rather than waiting for a submit click - confirm
# this is intended, since predict is then called on partially typed queries.
interface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Enter Your Query", placeholder="Ask anything..."),
    outputs=[gr.Textbox(label="Predicted Category"), gr.Textbox(label="Predicted Sentiment")],
    live=True,
    title="Customer Support Chatbot",
    description="This is an intelligent chatbot that predicts the category and sentiment of your query."
)

# Start the app.
interface.launch()


NameError: name 'gr' is not defined

In [None]:
!pip install gradio
import gradio as gr
from sklearn.preprocessing import LabelEncoder

# Assuming model and model_sentiment are defined and trained before this code
# (Consider saving and loading with pickle for persistence)
# Example:
# import pickle
# with open('chatbot_model.pkl', 'rb') as f:
#     model = pickle.load(f)
# with open('sentiment_model.pkl', 'rb') as f:
#     model_sentiment = pickle.load(f)

# Let's assume we have a model that predicts sentiment as well
sentiment_encoder = LabelEncoder()
sentiment_encoder.fit(['Positive', 'Negative', 'Neutral'])

def predict(query):
    """
    Return the predicted (category, sentiment) pair for a query.

    Relies on the module-level names ``model``, ``model_sentiment`` and
    ``sentiment_encoder``, which must be defined before a non-blank query is
    submitted.

    Parameters:
        query (str): Text typed by the user.

    Returns:
        tuple: (category, sentiment). A blank or missing query yields
        ("", "") without calling either model, so an empty text box is not
        classified.
    """
    if not query or not query.strip():
        return "", ""
    category = model.predict([query])[0]
    # 'model_sentiment' is the sentiment analysis model; its output is mapped
    # back to a text label by the encoder.
    sentiment = sentiment_encoder.inverse_transform(model_sentiment.predict([query]))[0]
    return category, sentiment

# Modify the Gradio interface to return two outputs.
# predict returns a (category, sentiment) tuple; the two output textboxes
# receive those values in that order.
# NOTE(review): per Gradio's Interface documentation, live=True re-runs `fn`
# whenever the input changes rather than waiting for a submit click - confirm
# this is intended, since predict is then called on partially typed queries.
interface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Enter Your Query", placeholder="Ask anything..."),
    outputs=[gr.Textbox(label="Predicted Category"), gr.Textbox(label="Predicted Sentiment")],
    live=True,
    title="Customer Support Chatbot",
    description="This is an intelligent chatbot that predicts the category and sentiment of your query."
)

# Start the app.
interface.launch()

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

