In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Columns the workbook must provide: one text column plus three label columns.
TEXT_COLUMN = "Sentences"
LABEL_COLUMNS = ["Black", "Red", "Grey"]

# Read the excel file into a pandas dataframe
df = pd.read_excel("/content/merged (1).xlsx")

# Spreadsheet headers frequently carry leading/trailing whitespace, which makes
# an exact-name lookup fail with a bare KeyError. Normalize string headers first.
df = df.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)

# Fail early with an actionable message instead of an opaque KeyError.
missing_columns = [
    name for name in [TEXT_COLUMN, *LABEL_COLUMNS] if name not in df.columns
]
if missing_columns:
    raise KeyError(
        f"Missing expected column(s) {missing_columns}; "
        f"columns found in the workbook: {list(df.columns)}"
    )

# Keep only rows that have a sentence and at least one label value:
# idxmax cannot pick a label from an all-empty row, and the vectorizer
# cannot tokenize a missing (NaN) sentence.
usable_rows = df[TEXT_COLUMN].notna() & df[LABEL_COLUMNS].notna().any(axis=1)

# Split the dataframe into features (X) and labels (y)
# Sentences are coerced to str so that numeric-looking cells do not break
# text vectorization later on.
X = df.loc[usable_rows, TEXT_COLUMN].astype(str)
y = df.loc[usable_rows, LABEL_COLUMNS]

# Convert the labels into a single column with numerical values
# 0 for Black, 1 for Red, 2 for Grey
# idxmax picks the column holding the largest value in each row
# (the first such column on ties).
y = y.idxmax(axis=1).map({"Black": 0, "Red": 1, "Grey": 2})

# Split the data into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the sentences using CountVectorizer.
# The default word analyzer only breaks text on non-word characters, so
# unsegmented Chinese text (such as the sentence classified later in this
# script) collapses into a handful of very long, almost-unique tokens and
# unseen sentences share next to no vocabulary with the training data.
# Character uni-grams and bi-grams need no word segmentation and give the
# classifier features that actually recur across sentences.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_train)
# Only transform (never fit) the test split so it is encoded with the
# vocabulary learned from the training split.
X_test = vectorizer.transform(X_test)

# Create and fit a Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Print the accuracy score and the confusion matrix.
# labels=[0, 1, 2] (0 = Black, 1 = Red, 2 = Grey) pins the matrix to a fixed
# 3x3 layout; without it, a class absent from the test split silently
# shrinks the matrix and shifts the meaning of its rows and columns.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred, labels=[0, 1, 2]))

# Predict the probabilities for a new sentence
new_sentence = "第五条　全国人民代表大会和归侨人数较多地区的地方人民代表大会应当有适当名额的归侨代表。"
new_vector = vectorizer.transform([new_sentence])

# predict_proba orders its columns by clf.classes_, i.e. only the classes the
# model actually saw during fitting. Re-align them to the fixed encoding
# 0 = Black, 1 = Red, 2 = Grey so the printed label is always truthful; a
# class the model never saw is reported with probability 0.0.
raw_probs = clf.predict_proba(new_vector)
column_of_class = {int(cls): col for col, cls in enumerate(clf.classes_)}
probs = np.array(
    [
        [
            raw_probs[0, column_of_class[class_id]] if class_id in column_of_class else 0.0
            for class_id in (0, 1, 2)
        ]
    ]
)
print("Probabilities for Black, Red and Grey:", probs)

KeyError: ignored