In [None]:
import pandas as pd
import numpy as np

# Load the dataset and keep only the two columns used by this script.
df = pd.read_csv('text.csv')
df = df[['text', 'label']]

# Inspect missing values
print("Missing values in 'text' column:", df['text'].isnull().sum())
print("Missing values in 'label' column:", df['label'].isnull().sum())

# Drop rows where 'text' is null. The explicit copy makes `df` an independent
# frame, so the label assignment below cannot write into a slice of the
# original data.
df = df.dropna(subset=['text']).copy()

# Replace every missing or out-of-range label with a random class id in [0, 5].
# A single mask covers both cases because NaN is never a member of the valid
# set. The generator is seeded so repeated runs impute the same labels.
# NOTE(review): random labels add label noise to whatever is trained and
# evaluated on this frame; dropping these rows may be preferable - confirm.
valid_labels = list(range(6))
rng = np.random.default_rng(42)
needs_fill = ~df['label'].isin(valid_labels).to_numpy()
# Work on an object array so the replacement is independent of the column's
# original dtype (float because of NaN, object because of stray strings, ...).
labels = df['label'].to_numpy(dtype=object, copy=True)
labels[needs_fill] = rng.integers(0, 6, size=int(needs_fill.sum()))
# Every remaining value is now one of 0..5, so the cast to int is safe; it also
# turns float class ids such as 3.0 into 3.
df['label'] = labels.astype(int)

# Re-inspect missing values after preprocessing
print("After preprocessing:")
print("Missing values in 'text' column:", df['text'].isnull().sum())
print("Missing values in 'label' column:", df['label'].isnull().sum())
print("Outlier values in 'label' column:", int((~df['label'].isin(valid_labels)).sum()))

# Text preprocessing and vectorization: TF-IDF features with English stop
# words removed.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['label']

# Split the dataset into training (80%) and testing (20%) sets; the fixed
# random_state keeps the split the same between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Decision Tree classifier
dt_classifier = DecisionTreeClassifier(criterion='gini', random_state=42)
dt_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model. Precision/recall/F1 are averaged over classes, weighted
# by each class's support in y_test.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("MCC:", matthews_corrcoef(y_test, y_pred))

# Calculate Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
# NOTE(review): these treat the class ids as numeric distances, but the ids
# stand for emotion names (see the report below) - confirm they are wanted.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("MSE:", mse)
print("RMSE:", rmse)

# Class ids 0..5 in the order of their display names. Passing `labels`
# explicitly keeps the report at six rows even when a class is absent from
# y_test/y_pred; without it, a missing class makes the six target_names
# mismatch the detected classes and classification_report raises.
emotion_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
label_ids = list(range(len(emotion_names)))
print("Classification Report:\n",
      classification_report(y_test, y_pred, labels=label_ids, target_names=emotion_names))

# Class ids 0..5 in the order of their display names.
emotion_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
label_ids = list(range(len(emotion_names)))

# Compute the confusion matrix. Fixing `labels` guarantees a 6x6 matrix whose
# row/column i always corresponds to emotion_names[i], even if a class never
# appears in y_test or y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=label_ids)

# Plot the confusion matrix using matplotlib
fig, ax = plt.subplots(figsize=(10, 7))
cax = ax.matshow(conf_matrix, cmap='Blues')
fig.colorbar(cax)

# Set axis labels
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')

# Pin one tick per class before naming them. Prepending '' to the label list
# only lines up when the automatic locator happens to put its first tick at
# -1, so the positions are set explicitly instead.
ax.set_xticks(label_ids)
ax.set_yticks(label_ids)
# Rotate the x tick labels and set their alignment.
ax.set_xticklabels(emotion_names, rotation=45, ha='right')
ax.set_yticklabels(emotion_names)

# Annotate every cell with its count (x is the column, y is the row).
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(col, row, count, ha='center', va='center', color='black')

plt.title('Confusion Matrix')
plt.show()


Missing values in 'text' column: 0
Missing values in 'label' column: 0
After preprocessing:
Missing values in 'text' column: 0
Missing values in 'label' column: 0
Outlier values in 'label' column: 0


