In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the raw dataset from CSV into a DataFrame.
# The path has no directory component, so it is looked up in the current
# working directory.
DATASET_CSV = 'part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'
df = pd.read_csv(DATASET_CSV)

In [None]:
# Prepare model inputs: build the feature matrix and the integer-encoded target,
# create a stratified train/test split, and standardize the features.
#
# The encoded labels live in their own Series instead of being written back into
# `df`. That keeps this cell idempotent: re-running it always encodes the
# original labels, so `label_encoder.classes_` keeps the original class names
# that later cells use for the label mapping and plot/report labels.

# Separate features (X) and target variable (y)
X = df.drop(columns=["label"])  # every column except the target
y = df["label"]  # original, un-encoded target column

# Encode the target variable into numeric labels
print("Unique values before encoding:", y.unique())  # Display unique values in the target column
label_encoder = LabelEncoder()
y_encoded = pd.Series(
    label_encoder.fit_transform(y),
    index=y.index,  # keep row alignment with X
    name="label",
)

# Split the data into training and testing sets:
#   - test_size=0.2 reserves 20% of the rows for testing, 80% for training
#   - stratify=y_encoded keeps the class distribution similar in both splits
#   - random_state=42 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test-set statistics are used
# during fitting.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Show which integer code the encoder assigned to each original label.
encoded_values = label_encoder.transform(label_encoder.classes_)
label_mapping = {
    original: encoded
    for original, encoded in zip(label_encoder.classes_, encoded_values)
}
print("Label Mapping:", label_mapping)

In [None]:
# Fit a Random Forest classifier on the standardized training features.
# random_state is fixed for reproducible results; fit() returns the fitted
# estimator, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Score the model on the held-out test split.
y_pred = rf_model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Per-class evaluation: a text classification report (precision, recall,
# F1-score, support) followed by a confusion-matrix heatmap.
#
# The full set of encoded class ids is passed explicitly via `labels=` so the
# report rows and the matrix rows/columns always line up with the class names,
# even if some class does not occur in y_test or y_pred. Without it,
# scikit-learn infers the label set from the data and the number of rows can
# differ from the number of names.
class_names = [str(name) for name in label_encoder.classes_]  # report formatting needs strings
class_ids = list(range(len(class_names)))  # LabelEncoder codes are 0..n_classes-1

print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, labels=class_ids, target_names=class_names),
)

# Confusion matrix: rows are actual classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

# Large square figure so the per-cell annotations stay readable.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    conf_matrix,
    annot=True,  # write the count inside each cell
    fmt='d',  # counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()