# Machine Learning Model Analysis and Prediction
This notebook loads a dataset, trains a machine learning model, evaluates its performance, and makes predictions on a test set. 
Results are visualized to provide insights into model predictions and performance.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

## Data Loading

In [None]:
# Read the training and testing CSV files into DataFrames (training file first)
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in ("training_data.csv", "testing_data.csv")
)

# Preview the top rows of the training data as the cell's rich output
train_set.head()

## Data Exploration

In [None]:
# Overview of training data: structure first, then summary statistics
print("Training Data Info:")
train_set.info()
print("\nTraining Data Description:")
# describe() only returns a DataFrame; as a bare expression in the middle of a
# cell its result is discarded, so print it to show it under the heading above.
print(train_set.describe())

# Check class distribution of the 'Label' column
fig, ax = plt.subplots()
sns.countplot(x='Label', data=train_set, ax=ax)
ax.set_title("Label Distribution in Training Data")
plt.show()

## Data Preprocessing

In [None]:
# Separate features and target
X_train = train_set.drop(columns='Label')
y_train = train_set['Label']

# Prepare test set. When the test file is labeled, keep the labels aside in
# y_test and remove them from the features so X_test has no 'Label' column;
# when it is unlabeled, y_test is None and X_test holds every test column.
y_test = test_set['Label'] if 'Label' in test_set.columns else None
# errors='ignore' makes the drop a no-op for an unlabeled test set
X_test = test_set.drop(columns='Label', errors='ignore')

## Model Training and Evaluation

In [None]:
# Configure the random forest and fit it on the complete training set
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the same estimator configuration with 5-fold cross-validation
cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Average CV accuracy:", cv_scores.mean())

In [None]:
# Evaluate on the test set when it carries ground-truth labels; an unlabeled
# test set skips this step and the cell only saves the model.
if 'Label' in test_set.columns:
    test_labels = test_set['Label']
    # Select exactly the training feature columns so the label column is
    # never passed to the model as a feature.
    test_predictions = model.predict(test_set[X_train.columns])
    print("Test accuracy:", accuracy_score(test_labels, test_predictions))
    print("Confusion matrix:\n", confusion_matrix(test_labels, test_predictions))
    print("Classification report:\n", classification_report(test_labels, test_predictions))

# Save model
joblib.dump(model, "trained_model.joblib")

## Prediction and Visualization

In [None]:
# Restore the persisted model from disk and predict on the test features
model = joblib.load("trained_model.joblib")
predictions = model.predict(X_test)

# Bar chart of how many test rows received each predicted class
fig, ax = plt.subplots()
sns.countplot(x=predictions, ax=ax)
ax.set_title("Prediction Distribution on Test Set")
plt.show()

## Summary of Results
This notebook demonstrates data loading, exploration, training, and prediction. 
The Random Forest model's accuracy was estimated with 5-fold cross-validation on the training data, 
and predictions were visualized to show their distribution across classes.