In [9]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt


In [None]:
# Read the CSV file into a DataFrame; the path is named so it is easy to spot and change
DATASET_PATH = 'dataset.csv'
data = pd.read_csv(DATASET_PATH)

# Preview the top rows as the cell output to sanity-check the load
data.head()


In [None]:
# Overview of the dataset: column dtypes / non-null counts, missing values per
# column, and summary statistics of the numeric columns.
# Only the LAST expression of a notebook cell is rendered automatically, so the
# intermediate summaries are printed explicitly; as bare expressions in the
# middle of the cell their results would be silently discarded.
data.info()

print("Missing values per column:")
print(data.isnull().sum())

print("Summary statistics:")
print(data.describe())

# Visualize the distribution of the target variable: one bar per distinct
# 'Survival_Rate' value, ordered by value (missing values are not counted).
# Drawn with pandas/matplotlib only, because seaborn (`sns`) is not among the
# imports in the notebook's import cell.
fig, ax = plt.subplots(figsize=(6, 4))
data['Survival_Rate'].value_counts().sort_index().plot(kind='bar', ax=ax)
ax.set_xlabel("Survival_Rate")
ax.set_ylabel("count")
ax.set_title("Survival Rate Distribution")
plt.show()


In [None]:
# Encode categorical columns as integer codes using LabelEncoder.
#
# Every column gets its OWN fitted encoder, stored in `label_encoders` under the
# column name. Refitting one shared encoder for each column would keep only the
# last column's classes, making it impossible to `inverse_transform` the other
# columns or to encode new records consistently later on.
#
# Mapping: column name -> whether the column is cast to `str` before encoding
# (adjust according to your dataset).
# NOTE(review): the cast is presumably there because those columns hold mixed
# types or missing values (a missing value becomes the category 'nan') — confirm.
categorical_columns = {
    'Country': False,
    'Gender': False,
    'Tobacco_Use': True,
    'Alcohol_Use': True,
    'Socioeconomic_Status': False,
    'Diagnosis_Stage': False,
    'Treatment_Type': False,
    'HPV_Related': True,
}

label_encoders = {}
for column, cast_to_str in categorical_columns.items():
    values = data[column].astype(str) if cast_to_str else data[column]
    # `label_encoder` remains bound to the most recently fitted encoder after the loop
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(values)
    label_encoders[column] = label_encoder

# Check for missing values again after encoding
data.isnull().sum()


In [None]:
# Separate the target column from the predictor columns
target_column = 'Survival_Rate'
y = data[target_column]                 # Target variable
X = data.drop(columns=[target_column])  # Every remaining column is a feature

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both partitions
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


In [None]:
# Standardize the features with StandardScaler: the scaling statistics are
# learned from the training split only and then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Preview the scaled training data with the original column names re-attached
scaled_preview = pd.DataFrame(X_train, columns=X.columns)
scaled_preview.head()

In [None]:
# Build a Random Forest classifier (seeded so results are repeatable) and fit it
# on the training split in one step; `fit` returns the fitted estimator itself
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test)


In [None]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Print confusion matrix. The label order is made explicit (sorted union of the
# true and predicted labels) so the plot's tick labels provably line up with the
# matrix rows/columns.
class_labels = sorted(set(y_test) | set(y_pred))
print('Confusion Matrix:')
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)

# Tick labels: keep the friendly names only when there are exactly two classes;
# otherwise fall back to the raw class labels so the axes are never mislabeled.
# NOTE(review): the friendly names assume the first class in sorted order means
# "low survival" — confirm against the dataset.
if len(class_labels) == 2:
    tick_labels = ["Low Survival", "High Survival"]
else:
    tick_labels = [str(label) for label in class_labels]

# Plot the confusion matrix as an annotated heatmap. Drawn with matplotlib only,
# because seaborn (`sns`) is not among the imports in the notebook's import cell.
fig, ax = plt.subplots(figsize=(6, 6))
image = ax.imshow(conf_matrix, cmap="Blues")
fig.colorbar(image, ax=ax)
ax.set_xticks(range(len(tick_labels)))
ax.set_xticklabels(tick_labels)
ax.set_yticks(range(len(tick_labels)))
ax.set_yticklabels(tick_labels)

# Write the count in every cell; switch to white text on dark cells for contrast
contrast_threshold = conf_matrix.max() / 2
for row in range(conf_matrix.shape[0]):
    for col in range(conf_matrix.shape[1]):
        count = conf_matrix[row, col]
        ax.text(
            col,
            row,
            format(count, "d"),
            ha="center",
            va="center",
            color="white" if count > contrast_threshold else "black",
        )

ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# Get feature importances from the fitted model
feature_importances = model.feature_importances_

# Pair every feature name with its importance score
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort features by importance, most important first
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances as horizontal bars. Drawn with matplotlib only,
# because seaborn (`sns`) is not among the imports in the notebook's import cell.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    feature_importance_df['Feature'].astype(str),
    feature_importance_df['Importance'],
)
# barh places the first row at the bottom; invert so the most important feature is on top
ax.invert_yaxis()
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance")
plt.show()


In [None]:
# Example: Making a prediction for a new input (replace with actual input data)
new_data = [[1, 2, 34, 1, 1, 3, 0, 1, 2, 0]]  # Example input (adjust according to your features)

# Attach the training feature names to the new record before scaling. This
# (a) fails immediately with a clear error if the number of values does not match
#     the number of feature columns in X, and
# (b) avoids scikit-learn's "X does not have valid feature names" warning, which
#     is emitted when a scaler fitted on a DataFrame is given a bare list.
# Values must be listed in the same order as X.columns.
new_data_frame = pd.DataFrame(new_data, columns=X.columns)

new_data_scaled = scaler.transform(new_data_frame)
prediction = model.predict(new_data_scaled)
print(f"Predicted Survival Rate: {prediction[0]}")


In [None]:
# Plot relationship between Age and Survival Rate: one scatter series per
# distinct 'Survival_Rate' value, each with its own viridis colour and legend entry.
# Drawn with matplotlib only, because seaborn (`sns`) is not among the imports in
# the notebook's import cell.
# NOTE(review): assumes 'Survival_Rate' takes a small number of distinct values
# (it is used as a classification target) — confirm; rows with a missing
# 'Survival_Rate' are skipped by the groupby.
fig, ax = plt.subplots(figsize=(8, 5))

survival_groups = list(data.groupby('Survival_Rate'))
# Resample viridis to exactly one colour per group (at least one to stay valid on empty data)
palette = plt.get_cmap('viridis', max(len(survival_groups), 1))

for group_index, (survival_value, group_rows) in enumerate(survival_groups):
    ax.scatter(
        group_rows['Age'],
        group_rows['Survival_Rate'],
        color=palette(group_index),
        label=str(survival_value),
    )

ax.set_xlabel("Age")
ax.set_ylabel("Survival_Rate")
if survival_groups:
    ax.legend(title="Survival_Rate")
ax.set_title("Age vs Survival Rate")
plt.show()
