In [None]:
# Install necessary libraries
!pip install pandas numpy matplotlib seaborn scikit-learn

# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor  # Importing a model for demonstration

# Load the CSV file.
# The path is relative to the notebook's working directory so the notebook can
# be re-run on another machine: extract `worldometer_data.csv` from the
# downloaded archive next to this notebook, or point DATA_PATH at its location.
DATA_PATH = "worldometer_data.csv"
df = pd.read_csv(DATA_PATH)

# Fill missing values: 0 for numeric columns, "Unknown" for everything else.
# A blanket fillna(0) would write the integer 0 into text columns (e.g. the
# continent used as a plot category below), creating a bogus "0" category.
fill_values = {
    col: 0 if pd.api.types.is_numeric_dtype(df[col]) else "Unknown"
    for col in df.columns
}
df = df.fillna(fill_values)

# Clean column names: spaces become underscores and everything is lower-cased.
# Other punctuation is left as is, which is why later cells refer to a column
# named 'serious,critical'.
df.columns = df.columns.str.replace(' ', '_').str.lower()

# Drop duplicate rows
df.drop_duplicates(inplace=True)

# Guard the denominators. Missing values were filled with 0 earlier, so a row
# without a recorded case count or population would otherwise yield inf (x / 0)
# or NaN (0 / 0). Treating 0 as missing turns every undefined ratio into NaN.
total_cases = df['totalcases'].replace(0, np.nan)
population = df['population'].replace(0, np.nan)

# Calculate mortality rate and recovery rate (percentage of total cases)
df['mortality_rate'] = (df['totaldeaths'] / total_cases) * 100
df['recovery_rate'] = (df['totalrecovered'] / total_cases) * 100

# Calculate cases and deaths per million inhabitants
df['cases_per_million'] = df['totalcases'] / population * 1e6
df['deaths_per_million'] = df['totaldeaths'] / population * 1e6

# Plot total cases by continent.
# sns.barplot aggregates with the mean by default, which would display the
# average cases per row rather than the total promised by the title. Summing
# up front gives one value per continent, so no estimator or error-bar
# argument (the deprecated `ci=`) is needed. sort=False keeps the continents
# in order of first appearance, as the original plot did.
cases_by_continent = df.groupby('continent', sort=False, as_index=False)['totalcases'].sum()

plt.figure(figsize=(10, 6))
sns.barplot(data=cases_by_continent, x='continent', y='totalcases')
plt.title('Total Cases by Continent')
plt.xticks(rotation=45)
plt.show()

# Select every numeric column for the correlation matrix ('number' covers all
# integer and float widths, not only the 64-bit ones).
# Infinite values (a ratio whose denominator was 0) are treated as missing;
# left in place they would turn that column's correlations into NaN.
numeric_data = df.select_dtypes(include='number').replace([np.inf, -np.inf], np.nan)

# Correlation heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Mortality rate against recovery rate, one marker per row, coloured by continent
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x='mortality_rate',
    y='recovery_rate',
    hue='continent',
    data=df,
    ax=ax,
)
ax.set_title('Mortality vs. Recovery Rates')
plt.show()

# Select features and target variable
FEATURE_COLUMNS = ['population', 'totalcases', 'activecases', 'serious,critical', 'totaltests']
TARGET_COLUMN = 'mortality_rate'

# Keep only rows whose target is a finite number. The rate is a ratio, so it is
# NaN/inf wherever its denominator is 0, and scikit-learn raises on such values.
# When every target is finite this selects all rows, in their original order.
has_valid_target = np.isfinite(df[TARGET_COLUMN])
features = df.loc[has_valid_target, FEATURE_COLUMNS]
target = df.loc[has_valid_target, TARGET_COLUMN]

# Split data into training and testing sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit a regression model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Calculate RMSE; it is expressed in the same unit as the target
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # Take square root of MSE
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Compare the model's predictions with the held-out mortality rates
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.6)
ax.set_title('Actual vs Predicted Mortality Rates')
ax.set_xlabel('Actual Mortality Rate')
ax.set_ylabel('Predicted Mortality Rate')
plt.show()

# Importance the fitted model assigns to each input column
feature_names = features.columns
feature_importance = model.feature_importances_

# One horizontal bar per feature
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importance, y=feature_names, ax=ax)
ax.set_title('Feature Importance')
plt.show()

# Write the processed frame (derived columns included) to the working
# directory, leaving out the row index
df.to_csv(path_or_buf='cleaned_data.csv', index=False)


In [None]:
# Install necessary libraries
!pip install pandas numpy matplotlib seaborn scikit-learn

# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor  # Importing a model for demonstration

# Load the CSV file.
# The path is relative to the notebook's working directory so the notebook can
# be re-run on another machine: extract `worldometer_data.csv` from the
# downloaded archive next to this notebook, or point DATA_PATH at its location.
DATA_PATH = "worldometer_data.csv"
df = pd.read_csv(DATA_PATH)

# Fill missing values: 0 for numeric columns, "Unknown" for everything else.
# A blanket fillna(0) would write the integer 0 into text columns (e.g. the
# continent used as a plot category below), creating a bogus "0" category.
fill_values = {
    col: 0 if pd.api.types.is_numeric_dtype(df[col]) else "Unknown"
    for col in df.columns
}
df = df.fillna(fill_values)

# Clean column names: spaces become underscores and everything is lower-cased.
# Other punctuation is left as is, which is why later cells refer to a column
# named 'serious,critical'.
df.columns = df.columns.str.replace(' ', '_').str.lower()

# Drop duplicate rows
df.drop_duplicates(inplace=True)

# Guard the denominators. Missing values were filled with 0 earlier, so a row
# without a recorded case count or population would otherwise yield inf (x / 0)
# or NaN (0 / 0). Treating 0 as missing turns every undefined ratio into NaN.
total_cases = df['totalcases'].replace(0, np.nan)
population = df['population'].replace(0, np.nan)

# Calculate mortality rate and recovery rate (percentage of total cases)
df['mortality_rate'] = (df['totaldeaths'] / total_cases) * 100
df['recovery_rate'] = (df['totalrecovered'] / total_cases) * 100

# Calculate cases and deaths per million inhabitants
df['cases_per_million'] = df['totalcases'] / population * 1e6
df['deaths_per_million'] = df['totaldeaths'] / population * 1e6

# Plot total cases by continent.
# sns.barplot aggregates with the mean by default, which would display the
# average cases per row rather than the total promised by the title. Summing
# up front gives one value per continent, so no estimator or error-bar
# argument (the deprecated `ci=`) is needed. sort=False keeps the continents
# in order of first appearance, as the original plot did.
cases_by_continent = df.groupby('continent', sort=False, as_index=False)['totalcases'].sum()

plt.figure(figsize=(10, 6))
sns.barplot(data=cases_by_continent, x='continent', y='totalcases')
plt.title('Total Cases by Continent')
plt.xticks(rotation=45)
plt.show()

# Select every numeric column for the correlation matrix ('number' covers all
# integer and float widths, not only the 64-bit ones).
# Infinite values (a ratio whose denominator was 0) are treated as missing;
# left in place they would turn that column's correlations into NaN.
numeric_data = df.select_dtypes(include='number').replace([np.inf, -np.inf], np.nan)

# Correlation heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Mortality rate against recovery rate, one marker per row, coloured by continent
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x='mortality_rate',
    y='recovery_rate',
    hue='continent',
    data=df,
    ax=ax,
)
ax.set_title('Mortality vs. Recovery Rates')
plt.show()

# Select features and target variable
FEATURE_COLUMNS = ['population', 'totalcases', 'activecases', 'serious,critical', 'totaltests']
TARGET_COLUMN = 'mortality_rate'

# Keep only rows whose target is a finite number. The rate is a ratio, so it is
# NaN/inf wherever its denominator is 0, and scikit-learn raises on such values.
# When every target is finite this selects all rows, in their original order.
has_valid_target = np.isfinite(df[TARGET_COLUMN])
features = df.loc[has_valid_target, FEATURE_COLUMNS]
target = df.loc[has_valid_target, TARGET_COLUMN]

# Split data into training and testing sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit a regression model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Calculate RMSE; it is expressed in the same unit as the target
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # Take square root of MSE
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Compare the model's predictions with the held-out mortality rates
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.6)
ax.set_title('Actual vs Predicted Mortality Rates')
ax.set_xlabel('Actual Mortality Rate')
ax.set_ylabel('Predicted Mortality Rate')
plt.show()

# Importance the fitted model assigns to each input column
feature_names = features.columns
feature_importance = model.feature_importances_

# One horizontal bar per feature
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importance, y=feature_names, ax=ax)
ax.set_title('Feature Importance')
plt.show()

# Write the processed frame (derived columns included) to the working
# directory, leaving out the row index
df.to_csv(path_or_buf='cleaned_data.csv', index=False)
