In [2]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from geopy.distance import geodesic
import os

os.environ["LOKY_MAX_CPU_COUNT"] = "4"

train_data = pd.read_csv('/content/train_data.xlsx - Sheet1.csv')

train_data = train_data.dropna(subset=['Lat', 'Long_'])

outbreak_centroid = (train_data[['Lat', 'Long_']].median().values)
train_data['Distance_to_Centroid'] = train_data.apply(
    lambda row: geodesic((row['Lat'], row['Long_']), outbreak_centroid).km,
    axis=1
)

train_data['Confirmed_Cases'] = np.where(
    (train_data['Deaths'].notna()) &
    (train_data['Case_Fatality_Ratio'].notna()) &
    (train_data['Case_Fatality_Ratio'] != 0),
    (train_data['Deaths'] / train_data['Case_Fatality_Ratio']) * 100,
    np.nan
)

train_data.loc[train_data['Case_Fatality_Ratio'] > 100, 'Case_Fatality_Ratio'] = np.nan

imputer = IterativeImputer(random_state=42)
features = ['Lat', 'Long_', 'Distance_to_Centroid']
targets = ['Deaths', 'Case_Fatality_Ratio', 'Confirmed_Cases']
impute_data = train_data[features + targets]
imputed_data = pd.DataFrame(imputer.fit_transform(impute_data), columns=impute_data.columns)
train_data[targets] = imputed_data[targets]

X = train_data[features]
y = train_data['Case_Fatality_Ratio']

nan_indices = y.isna()
if nan_indices.any():
    print(f"Dropping {nan_indices.sum()} rows with NaN in 'Case_Fatality_Ratio'.")
    X = X[~nan_indices]
    y = y[~nan_indices]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

param_grid = {
    'n_estimators': [200, 300, 400],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 4, 6],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1
)

grid_search.fit(X_train, y_train_scaled.ravel())

cv_scores = cross_val_score(
    grid_search.best_estimator_,
    X_train,
    y_train_scaled.ravel(),
    scoring='neg_mean_squared_error',
    cv=5
)
cv_rmse = np.sqrt(-cv_scores)
print(f"Cross-Validation RMSE: {cv_rmse.mean():.4f} (±{cv_rmse.std():.4f})")

y_train_pred_scaled = grid_search.predict(X_train)
y_train_pred_original = scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1))
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_original))

y_test_pred_scaled = grid_search.predict(X_test)
y_test_pred_original = scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_original))

print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
feature_importances = grid_search.best_estimator_.feature_importances_
feature_names = features
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("\nFeature Importances:")
print(importance_df)

test_data = pd.read_csv('/content/test_points.xlsx - Sheet1.csv')

test_data = test_data.dropna(subset=['Lat', 'Long_'])

test_data['Distance_to_Centroid'] = test_data.apply(import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from geopy.distance import geodesic
import os

os.environ["LOKY_MAX_CPU_COUNT"] = "4"

train_data = pd.read_csv("C:/Users\ARYAN\Downloads/train_data.xlsx - Sheet1.csv")

train_data = train_data.dropna(subset=['Lat', 'Long_'])

outbreak_centroid = (train_data[['Lat', 'Long_']].median().values)
train_data['Distance_to_Centroid'] = train_data.apply(
    lambda row: geodesic((row['Lat'], row['Long_']), outbreak_centroid).km,
    axis=1
)

train_data['Confirmed_Cases'] = np.where(
    (train_data['Deaths'].notna()) &
    (train_data['Case_Fatality_Ratio'].notna()) &
    (train_data['Case_Fatality_Ratio'] != 0),
    (train_data['Deaths'] / train_data['Case_Fatality_Ratio']) * 100,
    np.nan
)

train_data.loc[train_data['Case_Fatality_Ratio'] > 100, 'Case_Fatality_Ratio'] = np.nan

imputer = IterativeImputer(random_state=42)
features = ['Lat', 'Long_', 'Distance_to_Centroid']
targets = ['Deaths', 'Case_Fatality_Ratio', 'Confirmed_Cases']
impute_data = train_data[features + targets]
imputed_data = pd.DataFrame(imputer.fit_transform(impute_data), columns=impute_data.columns)
train_data[targets] = imputed_data[targets]

X = train_data[features]
y = train_data['Case_Fatality_Ratio']

nan_indices = y.isna()
if nan_indices.any():
    print(f"Dropping {nan_indices.sum()} rows with NaN in 'Case_Fatality_Ratio'.")
    X = X[~nan_indices]
    y = y[~nan_indices]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

param_grid = {
    'n_estimators': [300, 400, 500],
    'learning_rate': [0.05, 0.1, 0.15],
    'max_depth': [6, 8, 10],
    'min_samples_split': [2, 4, 8],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1
)

grid_search.fit(X_train, y_train_scaled.ravel())

cv_scores = cross_val_score(
    grid_search.best_estimator_,
    X_train,
    y_train_scaled.ravel(),
    scoring='neg_mean_squared_error',
    cv=5
)
cv_rmse = np.sqrt(-cv_scores)
print(f"Cross-Validation RMSE: {cv_rmse.mean():.4f} (±{cv_rmse.std():.4f})")

y_train_pred_scaled = grid_search.predict(X_train)
y_train_pred_original = scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1))
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_original))

y_test_pred_scaled = grid_search.predict(X_test)
y_test_pred_original = scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_original))

print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
feature_importances = grid_search.best_estimator_.feature_importances_
feature_names = features
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("\nFeature Importances:")
print(importance_df)

test_data = pd.read_csv("C:/Users\ARYAN\Downloads/test_points.xlsx - Sheet1.csv")

test_data = test_data.dropna(subset=['Lat', 'Long_'])

test_data['Distance_to_Centroid'] = test_data.apply(
    lambda row: geodesic((row['Lat'], row['Long_']), outbreak_centroid).km,
    axis=1
)

X_test_data = test_data[features]
test_data['Predicted_CFR'] = scaler.inverse_transform(grid_search.predict(X_test_data).reshape(-1, 1))

X_deaths = train_data[features]
y_deaths = train_data['Deaths']

nan_indices = y_deaths.isna()
if nan_indices.any():
    print(f"Dropping {nan_indices.sum()} rows with NaN in 'Deaths'.")
    X_deaths = X_deaths[~nan_indices]
    y_deaths = y_deaths[~nan_indices]

deaths_model = GradientBoostingRegressor(n_estimators=400, learning_rate=0.05, max_depth=8, random_state=42)
deaths_model.fit(X_deaths, y_deaths)

test_data['Predicted_Deaths'] = deaths_model.predict(X_test_data)

test_data['Confirmed_Cases'] = (test_data['Predicted_Deaths'] / test_data['Predicted_CFR']) * 100  # CFR = (Deaths / Confirmed) * 100

test_data[['Lat', 'Long_', 'Predicted_CFR', 'Predicted_Deaths', 'Confirmed_Cases']].to_csv("C:\diploma\predictions.csv.csv", index=False)

print("Predictions saved to 'test_pred.csv'")

    lambda row: geodesic((row['Lat'], row['Long_']), outbreak_centroid).km,
    axis=1
)

X_test_data = test_data[features]
test_data['Predicted_CFR'] = scaler.inverse_transform(grid_search.predict(X_test_data).reshape(-1, 1))

X_deaths = train_data[features]
y_deaths = train_data['Deaths']

nan_indices = y_deaths.isna()
if nan_indices.any():
    print(f"Dropping {nan_indices.sum()} rows with NaN in 'Deaths'.")
    X_deaths = X_deaths[~nan_indices]
    y_deaths = y_deaths[~nan_indices]

deaths_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42)
deaths_model.fit(X_deaths, y_deaths)

test_data['Predicted_Deaths'] = deaths_model.predict(X_test_data)

test_data['Confirmed_Cases'] = (test_data['Predicted_Deaths'] / test_data['Predicted_CFR']) * 100  # CFR = (Deaths / Confirmed) * 100

test_data[['Lat', 'Long_', 'Predicted_CFR', 'Predicted_Deaths', 'Confirmed_Cases']].to_csv('test_pred.csv', index=False)

print("Predictions saved to 'test_pred.csv'")


Dropping 88 rows with NaN in 'Case_Fatality_Ratio'.
Cross-Validation RMSE: 0.1028 (±0.0047)
Train RMSE: 0.6588
Test RMSE: 0.8393

Feature Importances:
                Feature  Importance
0                   Lat    0.362801
1                 Long_    0.335968
2  Distance_to_Centroid    0.301231
Dropping 88 rows with NaN in 'Deaths'.
Predictions saved to 'test_pred.csv'
