# Problem 1: Predicting Number of Attacks Next Month

## Checking pre prerequisites before starting to build the model

In [126]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_theme (style="darkgrid")


In [179]:
df = pd.read_csv("linear_regression_dataset.csv")

prob1_dataset = df

In [84]:
prob1_dataset.head()

Unnamed: 0,time_start,launched,destroyed,model,launched_next_month
0,2025-04-12 20:30:00,55.0,43.0,Shahed-136/131,88.0
1,2025-04-11 20:00:00,88.0,56.0,Shahed-136/131,1.0
2,2025-04-12 06:00:00,1.0,1.0,Молнія,1.0
3,2025-04-12 06:00:00,1.0,1.0,Lancet,39.0
4,2025-04-10 09:00:00,39.0,24.0,Shahed-136/131,1.0


In [180]:
### Dropping any null values
prob1_dataset = prob1_dataset.dropna()

In [189]:
prob1_dataset['time_start'] = pd.to_datetime(prob1_dataset['time_start'])

# Extracting month and day from time star
prob1_dataset['month'] = prob1_dataset['time_start'].dt.month
prob1_dataset['day'] = prob1_dataset['time_start'].dt.day

# Now drop the datetime column
prob1_dataset.drop(columns=['time_start'], inplace=True)

In [91]:
prob1_dataset.dtypes

time_start             datetime64[ns]
launched                      float64
destroyed                     float64
model                          object
launched_next_month           float64
month                           int32
day                             int32
dtype: object

In [182]:
# One hot encoding the model column as it is an object dt which the model cannot read
prob1_dataset = pd.get_dummies(prob1_dataset, columns=['model'])


## Building the model

In [190]:
# Setting features and target
X = prob1_dataset.drop(columns=['launched_next_month'])
y = prob1_dataset['launched_next_month']

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")

RMSE: 25.83


In [166]:
y_mean = y.mean()
print('Average number of missiles launched next month',f"{y_mean:.2f}")  # Prints average number of missiles launched next month


Average number of missiles launched next month 13.70


### Re creating the model with extra proccessing for increased performance

In [183]:
# Extract additional date features
prob1_dataset['day_of_week'] = prob1_dataset['time_start'].dt.dayofweek  # 0=Monday, 6=Sunday
prob1_dataset['week_of_month'] = prob1_dataset['time_start'].dt.isocalendar().week % 4  # Week of the month
prob1_dataset['month'] = prob1_dataset['time_start'].dt.month  # Month (1-12)


### Adding rolling averages

In [184]:
# Sort by time (important for rolling functions)
prob1_dataset.sort_values('time_start', inplace=True)

# Create rolling average of 'launched' for the past 7 days
prob1_dataset['launched_rolling_avg_7'] = prob1_dataset['launched'].rolling(window=7, min_periods=1).mean()

# Create rolling average of 'launched' for the past 30 days
prob1_dataset['launched_rolling_avg_30'] = prob1_dataset['launched'].rolling(window=30, min_periods=1).mean()


### Trying Random Forest to see if theres any greater performance increase

### Re-modelling using hyperparameter tuning

In [185]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],   # Max depth of trees
    'min_samples_split': [2, 5, 10],   # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],     # Minimum samples required at a leaf node
    'bootstrap': [True, False]         # Whether bootstrap samples are used
}

# Initialize the model
rf = RandomForestRegressor(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit grid search
grid_search.fit(X_train, y_train)

# Best hyperparameters
print(f"Best hyperparameters: {grid_search.best_params_}")

# Train model with best parameters
best_rf_model = grid_search.best_estimator_

# Predict with the best model
y_pred = best_rf_model.predict(X_test)

# Evaluate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE after hyperparameter tuning: {rmse:.2f}")


Fitting 3 folds for each of 216 candidates, totalling 648 fits


ValueError: 
All the 648 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
648 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sonny bell\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sonny bell\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\sonny bell\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\ensemble\_forest.py", line 360, in fit
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
        ^^^^^
    ...<5 lines>...
        ensure_all_finite=False,
        ^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\sonny bell\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sonny bell\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "c:\Users\sonny bell\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 931, in check_array
    dtype_orig = np.result_type(*dtypes_orig)
numpy.exceptions.DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>)


In [186]:
# Create a lag feature for 'launched' — past 1 month
prob1_dataset['launched_lag_1'] = prob1_dataset['launched'].shift(1)

# Create a lag feature for 'destroyed' — past 1 month
prob1_dataset['destroyed_lag_1'] = prob1_dataset['destroyed'].shift(1)

# Create a lag feature for 'launched' — past 3 months
prob1_dataset['launched_lag_3'] = prob1_dataset['launched'].shift(3)

# Drop missing values (due to shifting)
prob1_dataset.dropna(inplace=True)


In [168]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Drop 'time_start' and any other non-numeric columns that remain
prob1_dataset.drop(columns=['time_start'], inplace=True)

# Handle missing values after creating rolling averages (rolling averages will have NaN initially)
prob1_dataset.dropna(inplace=True)

# Prepare features (X) and target (y)
X = prob1_dataset.drop(columns=['launched_next_month'])
y = prob1_dataset['launched_next_month']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE after improvements: {rmse:.2f}")


KeyError: "['time_start'] not found in axis"

In [156]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Initialize the XGBoost model
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = xgb_model.predict(X_test)

# RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE with XGBoost: {rmse:.2f}")


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:time_start: datetime64[ns]

In [187]:
# Check the number of features in X_train
print(X_train.shape[1])

# Check the length of the feature importances
print(len(importances))


67
70


In [188]:
y_mean = y.mean()
print('Average number of missiles launched next month',f"{y_mean:.2f}")  # Prints average number of missiles launched next month


Average number of missiles launched next month 13.70
