In [3]:
# train_model.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import joblib


In [9]:
# Load your dataset
# Load your dataset with a different encoding
data = pd.read_csv("TerrDB1.csv", encoding='ISO-8859-1')


# Fill missing or blank 'nkill' values with 0
data['nkill'] = data['nkill'].apply(lambda x: 0 if pd.isnull(x) or x == '' else x)

# Verify that 'nkill' has been handled
print(data['nkill'].isnull().sum())  # Should output 0 for missing values


0


In [11]:
# Separate features (X) and target variable (y)
X = data.drop(columns=['nkill'])  # Independent variables
y = data['nkill'].apply(lambda x: 0 if x == 0 else (1 if x == 1 or x == 2 else 2))  # Categorized target variable

# Impute missing values in independent variables
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Verify that the imputation was successful
print(X_imputed.isnull().sum())  # Should output 0 for missing values


Unnamed: 0    0
eventid       0
iyear         0
imonth        0
iday          0
             ..
INT_LOG       0
INT_IDEO      0
INT_MISC      0
INT_ANY       0
related       0
Length: 72, dtype: int64


In [13]:
# One-hot encoding for categorical variables
categorical_columns = X_imputed.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = pd.DataFrame(encoder.fit_transform(X_imputed[categorical_columns]), columns=encoder.get_feature_names_out(categorical_columns))

# Combine the encoded categorical columns with the rest of the data
X_imputed = X_imputed.drop(columns=categorical_columns)
X = pd.concat([X_imputed, X_encoded], axis=1)

# Verify the final feature set
print(X.head())


   Unnamed: 0_1  Unnamed: 0_2  Unnamed: 0_3  Unnamed: 0_4  Unnamed: 0_5  \
0           0.0           0.0           0.0           0.0           0.0   
1           1.0           0.0           0.0           0.0           0.0   
2           0.0           1.0           0.0           0.0           0.0   
3           0.0           0.0           1.0           0.0           0.0   
4           0.0           0.0           0.0           1.0           0.0   

   Unnamed: 0_6  Unnamed: 0_7  Unnamed: 0_8  Unnamed: 0_9  Unnamed: 0_10  ...  \
0           0.0           0.0           0.0           0.0            0.0  ...   
1           0.0           0.0           0.0           0.0            0.0  ...   
2           0.0           0.0           0.0           0.0            0.0  ...   
3           0.0           0.0           0.0           0.0            0.0  ...   
4           0.0           0.0           0.0           0.0            0.0  ...   

   related_202003050018, 202003050023  related_202004010004, 2

In [15]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the shape of the data
print(X_train.shape, X_test.shape)  # Verify if the data split worked correctly


(11143, 81822) (2786, 81822)


In [17]:
# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Save the trained model and encoder to files
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(encoder, 'one_hot_encoder.pkl')
joblib.dump(imputer, 'imputer.pkl')  # Save the imputer for independent variables


['imputer.pkl']