imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import pymannkendall as mk
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
from scipy.stats import chi2_contingency, kruskal
from statsmodels.tsa.stattools import ccf
import warnings
warnings.filterwarnings('ignore')

initialization

In [None]:
# Create output directory for plots.
# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
output_dir = "statistical_test_results"
os.makedirs(output_dir, exist_ok=True)

# Load the dataset
df = pd.read_csv("Egypt_terr_augmented.csv")
print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")

# Basic data preparation
# The inflation series is read from the last column of the file, so capture its
# name now, before any derived column (such as 'date') is appended to df.
last_column = df.columns[-1]

# Convert date components to datetime; component combinations that do not form
# a valid date become NaT instead of raising (errors='coerce').
df['date'] = pd.to_datetime({'year': df['iyear'],
                            'month': df['imonth'],
                            'day': df['iday']}, errors='coerce')

# Extract inflation data from the last column in the same dataset.
# Take exactly one value per year (the first non-null one). A plain
# drop_duplicates() over (year, value) pairs would keep several rows for a year
# whose inflation value is missing or inconsistent on some incident rows, and
# the merge below would then duplicate that year in the yearly series.
inflation_data = df.groupby('iyear', as_index=False)[last_column].first()
print(f"Inflation data extracted with {inflation_data.shape[0]} unique year entries")

# Create yearly aggregation for time series analysis.
# Only years that have at least one row in df appear here.
yearly_counts = df.groupby('iyear').size().reset_index(name='incident_count')
# validate= makes pandas raise if either side ever has a repeated year, rather
# than silently multiplying rows.
yearly_counts = pd.merge(yearly_counts, inflation_data, on='iyear', how='left',
                         validate='one_to_one')

# Create regional aggregation, most-affected region first
region_counts = df.groupby('provstate').size().reset_index(name='incident_count')
region_counts = region_counts.sort_values('incident_count', ascending=False)

# Calculate fatalities by region (total, mean per incident, non-null count),
# ordered by total fatalities
region_fatalities = df.groupby('provstate')['nkill'].agg(['sum', 'mean', 'count']).reset_index()
region_fatalities = region_fatalities.sort_values('sum', ascending=False)


Dataset loaded with 2478 rows and 46 columns
Inflation data extracted with 36 unique year entries


Test 1: The frequency of terrorism incidents has significantly increased
over the studied period (1970-2017).

In [None]:
# Test 1: Mann-Kendall trend test on the yearly incident counts.
# NOTE(review): yearly_counts is built from a groupby on 'iyear', so a year with
# no rows is absent from the series rather than present with a count of 0.
# Confirm whether such years should be zero-filled before testing for a trend.
result = mk.original_test(yearly_counts['incident_count'])

# Translate the trend label returned by the test into the capitalised report
# label; anything other than increasing/decreasing is reported as "No trend".
trend_labels = {'increasing': 'Increasing', 'decreasing': 'Decreasing'}
print(f"Trend: {trend_labels.get(result.trend, 'No trend')}")

# Remaining test statistics, printed one per line in a fixed order
report_lines = [
    ("h (hypothesis)", f"{result.h}"),
    ("p-value", f"{result.p:.6f}"),
    ("z-value", f"{result.z:.6f}"),
    ("Tau", f"{result.Tau:.6f}"),
    ("s", f"{result.s}"),
    ("var_s", f"{result.var_s}"),
    ("slope", f"{result.slope:.6f}"),
    ("intercept", f"{result.intercept:.6f}"),
]
for label, text in report_lines:
    print(f"{label}: {text}")

# Visualize the trend: incidents per year, saved to the output directory
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(yearly_counts['iyear'], yearly_counts['incident_count'], marker='o')
ax.set_title('Terrorism Incidents in Egypt (1970-2017)')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Incidents')
ax.grid(True)
fig.savefig(os.path.join(output_dir, "Test_1.png"))
plt.close(fig)

Trend: Increasing
h (hypothesis): True
p-value: 0.000127
z-value: 3.831878
Tau: 0.444444
s: 280.0
var_s: 5301.333333333333
slope: 1.000000
intercept: -11.500000


Test 2: Certain geographical regions within Egypt are disproportionately
affected by terrorism incidents.

In [None]:
# 2. Chi-Square Test for Regional Distribution
TOP_N_REGIONS = 10   # how many of the most-affected regions to compare
ALPHA = 0.05         # significance level for the test decision

# Incidents whose province is recorded as "Unknown" are not a geographic
# region, so they are excluded before ranking; otherwise that catch-all bucket
# takes a slot among the top regions and enters the test as if it were a place.
is_known_region = (
    region_counts['provstate'].astype(str).str.strip().str.lower() != 'unknown'
)
# region_counts is already sorted by incident_count, descending
top_regions = region_counts[is_known_region].head(TOP_N_REGIONS)
n_regions = len(top_regions)
print(f"Top {n_regions} regions by incident count:")
print(top_regions)

# Chi-square goodness-of-fit test against an equal split of incidents across
# the selected regions.
# NOTE(review): the regions tested are the ones with the highest counts, i.e.
# they are selected by the same quantity being tested. Consider running the
# test over all known regions and keeping the top-N subset for the plot only.
observed = top_regions['incident_count'].values
expected = np.full(len(observed), observed.sum() / len(observed))
chi2, p_value = stats.chisquare(observed, expected)
print(f"Chi-square statistic: {chi2:.4f}")
# Fixed-point formatting renders very small p-values as 0.0000000000; use
# scientific notation, and report a bound when the value underflows to zero.
p_value_text = f"{p_value:.3e}" if p_value > 0 else "< 1e-300"
print(f"p-value: {p_value_text}")
print(f"Degrees of freedom: {len(observed) - 1}")
if p_value < ALPHA:
    print("Result: Reject null hypothesis - regions are disproportionately affected")
else:
    print("Result: Fail to reject null hypothesis - no significant disproportion")

# Visualize regional distribution
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(x='provstate', y='incident_count', data=top_regions, ax=ax)
ax.set_title(f'Top {n_regions} Regions by Terrorism Incidents in Egypt (1970-2017)')
ax.set_xlabel('Region')
ax.set_ylabel('Number of Incidents')
# Rotate the region names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "Test_2.png"))
plt.close(fig)


Top 10 regions by incident count:
      provstate  incident_count
27  North Sinai            1276
11        Cairo             227
23        Minya             187
8         Asyut             128
17         Giza             117
4    Al Sharqia              61
15       Faiyum              60
5    Alexandria              51
42      Unknown              40
16      Gharbia              30
Chi-square statistic: 5894.5158
p-value: 0.0000000000
Degrees of freedom: 9
Result: Reject null hypothesis - regions are disproportionately affected
