In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve, precision_score, recall_score


### Original DataFrame 

In [None]:
# Load the raw dataset from the Excel workbook.
df = pd.read_excel('dataviz.xlsx')

# Class balance of the target column (shown as the cell output).
target_counts = df['conflict (within 5 years)'].value_counts()
target_counts

In [None]:
# Identifier columns followed by the target; kept aside to build the y frames later.
col_y = ['code', 'country_name', 'year'] + ['conflict (within 5 years)']
dfy = df[col_y]


### Feature Selection

In [None]:
# Identifier columns used as the (code, country_name, year) MultiIndex.
col = ['code', 'country_name', 'year']

# Target column plus the features excluded from the model inputs.
columns_to_drop = [
    'conflict (within 5 years)', 'gni', 'judaism_orthodox',
    'judaism_conservative', 'control_corruption_rank', 'judaism_reform',
    'islam_sunni', 'islam_ibadhi', 'buddhism_other', 'orthodox_percent',
    'conservative_percent', 'shinto_percent', 'jainism_percent',
    'confucianism_percent', 'total_percent', 'religion_all', 'jainism_all',
    'sikhism_all', 'taoism_all', 'syncretism_all',
]

# Build dfx as a new frame. The previous `dfx = df` only aliased df, so the
# in-place drop also removed the columns from df, and re-running the cell
# failed because the index columns and dropped columns were already gone.
# df itself is now left untouched, which keeps this cell re-runnable.
dfx = df.set_index(col).drop(columns=columns_to_drop)

dfx.info()


In [None]:
# Detect features involved in an absolute correlation of at least 0.9.

correlation_matrix = dfx.corr().abs()

# Keep only the strong correlations; everything below 0.9 becomes NaN.
viz = correlation_matrix.mask(correlation_matrix < 0.9, np.nan)

# The loop variable is deliberately not called `col`, so the list of
# identifier columns defined in an earlier cell is not overwritten.
for feature in viz.columns:
    strong = viz[feature]
    # A feature's correlation with itself (1.0) survives the mask, so a column
    # is of interest only when at least one more entry survives. Counting the
    # surviving entries (instead of the distinct values) also reports features
    # that are perfectly correlated with another one, whose value of 1.0
    # would otherwise be indistinguishable from the diagonal.
    if strong.count() > 1:
        print("Colonnes:", feature)
        print("Nombre de valeurs uniques:", strong.value_counts())
        print("=" * 40)







In [None]:
# Heatmap of the absolute correlations above 0.9.
correlation_matrix = dfx.corr().abs()

# Boolean mask that hides the upper triangle (diagonal included).
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Entries at or below 0.9 become NaN and are therefore left blank.
strong_corr = correlation_matrix.where(correlation_matrix > 0.9)

plt.figure(figsize=(15, 20))
sns.heatmap(
    strong_corr,
    mask=mask,
    cmap='coolwarm',
    annot=False,
    linewidths=0.5,
)
plt.title('Heatmap général')
plt.show()

### Train/test split preparation

In [None]:
# Year-based split: train on 1995/2000/2005, test on 2010, predict on 2015.
col = ['code', 'country_name', 'year']

# Flat copy of the features with the identifiers as regular columns.
# dfx itself is not modified, so this cell can be re-run safely.
features_flat = dfx.reset_index()


def _split_for_years(years):
    """Return (X, y) restricted to the given list of years.

    X keeps the (code, country_name, year) MultiIndex and holds the feature
    columns of dfx. y is what remains of dfy once the identifier columns are
    dropped, i.e. a single-column DataFrame with the target.
    """
    X = features_flat[features_flat['year'].isin(years)].set_index(col)
    # drop() returns a new frame, so no in-place mutation of a filtered slice.
    y = dfy[dfy['year'].isin(years)].drop(columns=col)
    # X and y are matched by position downstream, so they must be the same length.
    assert len(X) == len(y), "features and target have different row counts"
    return X, y


# ------------ training set -------------------
X_train, y_train = _split_for_years([1995, 2000, 2005])

# ------------ test set -------------------
X_test, y_test = _split_for_years([2010])

# ------------ rows to predict -------------------
X_real, y_real = _split_for_years([2015])


In [None]:
# Fit the scaler on the training features only, then standardise them.
# The fitted scaler stays available in `scaler`.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)

### First train with Logistic Regression Model

In [None]:
# Logistic regression baseline, tuned by grid search on ROC AUC.

# liblinear supports both the 'l1' and 'l2' penalties searched below. The
# default lbfgs solver rejects 'l1', so half of the grid failed to fit.
model = LogisticRegression(solver='liblinear')

param_grid = {
    'C': np.arange(0.018, 0.025, step=0.001),
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 1000, 5000],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=2
)

# y_train is a single-column DataFrame; flatten it to the 1-D shape that
# scikit-learn expects, which avoids the column-vector conversion warning.
grid_search.fit(X_train_scaled, np.ravel(y_train))

best_model = grid_search.best_estimator_



In [None]:
# Report the best configuration and score of the most recent grid search.
best_params, best_auc = grid_search.best_params_, grid_search.best_score_
print("Meilleurs hyperparamètres :", best_params)
print("Meilleur score (AUC) :", best_auc)

### TRY OTHER MODEL

In [None]:
# Linear SVM, tuned by grid search on ROC AUC.

# dual=False is required for the 'l1' penalty (with the default squared hinge
# loss) and also supports 'l2', so every penalty in the grid can be fitted.
model = LinearSVC(dual=False)

param_grid = {
    # C must be strictly positive: the grid starts at 0.01 rather than 0,
    # a value for which every fit was rejected.
    'C': np.arange(0.01, 0.19, step=0.01),
    'penalty': ['l1', 'l2'],
    'max_iter': [5000, 5100, 5500],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=2
)

# y_train is a single-column DataFrame; flatten it to 1-D for scikit-learn.
grid_search.fit(X_train_scaled, np.ravel(y_train))

best_model = grid_search.best_estimator_


In [None]:
# Report the best configuration and score of the most recent grid search.
best_params, best_auc = grid_search.best_params_, grid_search.best_score_
print("Meilleurs hyperparamètres :", best_params)
print("Meilleur score (AUC) :", best_auc)

In [None]:


# Random forest on the unscaled features, tuned by grid search on ROC AUC.

# A fixed seed makes the search (and the selected model) reproducible.
model = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [150, 200, 250],
    # max_depth must be at least 1: the previous range started at 0, which
    # scikit-learn rejects, so those candidates never produced a score.
    'max_depth': range(1, 10),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=2
)

# y_train is a single-column DataFrame; flatten it to 1-D for scikit-learn.
grid_search.fit(X_train, np.ravel(y_train))

best_model = grid_search.best_estimator_


In [None]:
# Report the best configuration and score of the most recent grid search.
best_params, best_auc = grid_search.best_params_, grid_search.best_score_
print("Meilleurs hyperparamètres :", best_params)
print("Meilleur score (AUC) :", best_auc)

### USING RFECV

In [None]:

# rfecv = RFECV(estimator= RandomForestClassifier(),step=1,min_features_to_select = 1, cv=StratifiedKFold(n_splits=3), scoring='roc_auc')
# rfecv.fit(X_train, y_train.values.ravel())
# scores = rfecv.cv_results_['mean_test_score']

# print(scores)

# print(f"Optimal number of features: {rfecv.n_features_}")



In [None]:
# selected_features = rfecv.feature_names_in_[rfecv.support_]
# selected_features


In [None]:


# n_scores = len(rfecv.cv_results_["mean_test_score"])
# plt.figure()
# plt.xlabel("Number of features selected")
# plt.ylabel("Mean test ROC_AUC")
# plt.errorbar(
#     range(15, n_scores + 15),
#     rfecv.cv_results_["mean_test_score"],
#     yerr=rfecv.cv_results_["std_test_score"],
# )
# plt.title("Recursive Feature Elimination \nwith correlated features")
# plt.show()

In [None]:
# X_select = rfecv.transform(X_train)
# X_select_test = rfecv.transform(X_test)
# X_select_real = rfecv.transform(X_real)
# ---------------------------------------------------------------------------

# Feature subset retained after the first RFECV run.
col_f = [
    'population_total',
    'pcap',
    'primary_completion(rate)',
    'control_corruption_estimate',
    'army_weight',
    'battle_related(number of death)',
    'homicide_(per 100k people)',
    'christianity_all',
    'islam_all',
    'number of conflicts',
]

# Apply the same column selection to the train, test and prediction frames.
X_select, X_select_test, X_select_real = (
    frame[col_f] for frame in (X_train, X_test, X_real)
)


In [None]:
# rfecv = RFECV(estimator= DecisionTreeClassifier(),step=1,min_features_to_select = 1, cv=StratifiedKFold(n_splits=3), scoring='roc_auc')
# rfecv.fit(X_train, y_train.values.ravel())
# scores = rfecv.cv_results_['mean_test_score']

# print(scores)

# print(f"Optimal number of features: {rfecv.n_features_}")

In [None]:
# rfecv.feature_names_in_[rfecv.support_]

### TRY TO BOOST THE ALGORITHM

In [None]:
# Decision tree on the selected features, tuned by grid search on ROC AUC.

# A fixed seed makes the search reproducible.
model = DecisionTreeClassifier(random_state=42)

param_grid = {
    'max_depth': range(1, 10),
    # min_samples_split must be an integer of at least 2: the value 1 that was
    # previously in this list is rejected by scikit-learn and only produced
    # failed fits.
    'min_samples_split': [2, 3],
    'min_samples_leaf': [14, 13, 12],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=2,
    return_train_score=True
)

# y_train is a single-column DataFrame; flatten it to 1-D for scikit-learn.
grid_search.fit(X_select, np.ravel(y_train))

best_model = grid_search.best_estimator_

print("Meilleurs hyperparamètres :", grid_search.best_params_)
print("Meilleur score (AUC) :", grid_search.best_score_)
# Mean training-fold score of every candidate, to compare against the CV score.
print("Train score", grid_search.cv_results_['mean_train_score'])

In [None]:
# Final random forest on the selected features. The grid holds a single
# candidate, so GridSearchCV is used here only to obtain a cross-validated
# ROC AUC and a model refitted on the whole training set.

# A fixed seed makes the fitted model, and therefore the test metrics and the
# exported predictions, reproducible from one run to the next.
model = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200],
    'max_depth': [6],
    'min_samples_split': [9],
    'min_samples_leaf': [3],
    'bootstrap': [True]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=2,
    return_train_score=True
)

# y_train is a single-column DataFrame; flatten it to 1-D for scikit-learn.
grid_search.fit(X_select, np.ravel(y_train))

best_model = grid_search.best_estimator_

In [None]:
# Report the best configuration and score of the most recent grid search.
best_params, best_auc = grid_search.best_params_, grid_search.best_score_
print("Meilleurs hyperparamètres :", best_params)
print("Meilleur score (AUC) :", best_auc)
# print("Train score", grid_search.cv_results_['mean_train_score'])




### TEST OUR MODEL

In [None]:


# Decision threshold applied to the predicted probability of the positive class.
thr = 0.4

# Uses whichever estimator the most recently run grid-search cell stored in
# `best_model`; it must have been trained on the col_f feature subset.
y_pred_proba = best_model.predict_proba(X_select_test)[:, 1]

# One set of hard predictions for every threshold-based metric. Previously the
# confusion matrix used `> thr` while precision and recall used `>= thr`, so
# a score exactly equal to the threshold was counted differently.
y_pred = (y_pred_proba >= thr).astype(int)

roc_auc_RF = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Aire sous la courbe ROC (AUC) :", roc_auc_RF)
print("Précision :", precision)
print("Rappel :", recall)
print("confusion matrix :" , confusion)

In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap.
plt.figure(figsize=(6, 6))
ax = sns.heatmap(
    confusion,
    annot=True,
    fmt='d',
    cmap='Reds',
    cbar=False,
)
ax.set_xlabel('Prediction')
ax.set_ylabel('True Values')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:


# ROC curve of the test-set probabilities, with the chance diagonal as reference.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(10, 6))

# Diagonal first, model curve second: this keeps the legend and drawing order.
ax.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
ax.plot(fpr, tpr, color='b', lw=2, label=f'Random Forest (AUC = {roc_auc_RF:.2f})')

ax.set_title('Courbe ROC pour Random Forest')
ax.set_xlabel('Taux de Faux Positifs (FPR)')
ax.set_ylabel('Taux de Vrais Positifs (TPR)')
ax.legend()
ax.grid(True)

plt.show()

### Prediction of conflicts for 2020–2024

In [None]:
# Score the 2015 rows with the model held in `best_model`.
y_pred_proba_real = best_model.predict_proba(X_select_real)[:, 1]

# Copy the selected features so the prediction columns are not added to X_real.
data_real = X_real[col_f].copy()
data_real['pred_score'] = y_pred_proba_real
# Same threshold (and same >= comparison) as the one set in the test cell.
data_real['pred'] = data_real['pred_score'] >= thr

# Export every scored row; the index (code, country_name, year) is written too.
data_real.to_csv('Prediction_2020_2024.csv', index=True)

# Keep only the rows flagged as positive. The filter reuses the `pred` column
# rather than a hard-coded 0.4, so it follows `thr` if the threshold changes.
Prediction = data_real[data_real['pred']]

Prediction

