In [None]:
import pandas as pd 
import world_bank_data as wb
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
import pycountry
import numpy as np

### FONCTIONS DE PRÉ-TRAITEMENT DES DATAFRAMES

In [None]:
# Helper that builds one World Bank feature table.

def generate_df(indicator):
    """Fetch one World Bank indicator for every year from 1995 to 2019.

    ``wb.get_series`` is called once per year and the yearly results are
    stacked on top of each other.

    Parameters
    ----------
    indicator : str
        Indicator code passed to ``wb.get_series`` (e.g. ``'SP.POP.TOTL'``).

    Returns
    -------
    pandas.DataFrame
        The index of the fetched series moved into columns by
        ``reset_index``, plus ``'changer'`` (placeholder name for the values,
        renamed by the callers), ``'year'`` and ``'Catégorie'`` (5-year
        period label, ``'1990-1994'`` to ``'2015-2019'``).
    """
    yearly_frames = [
        pd.DataFrame(
            {'changer': wb.get_series(indicator, date=year, id_or_value='id', simplify_index=True)}
        ).assign(year=year)
        for year in range(1995, 2020)
    ]
    df_feature = pd.concat(yearly_frames)

    # Left-closed 5-year bins: [1990, 1995), [1995, 2000), ..., [2015, 2020).
    period_starts = range(1990, 2016, 5)
    df_feature['Catégorie'] = pd.cut(
        df_feature['year'],
        bins=range(1990, 2021, 5),
        labels=[f'{start}-{start + 4}' for start in period_starts],
        right=False,
    )

    return df_feature.reset_index()


In [None]:
# Helper that builds the UCDP conflict tables.

def generate_df2(file_name):
    """Load a conflict CSV and return one labelled row per record from 1990 to 2024.

    Parameters
    ----------
    file_name : str or path-like
        CSV file containing at least the columns ``'location'`` and ``'year'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'location'``, ``'year'``, ``'Catégorie'`` (5-year period
        label, ``'1990-1994'`` to ``'2020-2024'``), ``'Number of conflicts'``
        (always 1) and ``'Conflict in the next 5 years'`` (always ``'yes'``).
        The original row index of the kept records is preserved.
    """
    df = pd.read_csv(file_name)

    in_range = df['year'].between(1990, 2024)
    # .copy() gives an independent frame: the columns added below must not be
    # written into a slice of the raw data (SettingWithCopyWarning).
    df = df.loc[in_range, ['location', 'year']].copy()

    # Left-closed 5-year bins: [1990, 1995), ..., [2020, 2025).
    period_starts = range(1990, 2021, 5)
    df['Catégorie'] = pd.cut(
        df['year'],
        bins=range(1990, 2026, 5),
        labels=[f'{start}-{start + 4}' for start in period_starts],
        right=False,
    )
    df['Number of conflicts'] = 1
    df['Conflict in the next 5 years'] = 'yes'

    return df

### RECHERCHE D'INDICATEURS

In [None]:
# Search the World Bank indicators for the keyword 'race' and show up to 50 matches.
ind = wb.search_indicators('race')
ind.head(n=50)


### ECONOMIC FEATURES

In [None]:
# Total population: indicator SP.POP.TOTL, stored as 'Population_Total'.
df1 = df = generate_df('SP.POP.TOTL').rename(columns={'changer': 'Population_Total'})
df1


In [None]:
# GDP in current US$: indicator NY.GDP.MKTP.CD, stored as 'GDP(current $)'.
df2 = df = generate_df('NY.GDP.MKTP.CD').rename(columns={'changer': 'GDP(current $)'})
df2

In [None]:
# GDP growth: indicator NY.GDP.MKTP.KD.ZG, stored as 'GDP_growth'.
df3 = df = generate_df('NY.GDP.MKTP.KD.ZG').rename(columns={'changer': 'GDP_growth'})
df3

In [None]:
# GDP per capita (PPP): indicator NY.GDP.PCAP.PP.CD, stored as 'GDP_per_capita_PPP'.
df4 = df = generate_df('NY.GDP.PCAP.PP.CD').rename(columns={'changer': 'GDP_per_capita_PPP'})
df4

In [None]:
# GNI in current US$: indicator NY.GNP.ATLS.CD, stored as 'GNI(current $)'.
df5 = df = generate_df('NY.GNP.ATLS.CD').rename(columns={'changer': 'GNI(current $)'})
df5

In [None]:
# Indicator NY.GNP.PCAP.CD, stored as 'PCAP'.
df6 = df = generate_df('NY.GNP.PCAP.CD').rename(columns={'changer': 'PCAP'})
df6

In [None]:
# Combine the six economic indicators side by side.
# Rows are aligned on the key columns instead of on row position, so an
# indicator that comes back with a different set or order of rows cannot be
# paired with the wrong country/year. When every frame has the same keys in
# the same order, the row order is unchanged.
KEY_COLUMNS = ['Country', 'year', 'Catégorie']
df_economy = pd.concat(
    [frame.set_index(KEY_COLUMNS) for frame in (df1, df2, df3, df4, df5, df6)],
    axis=1,
).reset_index()

# Keep only the first occurrence of any repeated column name.
df_economy = df_economy.loc[:, ~df_economy.columns.duplicated()]

df_economy

### SOCIAL FEATURES

In [None]:
# Poverty: indicator SI.POV.UMIC, stored as 'Poverty gap'.
# NOTE(review): confirm that SI.POV.UMIC is a poverty *gap* and not a headcount ratio.
df1 = df = generate_df('SI.POV.UMIC').rename(columns={'changer': 'Poverty gap'})
df1

In [None]:
# Primary completion rate: indicator SE.PRM.CMPT.ZS, stored as 'Primary_completion(rate)'.
df2 = df = generate_df('SE.PRM.CMPT.ZS').rename(columns={'changer': 'Primary_completion(rate)'})
df2

In [None]:
# School enrollment: indicator SE.PRE.ENRR, stored as 'School_enrollement_gross'.
# NOTE(review): the code SE.PRE.ENRR looks like a pre-primary series — confirm it is the intended one.
df3 = df = generate_df('SE.PRE.ENRR').rename(columns={'changer': 'School_enrollement_gross'})
df3

In [None]:
# Hospital beds: indicator SH.MED.BEDS.ZS, stored as 'Hospital_bed(per 1000 people)'.
df4 = df = generate_df('SH.MED.BEDS.ZS').rename(columns={'changer': 'Hospital_bed(per 1000 people)'})
df4

In [None]:
# Infant mortality: indicator SP.DYN.IMRT.IN, stored as 'Mortality_rate(per 1000 births)'.
df5 = df = generate_df('SP.DYN.IMRT.IN').rename(columns={'changer': 'Mortality_rate(per 1000 births)'})
df5

In [None]:
# Mortality rate: indicator SP.DYN.CDRT.IN, stored as 'Mortality_rate(per 1000 people)'.
df6 = df = generate_df('SP.DYN.CDRT.IN').rename(columns={'changer': 'Mortality_rate(per 1000 people)'})
df6

In [None]:

# Population living in slums: indicator EN.POP.SLUM.UR.ZS, stored as 'population_living_in_slum (%)'.
df7 = df = generate_df('EN.POP.SLUM.UR.ZS').rename(columns={'changer': 'population_living_in_slum (%)'})
df7

In [None]:
# Combine the seven social indicators side by side.
# Rows are aligned on the key columns instead of on row position, so an
# indicator that comes back with a different set or order of rows cannot be
# paired with the wrong country/year. When every frame has the same keys in
# the same order, the row order is unchanged.
KEY_COLUMNS = ['Country', 'year', 'Catégorie']
df_social = pd.concat(
    [frame.set_index(KEY_COLUMNS) for frame in (df1, df2, df3, df4, df5, df6, df7)],
    axis=1,
).reset_index()

# Keep only the first occurrence of any repeated column name.
df_social = df_social.loc[:, ~df_social.columns.duplicated()]

df_social

### POLITICAL AND MILITARY FEATURES

In [None]:
# Control of corruption, estimate: indicator CC.EST, stored as 'control_corruption_estimate'.
df1 = df = generate_df('CC.EST').rename(columns={'changer': 'control_corruption_estimate'})
df1

In [None]:
# Control of corruption, rank: indicator CC.PER.RNK, stored as 'control_corruption_rank'.
df2 = df = generate_df('CC.PER.RNK').rename(columns={'changer': 'control_corruption_rank'})
df2

In [None]:
# Armed forces personnel: indicator MS.MIL.TOTL.P1, stored as 'army_weight'.
df3 = df = generate_df('MS.MIL.TOTL.P1').rename(columns={'changer': 'army_weight'})
df3

In [None]:
# Military expenditure as % of GDP: indicator MS.MIL.XPND.GD.ZS, stored as 'army_expenditure(% GDP)'.
df4 = df = generate_df('MS.MIL.XPND.GD.ZS').rename(columns={'changer': 'army_expenditure(% GDP)'})
df4

In [None]:

# Battle-related deaths: indicator VC.BTL.DETH, stored as 'Battle_related(number of death)'.
df5 = df = generate_df('VC.BTL.DETH').rename(columns={'changer': 'Battle_related(number of death)'})
df5

In [None]:

# Homicides: indicator VC.IHR.PSRC.P5, stored as 'Homicide_(per 100K people)'.
df6 = df = generate_df('VC.IHR.PSRC.P5').rename(columns={'changer': 'Homicide_(per 100K people)'})
df6

In [None]:
# Combine the six political/military indicators side by side.
# Rows are aligned on the key columns instead of on row position, so an
# indicator that comes back with a different set or order of rows cannot be
# paired with the wrong country/year. When every frame has the same keys in
# the same order, the row order is unchanged.
KEY_COLUMNS = ['Country', 'year', 'Catégorie']
df_politics = pd.concat(
    [frame.set_index(KEY_COLUMNS) for frame in (df1, df2, df3, df4, df5, df6)],
    axis=1,
).reset_index()

# Keep only the first occurrence of any repeated column name.
df_politics = df_politics.loc[:, ~df_politics.columns.duplicated()]

df_politics

### TERRITORY AND CULTURE FEATURES

In [None]:
! kaggle datasets download -d umichigan/world-religions

In [None]:
# Load the national religion table and discard every row dated before 1995.
df = pd.read_csv('national.csv')

mask = df['year'].lt(1995)

df_culture = df.drop(index=df.index[mask.to_numpy()])

df_culture



In [None]:
# Translate the 'state' names of the religion table into ISO alpha-3 codes.

# Dataset spellings replaced by the names handed to pycountry's fuzzy search.
CULTURE_NAME_FIXES = {
    'Antigua & Barbuda': "Antigua and Barbuda",
    'Cape Verde': "Cabo Verde",
    'Ivory Coast': "Côte d'Ivoire",
    'Democratic Republic of the Congo': "Congo, The Democratic Republic of the",
    'Swaziland': "Eswatini",
    'Laos': "Lao People's Democratic Republic",
    'East Timor': "Timor-Leste",
    'Yugoslavia': "Serbia",
}

# "St." is expanded first; the fixes above are then matched on the expanded name.
pays_lst = [
    CULTURE_NAME_FIXES.get(name, name)
    for name in (raw.replace("St.", "Saint") for raw in df_culture['state'].tolist())
]

# Each distinct name is resolved once and reused for all of its rows (the same
# country appears in several years), instead of one fuzzy search per row.
# 'Niger' skips the fuzzy search and is hard-wired to 'NER'
# (presumably because the first fuzzy match is another country — TODO confirm).
code_by_name = {
    name: 'NER' if name == 'Niger' else pycountry.countries.search_fuzzy(name)[0].alpha_3
    for name in dict.fromkeys(pays_lst)
}

# One code per row of df_culture, in row order.
code_pays = [code_by_name[name] for name in pays_lst]
    
        

        



In [None]:
# Attach the ISO alpha-3 codes computed in the previous cell (one per row).
df_culture = df_culture.assign(code=code_pays)

df_culture

### CONFLICTS FEATURES + LABEL

In [None]:
# Armed conflicts with government involvement (file UcdpPrioConflict_v23_1.csv).
# NOTE(review): the lower-case "yugoslavia" filter may match no rows — confirm the intended spelling.
df1 = df = generate_df2("UcdpPrioConflict_v23_1.csv")
view = df1.loc[df1['location'].eq("yugoslavia")]
view



In [None]:
# Conflicts from the file Nonstate_v23_1.csv; spot-check the rows located in France.
df2 = df = generate_df2("Nonstate_v23_1.csv")
view = df2.loc[df2['location'].eq("France")]
view


In [None]:
# Stack both conflict tables; spot-check the rows located in Ukraine.
df_conflict = pd.concat((df1, df2))
view = df_conflict.loc[df_conflict['location'].eq("Ukraine")]
view

# df_conflict


In [None]:
# Expand multi-country conflicts: a 'location' such as "A, B" becomes one row
# per country. Rows are collected as plain dicts and converted to a DataFrame
# once, instead of building and concatenating one single-row DataFrame per country.
NEW_CONFLICT_COLUMNS = [
    'country',
    'year',
    'Catégorie',
    'Number of conflicts',
    'Conflict in the next 5 years',
]

conflict_rows = [
    {
        'country': location,
        'year': year,
        'Catégorie': category,
        'Number of conflicts': n_conflicts,
        'Conflict in the next 5 years': label,
    }
    for locations, year, category, n_conflicts, label in zip(
        df_conflict['location'],
        df_conflict['year'],
        df_conflict['Catégorie'],
        df_conflict['Number of conflicts'],
        df_conflict['Conflict in the next 5 years'],
    )
    for location in locations.split(', ')
]

# Passing `columns` keeps the expected schema even when there is no conflict row.
new_conflict = pd.DataFrame(conflict_rows, columns=NEW_CONFLICT_COLUMNS)

new_conflict

### Merged Dataframes

### MERGED WB DF

In [None]:
# df_economy.merge(df_social, how="outer", on=["Country", "year"]).merge(df_politics, how="outer", on=["Country", "year"]) 

# If the combined frame built in the next cell looks wrong, use the key-based merge above instead.

In [None]:
# Combine all World Bank feature groups into a single frame.
# Rows are aligned on the key columns instead of on row position, so a group
# with a different set or order of rows cannot be paired with the wrong
# country/year. When every group has the same keys in the same order, the row
# order is unchanged.
KEY_COLUMNS = ['Country', 'year', 'Catégorie']
df_WB = pd.concat(
    [frame.set_index(KEY_COLUMNS) for frame in (df_economy, df_social, df_politics)],
    axis=1,
).reset_index()

# Keep only the first occurrence of any repeated column name.
df_WB = df_WB.loc[:, ~df_WB.columns.duplicated()]

df_WB

In [None]:
# Turn the yearly World Bank frame into the project's table: one row per
# country and 5-year period, written to 'WB_treated.csv'.

KEY_COLUMNS = ['Country', 'year', 'Catégorie']
SUMMED_COLUMN = 'Battle_related(number of death)'

# Battle deaths are totalled over each period; every other indicator is averaged.
agg_operations = {
    c: 'sum' if c == SUMMED_COLUMN else 'mean'
    for c in df_WB.columns
    if c not in KEY_COLUMNS
}

# observed=False states explicitly what the default did when 'Catégorie' is
# categorical: every category is emitted for every country.
WB = (
    df_WB
    .groupby(['Country', 'Catégorie'], observed=False)
    .agg(agg_operations)
    .reset_index()
)

# Drop the '1990-1994' period, then continue on an independent copy so the
# column assignments below do not write into a slice of the grouped frame.
WB = WB[WB['Catégorie'] != "1990-1994"].copy()

# A period is identified by its first year: '1995-1999' -> 1995.
WB['Catégorie'] = WB['Catégorie'].astype(str).str[:4].astype(int)


def _country_name(alpha_3):
    """Return pycountry's name for an alpha-3 code, or 'inconnu' when the lookup returns nothing."""
    country = pycountry.countries.get(alpha_3=alpha_3)
    return country.name if country else 'inconnu'


# One lookup per distinct code rather than one per row.
WB['Country_name'] = WB['Country'].map(
    {code: _country_name(code) for code in WB['Country'].unique()}
)

# Final column selection and order.
OUTPUT_COLUMNS = [
    'Country', 'Country_name', 'Catégorie', 'Population_Total', 'GDP(current $)', 'GDP_growth',
    'GDP_per_capita_PPP', 'GNI(current $)', 'PCAP', 'Poverty gap', 'Primary_completion(rate)',
    'Mortality_rate(per 1000 births)', 'Mortality_rate(per 1000 people)',
    'population_living_in_slum (%)', 'control_corruption_estimate', 'control_corruption_rank',
    'army_weight', 'army_expenditure(% GDP)', 'Battle_related(number of death)',
    'Homicide_(per 100K people)',
]

# Rows whose code pycountry could not name are removed.
WB = (
    WB.loc[WB['Country_name'] != "inconnu", OUTPUT_COLUMNS]
    .rename(columns={'Country': 'code', 'Catégorie': 'year'})
)
WB.to_csv('WB_treated.csv', index=False)

WB






### MERGED KAGGLE DATASET

In [None]:
# Attach the religion features to the World Bank table on (code, year).
# errors='ignore' keeps this cell re-runnable: a second run no longer fails
# because 'state' has already been dropped.
df_culture = df_culture.drop(columns='state', errors='ignore')

merged_1 = pd.merge(WB, df_culture, on=['code', 'year'], how='left')

merged_1

### MERGED UCDP DATAFRAMES

In [None]:
# Count conflicts per country and 5-year period.
conflict = (
    new_conflict
    .groupby(['country', 'Catégorie'])
    .agg({'Number of conflicts': 'sum'})
    .reset_index()
)

# Drop the '1990-1994' period and the groups without any conflict. The explicit
# .copy() makes the result an independent frame, so the column assignments that
# follow do not write into a slice.
keep = (conflict['Catégorie'] != "1990-1994") & ~(conflict['Number of conflicts'] == 0)
conflict = conflict[keep].copy()

# A period is identified by its first year, kept as text here: '1995-1999' -> '1995'.
conflict['Catégorie'] = conflict['Catégorie'].astype(str).str[:4]

conflict['country'].unique()



In [None]:
# Translate the conflict table's country names into ISO alpha-3 codes.

# Source spellings replaced by the names handed to pycountry's fuzzy search.
UCDP_NAME_FIXES = {
    'Bosnia-Herzegovina': "Bosnia and Herzegovina",
    'Cambodia (Kampuchea)': "Cambodia",
    'DR Congo (Zaire)': "Congo, The Democratic Republic of the",
    'Ivory Coast': "Côte d'Ivoire",
    'Madagascar (Malagasy)': "Madagascar",
    'Myanmar (Burma)': "Myanmar",
    'Russia (Soviet Union)': "Russian Federation",
    'Serbia (Yugoslavia)': "Serbia",
    'Yemen (North Yemen)': "Yemen",
}

# "St." is expanded first; the fixes above are then matched on the expanded name.
pays_lst = [
    UCDP_NAME_FIXES.get(name, name)
    for name in (raw.replace("St.", "Saint") for raw in conflict['country'].tolist())
]

# Each distinct name is resolved once and reused for all of its rows (a country
# appears once per period), instead of one fuzzy search per row.
# 'Niger' skips the fuzzy search and is hard-wired to 'NER'
# (presumably because the first fuzzy match is another country — TODO confirm).
code_by_name = {
    name: 'NER' if name == 'Niger' else pycountry.countries.search_fuzzy(name)[0].alpha_3
    for name in dict.fromkeys(pays_lst)
}

# One code per row of `conflict`, in row order.
code_pays = [code_by_name[name] for name in pays_lst]

print(code_pays)

        



In [None]:
# Attach the ISO codes, turn the period label into an integer 'year' and keep
# exactly one row per (code, year).
# Summing per key folds together source names that resolve to the same ISO
# code, so the left merge on ['code', 'year'] that follows cannot duplicate
# rows of the feature table.
conflict = (
    conflict
    .assign(code=code_pays, year=conflict['Catégorie'].astype(int))
    .groupby(['code', 'year'], as_index=False)['Number of conflicts']
    .sum()
)
conflict.info()


In [None]:
# Add the conflict counts to the feature table and save the unlabelled dataset.
merged_final = pd.merge(merged_1, conflict, on=['code', 'year'], how='left')

# Country/period pairs without a matching conflict row count as zero conflicts.
# The result is assigned back to the column: fillna(..., inplace=True) on a
# selected column is chained assignment and does not reliably update the frame.
merged_final['Number of conflicts'] = merged_final['Number of conflicts'].fillna(0)
merged_final = merged_final.drop(columns=['source_code'])
merged_final.to_csv('df_sans_label.csv', index=False)

merged_final




In [None]:
# Number of distinct (non-null) country codes in the final table.
len(merged_final['code'].dropna().unique())

### CREATION DU LABEL

In [None]:

# Label: 1.0 when the same country has at least one conflict in its following
# period, 0.0 otherwise, NaN when no following period exists.
# The look-ahead runs per country code and in year order, so a country's last
# period never reads the first period of the next country in the table.
ordered = merged_final.sort_values(['code', 'year'])
next_period_conflicts = (
    ordered['Number of conflicts'].fillna(0).groupby(ordered['code']).shift(-1)
)

# Built as float from the start so NaN can be stored without changing the
# column dtype; the assignment aligns on the row index, so merged_final keeps
# its row order.
merged_final['conflict (within 5 years)'] = (
    (next_period_conflicts > 0).astype(float).where(next_period_conflicts.notna())
)

# 2015 is the last period of the feature table: its label is left undefined.
merged_final.loc[merged_final['year'] == 2015, 'conflict (within 5 years)'] = np.nan

merged_final.to_csv('features.csv', index=False)


merged_final.head(50)