In [None]:
import chardet
import pandas as pd


def detect_encoding(file_path):
    """Guess the text encoding of a file using chardet.

    The file is read in full as raw bytes and passed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of chardet's result, returned unchanged.
    """
    with open(file_path, 'rb') as handle:
        payload = handle.read()
    return chardet.detect(payload)['encoding']


# Leader table: its encoding is sniffed first, then used to read the CSV.
file_encoding = detect_encoding('Archigos.csv')
df = pd.read_csv('Archigos.csv', encoding=file_encoding)

# Indicator workbooks, read in order into df1..df7. Going by the file names:
# GDP growth rate, GNI, inflation rate (GDP deflator), under-5 child
# mortality rate, population growth rate, total natural resource rents,
# polity score.
df1, df2, df3, df4, df5, df6, df7 = (
    pd.read_excel(workbook)
    for workbook in (
        '1-GDP增长率.xls',
        '2-GNI.xls',
        '3-通货膨胀率-GDP平减指数.xls',
        '4-5岁以下儿童死亡率.xls',
        '5-人口增长率.xls',
        '6-自然资源租金总额.xls',
        '7-政体得分.xlsx',
    )
)

In [None]:
import numpy as np
from datetime import datetime

# Calendar years in which each leader's spell starts and ends. The guards
# skip the computation when the columns already exist.
if 'start_year' not in df.columns:
    df['start_year'] = pd.to_datetime(df['startdate']).dt.year
if 'end_year' not in df.columns:
    df['end_year'] = pd.to_datetime(df['enddate']).dt.year

# A year column is one whose label is a four-digit number. Labels go through
# str() first: read_excel returns numeric header cells as ints, and calling
# .isdigit() on an int raises AttributeError.
year_columns = [
    col for col in df1.columns
    if len(str(col)) == 4 and str(col).isdigit()
]

# Wide (one column per year) -> long (one row per country and year).
gdp_long = df1.melt(
    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
    value_vars=year_columns,
    var_name='Year',
    value_name='GDP_growth'
)

gdp_long['Year'] = gdp_long['Year'].astype(int)

# 'Country Code' is later compared with df['idacr'], so GBR is renamed to UKG.
# NOTE(review): only GBR is remapped; any other country whose code differs
# between the two sources will silently fail to match - confirm coverage.
gdp_long['Country Code'] = gdp_long['Country Code'].replace({'GBR': 'UKG'})

def calculate_avg_gdp(row, gdp_data):
    """Average GDP growth for one leader over the years in office.

    Args:
        row: Mapping-like record providing 'idacr', 'start_year' and
            'end_year'.
        gdp_data: Long-format frame with 'Country Code', 'Year' and
            'GDP_growth' columns.

    Returns:
        Mean of 'GDP_growth' over rows whose country code equals
        ``row['idacr']`` and whose year lies in [start_year, end_year]
        (both ends inclusive); NaN when no row matches or every matching
        value is missing.
    """
    same_country = gdp_data['Country Code'].eq(row['idacr'])
    in_office = gdp_data['Year'].between(row['start_year'], row['end_year'])
    growth = gdp_data.loc[same_country & in_office, 'GDP_growth']

    if growth.empty or growth.isna().all():
        return np.nan
    return growth.mean()

# One average per leader row; gdp_long is forwarded to the helper as its
# second positional argument.
df['avg_gdp_growth'] = df.apply(calculate_avg_gdp, axis=1, args=(gdp_long,))

In [None]:
# Calendar years in which each leader's spell starts and ends. The guards
# skip the computation when the columns already exist.
if 'start_year' not in df.columns:
    df['start_year'] = pd.to_datetime(df['startdate']).dt.year
if 'end_year' not in df.columns:
    df['end_year'] = pd.to_datetime(df['enddate']).dt.year

# A year column is one whose label is a four-digit number. Labels go through
# str() first: read_excel returns numeric header cells as ints, and calling
# .isdigit() on an int raises AttributeError.
year_columns = [
    col for col in df2.columns
    if len(str(col)) == 4 and str(col).isdigit()
]

# Wide (one column per year) -> long (one row per country and year).
gni_long = df2.melt(
    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
    value_vars=year_columns,
    var_name='Year',
    value_name='GNI_per_capita'
)

gni_long['Year'] = gni_long['Year'].astype(int)

# 'Country Code' is later compared with df['idacr'], so GBR is renamed to UKG.
# NOTE(review): only GBR is remapped; any other country whose code differs
# between the two sources will silently fail to match - confirm coverage.
gni_long['Country Code'] = gni_long['Country Code'].replace({'GBR': 'UKG'})

def calculate_avg_gni(row, gni_data):
    """Average GNI per capita for one leader over the years in office.

    Args:
        row: Mapping-like record providing 'idacr', 'start_year' and
            'end_year'.
        gni_data: Long-format frame with 'Country Code', 'Year' and
            'GNI_per_capita' columns.

    Returns:
        Mean of 'GNI_per_capita' over rows whose country code equals
        ``row['idacr']`` and whose year lies in [start_year, end_year]
        (both ends inclusive); NaN when no row matches or every matching
        value is missing.
    """
    same_country = gni_data['Country Code'].eq(row['idacr'])
    in_office = gni_data['Year'].between(row['start_year'], row['end_year'])
    income = gni_data.loc[same_country & in_office, 'GNI_per_capita']

    if income.empty or income.isna().all():
        return np.nan
    return income.mean()

# One average per leader row; gni_long is forwarded to the helper as its
# second positional argument.
df['avg_gni_per_capita'] = df.apply(calculate_avg_gni, axis=1, args=(gni_long,))

In [None]:
# Calendar years in which each leader's spell starts and ends. The guards
# skip the computation when the columns already exist.
if 'start_year' not in df.columns:
    df['start_year'] = pd.to_datetime(df['startdate']).dt.year
if 'end_year' not in df.columns:
    df['end_year'] = pd.to_datetime(df['enddate']).dt.year

# A year column is one whose label is a four-digit number. Labels go through
# str() first: read_excel returns numeric header cells as ints, and calling
# .isdigit() on an int raises AttributeError.
year_columns = [
    col for col in df3.columns
    if len(str(col)) == 4 and str(col).isdigit()
]

# Wide (one column per year) -> long (one row per country and year).
inflation_long = df3.melt(
    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
    value_vars=year_columns,
    var_name='Year',
    value_name='Inflation_rate'
)

inflation_long['Year'] = inflation_long['Year'].astype(int)

# 'Country Code' is later compared with df['idacr'], so GBR is renamed to UKG.
# NOTE(review): only GBR is remapped; any other country whose code differs
# between the two sources will silently fail to match - confirm coverage.
inflation_long['Country Code'] = inflation_long['Country Code'].replace({'GBR': 'UKG'})

def calculate_avg_inflation(row, inflation_data):
    """Average inflation rate for one leader over the years in office.

    Args:
        row: Mapping-like record providing 'idacr', 'start_year' and
            'end_year'.
        inflation_data: Long-format frame with 'Country Code', 'Year' and
            'Inflation_rate' columns.

    Returns:
        Mean of 'Inflation_rate' over rows whose country code equals
        ``row['idacr']`` and whose year lies in [start_year, end_year]
        (both ends inclusive); NaN when no row matches or every matching
        value is missing.
    """
    same_country = inflation_data['Country Code'].eq(row['idacr'])
    in_office = inflation_data['Year'].between(row['start_year'], row['end_year'])
    rates = inflation_data.loc[same_country & in_office, 'Inflation_rate']

    if rates.empty or rates.isna().all():
        return np.nan
    return rates.mean()

# One average per leader row; inflation_long is forwarded to the helper as
# its second positional argument.
df['avg_inflation_rate'] = df.apply(calculate_avg_inflation, axis=1, args=(inflation_long,))

In [None]:
# Calendar years in which each leader's spell starts and ends. The guards
# skip the computation when the columns already exist.
if 'start_year' not in df.columns:
    df['start_year'] = pd.to_datetime(df['startdate']).dt.year
if 'end_year' not in df.columns:
    df['end_year'] = pd.to_datetime(df['enddate']).dt.year

# A year column is one whose label is a four-digit number. Labels go through
# str() first: read_excel returns numeric header cells as ints, and calling
# .isdigit() on an int raises AttributeError.
year_columns = [
    col for col in df4.columns
    if len(str(col)) == 4 and str(col).isdigit()
]

# Wide (one column per year) -> long (one row per country and year).
child_mortality_long = df4.melt(
    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
    value_vars=year_columns,
    var_name='Year',
    value_name='Child_mortality_rate'
)

child_mortality_long['Year'] = child_mortality_long['Year'].astype(int)

# 'Country Code' is later compared with df['idacr'], so GBR is renamed to UKG.
# NOTE(review): only GBR is remapped; any other country whose code differs
# between the two sources will silently fail to match - confirm coverage.
child_mortality_long['Country Code'] = child_mortality_long['Country Code'].replace({'GBR': 'UKG'})

def calculate_avg_child_mortality(row, child_mortality_data):
    """Average child mortality rate for one leader over the years in office.

    Args:
        row: Mapping-like record providing 'idacr', 'start_year' and
            'end_year'.
        child_mortality_data: Long-format frame with 'Country Code', 'Year'
            and 'Child_mortality_rate' columns.

    Returns:
        Mean of 'Child_mortality_rate' over rows whose country code equals
        ``row['idacr']`` and whose year lies in [start_year, end_year]
        (both ends inclusive); NaN when no row matches or every matching
        value is missing.
    """
    same_country = child_mortality_data['Country Code'].eq(row['idacr'])
    in_office = child_mortality_data['Year'].between(row['start_year'], row['end_year'])
    mortality = child_mortality_data.loc[same_country & in_office, 'Child_mortality_rate']

    if mortality.empty or mortality.isna().all():
        return np.nan
    return mortality.mean()

# One average per leader row; child_mortality_long is forwarded to the helper
# as its second positional argument.
df['avg_child_mortality_rate'] = df.apply(
    calculate_avg_child_mortality, axis=1, args=(child_mortality_long,)
)


In [None]:
# Calendar years in which each leader's spell starts and ends. The guards
# skip the computation when the columns already exist.
if 'start_year' not in df.columns:
    df['start_year'] = pd.to_datetime(df['startdate']).dt.year
if 'end_year' not in df.columns:
    df['end_year'] = pd.to_datetime(df['enddate']).dt.year

# A year column is one whose label is a four-digit number. Labels go through
# str() first: read_excel returns numeric header cells as ints, and calling
# .isdigit() on an int raises AttributeError.
year_columns = [
    col for col in df5.columns
    if len(str(col)) == 4 and str(col).isdigit()
]

# Wide (one column per year) -> long (one row per country and year).
population_growth_long = df5.melt(
    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
    value_vars=year_columns,
    var_name='Year',
    value_name='Population_growth_rate'
)

population_growth_long['Year'] = population_growth_long['Year'].astype(int)

# 'Country Code' is later compared with df['idacr'], so GBR is renamed to UKG.
# NOTE(review): only GBR is remapped; any other country whose code differs
# between the two sources will silently fail to match - confirm coverage.
population_growth_long['Country Code'] = population_growth_long['Country Code'].replace({'GBR': 'UKG'})

def calculate_avg_population_growth(row, population_data):
    """Average population growth rate for one leader over the years in office.

    Args:
        row: Mapping-like record providing 'idacr', 'start_year' and
            'end_year'.
        population_data: Long-format frame with 'Country Code', 'Year' and
            'Population_growth_rate' columns.

    Returns:
        Mean of 'Population_growth_rate' over rows whose country code equals
        ``row['idacr']`` and whose year lies in [start_year, end_year]
        (both ends inclusive); NaN when no row matches or every matching
        value is missing.
    """
    same_country = population_data['Country Code'].eq(row['idacr'])
    in_office = population_data['Year'].between(row['start_year'], row['end_year'])
    growth = population_data.loc[same_country & in_office, 'Population_growth_rate']

    if growth.empty or growth.isna().all():
        return np.nan
    return growth.mean()

# One average per leader row; population_growth_long is forwarded to the
# helper as its second positional argument.
df['avg_population_growth_rate'] = df.apply(
    calculate_avg_population_growth, axis=1, args=(population_growth_long,)
)


In [None]:
# Attach the polity_mean column stored in With_polity.csv to each leader record.
df_policy=pd.read_csv("With_polity.csv")
# obsid -> polity_mean lookup built from that file.
policy_mapping = df_policy.set_index('obsid')['polity_mean'].to_dict()

# Rows whose obsid is absent from the lookup get NaN.
df['polity_mean'] = df['obsid'].map(policy_mapping)
# Snapshot of the frame so far; the row index is written as well because
# index=False is not passed.
# NOTE(review): this file name is spelled "archigo", while the later export
# uses "archigos" - confirm the difference is intended.
df.to_csv("archigo 7.0.csv")

In [None]:
import pandas as pd

# Source table for the colonial-history columns; process_colonial_data reads
# its statehab, statelab, begin and end columns.
df_colony = pd.read_csv("contcol.csv")

def process_colonial_data(df_colony, min_duration=10):
    """Build per-country lookups describing the latest long colonial period.

    Rows with a missing 'begin' or 'end' are ignored. Of the remaining rows,
    only those lasting at least ``min_duration`` years (end - begin) are kept.
    For each 'statehab' value the row with the largest end year wins.

    Args:
        df_colony: Frame with 'statehab' (used as the colonized country),
            'statelab' (used as the colonial power), 'begin' and 'end'.
        min_duration: Minimum value of end - begin for a period to count.
            Defaults to 10, the threshold that was previously hard-coded.

    Returns:
        Three dicts keyed by colonized country: colonial power, begin year
        and end year. All three are empty when no period qualifies.
    """
    complete = df_colony.dropna(subset=['begin', 'end'])
    if complete.empty:
        return {}, {}, {}

    # Same truncation to whole years that int() applied row by row.
    periods = pd.DataFrame({
        'colonized_country': complete['statehab'],
        'colonial_power': complete['statelab'],
        'begin_year': complete['begin'].astype(int),
        'end_year': complete['end'].astype(int),
    })
    periods['duration'] = periods['end_year'] - periods['begin_year']
    periods = periods[periods['duration'] >= min_duration]
    if periods.empty:
        return {}, {}, {}

    # Latest end year first within each country, then keep that first row.
    latest = (
        periods
        .sort_values(['colonized_country', 'end_year'], ascending=[True, False])
        .drop_duplicates('colonized_country', keep='first')
        .set_index('colonized_country')
    )

    return (
        latest['colonial_power'].to_dict(),
        latest['begin_year'].to_dict(),
        latest['end_year'].to_dict(),
    )

def add_colonial_status(df, df_colony):
    """Add colonial-history columns to ``df``, matched on 'idacr'.

    The lookups come from ``process_colonial_data(df_colony)``. Columns set
    on ``df`` (in place): 'was_colonized' (bool), 'colonial_power',
    'colonial_begin', 'colonial_end' and 'colonial_duration'. Countries with
    no entry get an empty string in the last four columns.

    Args:
        df: Frame with an 'idacr' column; modified in place.
        df_colony: Frame passed through to ``process_colonial_data``.

    Returns:
        The same ``df`` object.
    """
    power_mapping, begin_mapping, end_mapping = process_colonial_data(df_colony)
    codes = df['idacr']

    df['was_colonized'] = codes.isin(power_mapping.keys())
    for column, lookup in (
        ('colonial_power', power_mapping),
        ('colonial_begin', begin_mapping),
        ('colonial_end', end_mapping),
    ):
        df[column] = codes.map(lookup).fillna('')

    # Duration is stored as text: the year count followed by the "年" suffix.
    colonial_durations = {
        country: f"{end_mapping[country] - begin_mapping[country]}年"
        for country in power_mapping
        if country in begin_mapping and country in end_mapping
    }
    df['colonial_duration'] = codes.map(colonial_durations).fillna('')

    return df

# Adds was_colonized, colonial_power, colonial_begin, colonial_end and
# colonial_duration to the leader table.
df = add_colonial_status(df, df_colony)

In [None]:
def add_tenure_duration(df):
    """Add tenure length columns to ``df`` in place.

    'startdate' and 'enddate' are converted to datetimes (overwriting the
    original columns). 'tenure_days' is the whole-day difference between
    them, and 'tenure_years' is that figure divided by 365.25.

    Args:
        df: Frame with 'startdate' and 'enddate' columns; modified in place.

    Returns:
        The same ``df`` object, so that ``df = add_tenure_duration(df)``
        works the same way as ``add_colonial_status``.
    """
    df['startdate'] = pd.to_datetime(df['startdate'])
    df['enddate'] = pd.to_datetime(df['enddate'])

    tenure = df['enddate'] - df['startdate']
    df['tenure_days'] = tenure.dt.days
    # 365.25 is the average year length once leap years are averaged in.
    df['tenure_years'] = df['tenure_days'] / 365.25

    return df

# Adds tenure_days and tenure_years to df in place, then writes the full
# table out (row index included, since index=False is not passed).
add_tenure_duration(df)
df.to_csv("archigos 8.0.csv")