# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import plotly.graph_objects as go


# Load Data Set

In [2]:
def read_csv(file_path):
    """Load a comma-separated file into a DataFrame, print it, and return it.

    The first three lines of the file are skipped before the header row
    is read.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    frame = pd.read_csv(file_path, sep=',', skiprows=3)
    print(frame)
    return frame
# Population CSV (indicator SP.POP.TOTL); the path is relative to the
# notebook's working directory.
POPULATION_CSV = "API_SP.POP.TOTL_DS2_en_csv_v2_85.csv"
df = read_csv(POPULATION_CSV)

                    Country Name Country Code     Indicator Name  \
0                          Aruba          ABW  Population, total   
1    Africa Eastern and Southern          AFE  Population, total   
2                    Afghanistan          AFG  Population, total   
3     Africa Western and Central          AFW  Population, total   
4                         Angola          AGO  Population, total   
..                           ...          ...                ...   
261                       Kosovo          XKX  Population, total   
262                  Yemen, Rep.          YEM  Population, total   
263                 South Africa          ZAF  Population, total   
264                       Zambia          ZMB  Population, total   
265                     Zimbabwe          ZWE  Population, total   

    Indicator Code         1960         1961         1962         1963  \
0      SP.POP.TOTL      54608.0      55811.0      56682.0      57475.0   
1      SP.POP.TOTL  130692579.0  13

# Column Classification 

In [4]:
def classify_columns(data_frame):
    """Split a frame's column names into categorical and continuous groups.

    Columns selected by dtype ``object`` are reported as categorical;
    columns selected by dtype ``int64`` or ``float64`` are reported as
    continuous. Columns of any other dtype appear in neither list.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list)
        ``(categorical_columns, continuous_columns)``, each in frame order.
    """
    object_names = data_frame.select_dtypes(include=['object']).columns
    number_names = data_frame.select_dtypes(include=['int64', 'float64']).columns

    continuous = []
    for name in number_names:
        # Keep only numeric names that were not also picked up as categorical.
        if name not in object_names:
            continuous.append(name)

    return list(object_names), continuous
# Group the loaded frame's columns by dtype and report both groups.
categorical_columns, continuous_columns = classify_columns(df)
print(f"Categorical Columns: {categorical_columns}")
print(f"Continuous Columns: {continuous_columns}")


Categorical Columns: ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
Continuous Columns: ['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', 'Unnamed: 67']


# Preprocessing

In [4]:
def preprocess_categorical_data(data_frame, column_name):
    """Fill missing entries of one column with its most frequent value.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str
        Column to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``data_frame`` object, with the column overwritten.
    """
    mode_filler = SimpleImputer(strategy='most_frequent')
    # The imputer expects 2-D input, hence the double-bracket selection.
    single_column = data_frame[[column_name]]
    # Flatten the (n_rows, 1) result back to 1-D before assigning it.
    data_frame[column_name] = mode_filler.fit_transform(single_column).ravel()
    return data_frame

def preprocess_numeric_data(data_frame, column_name):
    """Median-impute and standardize one numeric column of ``data_frame``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str
        Column to process.

    Returns
    -------
    pandas.DataFrame
        The same ``data_frame`` object. Columns of dtype ``object`` and
        columns with no observed values are returned untouched.
    """
    column = data_frame[column_name]
    if column.dtype == 'object':
        return data_frame

    # SimpleImputer discards a feature that has no observed values, which
    # would yield a zero-width array that cannot be assigned back. Such a
    # column (presumably an empty trailing "Unnamed: ..." column from the
    # CSV -- TODO confirm) has nothing to impute or scale, so skip it.
    if not column.notna().any():
        return data_frame

    imputed = SimpleImputer(strategy='median').fit_transform(data_frame[[column_name]])
    scaled = StandardScaler().fit_transform(imputed)
    # Flatten the (n_rows, 1) result so a 1-D column is assigned, as the
    # categorical preprocessing does.
    data_frame[column_name] = scaled.ravel()
    return data_frame

# Work on a copy so the frame bound to `df` is not modified in place by the
# preprocessing helpers (they overwrite columns of the frame they receive).
dataset = df.copy()

# Fill gaps in the text columns with each column's most frequent value.
for column in categorical_columns:
    dataset = preprocess_categorical_data(dataset, column)

# Median-impute and standardize every numeric column.
for column in continuous_columns:
    dataset = preprocess_numeric_data(dataset, column)

print(dataset)

                    Country Name Country Code     Indicator Name  \
0                          Aruba          ABW  Population, total   
1    Africa Eastern and Southern          AFE  Population, total   
2                    Afghanistan          AFG  Population, total   
3     Africa Western and Central          AFW  Population, total   
4                         Angola          AGO  Population, total   
..                           ...          ...                ...   
261                       Kosovo          XKX  Population, total   
262                  Yemen, Rep.          YEM  Population, total   
263                 South Africa          ZAF  Population, total   
264                       Zambia          ZMB  Population, total   
265                     Zimbabwe          ZWE  Population, total   

    Indicator Code      1960      1961      1962      1963      1964  \
0      SP.POP.TOTL -0.316561 -0.317007 -0.317103 -0.316893 -0.316714   
1      SP.POP.TOTL  0.038834  0.043410 



# Feature Selection

In [20]:
# Descriptive (non-numeric) columns of the population table.
categorical_columns = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']

# Yearly columns, assumed to be labelled '1960' through '2022' inclusive.
continuous_columns = list(map(str, range(1960, 2023)))

# Show the selected column groups.
print(f"Categorical Columns: {categorical_columns}")
print(f"Continuous Columns: {continuous_columns}")

Categorical Columns: ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
Continuous Columns: ['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']


# Outliers

In [7]:
def remove_numeric_outliers(df, columns, multiplier=1):
    """Drop rows that fall outside an IQR fence in any of ``columns``.

    Columns are processed one after another: for each column the fence
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` is computed from the
    rows that survived the previous columns, and only rows inside the fence
    (bounds included) are kept. Rows holding NaN in a processed column are
    dropped too, because NaN fails both bound comparisons.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Numeric columns to filter on, in processing order.
    multiplier : float, default 1
        Width of the fence in units of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping their original index labels.
    """
    kept = df
    for name in columns:
        values = kept[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        floor = first_quartile - multiplier * spread
        ceiling = third_quartile + multiplier * spread
        kept = kept[values.between(floor, ceiling)]
    return kept
# Filter the preprocessed frame on every year column. Note that this rebinds
# `df`: from here on it refers to the filtered, preprocessed rows.
df = remove_numeric_outliers(df=dataset, columns=continuous_columns)
print(df)


               Country Name Country Code     Indicator Name Indicator Code  \
0                     Aruba          ABW  Population, total    SP.POP.TOTL   
5                   Albania          ALB  Population, total    SP.POP.TOTL   
6                   Andorra          AND  Population, total    SP.POP.TOTL   
10                  Armenia          ARM  Population, total    SP.POP.TOTL   
11           American Samoa          ASM  Population, total    SP.POP.TOTL   
..                      ...          ...                ...            ...   
255  British Virgin Islands          VGB  Population, total    SP.POP.TOTL   
256   Virgin Islands (U.S.)          VIR  Population, total    SP.POP.TOTL   
258                 Vanuatu          VUT  Population, total    SP.POP.TOTL   
260                   Samoa          WSM  Population, total    SP.POP.TOTL   
261                  Kosovo          XKX  Population, total    SP.POP.TOTL   

         1960      1961      1962      1963      1964      1965

# Line Graph

In [43]:
# Countries drawn per figure, and how far to advance between figures.
LINE_BATCH_SIZE = 5
# NOTE(review): the stride is larger than the batch size, so 5 of every 10
# countries are never plotted. Set it equal to LINE_BATCH_SIZE to cover every
# country -- confirm whether this sampling is intended.
LINE_BATCH_STRIDE = 10

line_countries = df['Country Name'].unique()
country_batches = [line_countries[i:i + LINE_BATCH_SIZE]
                   for i in range(0, len(line_countries), LINE_BATCH_STRIDE)]

for batch in country_batches:
    traces = []
    for country in batch:
        # Pick the year columns by name so y always lines up one-to-one with
        # the x labels, regardless of any extra trailing columns in the frame.
        country_rows = df.loc[df['Country Name'] == country, continuous_columns]
        population_data = country_rows.iloc[0].tolist()
        traces.append(go.Scatter(x=continuous_columns, y=population_data,
                                 mode='lines', name=country))

    # The values in `df` were standardized upstream and are frequently
    # negative; a log axis cannot display non-positive values, so the y-axis
    # is linear and labelled as standardized.
    layout = go.Layout(title='Population Distribution Over Time',
                       xaxis=dict(title='Year'),
                       yaxis=dict(title='Population (standardized)'),
                       margin=dict(b=50))

    fig = go.Figure(data=traces, layout=layout)
    fig.show()


# Bar Chart

In [44]:
# Countries drawn per figure, and how far to advance between figures.
BAR_CHUNK_SIZE = 3
# NOTE(review): the stride is larger than the chunk size, so 7 of every 10
# countries are never plotted. Set it equal to BAR_CHUNK_SIZE to cover every
# country -- confirm whether this sampling is intended.
BAR_CHUNK_STRIDE = 10

countries = df['Country Name'].unique()
chunks = [countries[i:i + BAR_CHUNK_SIZE]
          for i in range(0, len(countries), BAR_CHUNK_STRIDE)]

for chunk in chunks:
    traces = []
    for country in chunk:
        # Pick the year columns by name so y always lines up one-to-one with
        # the x labels, regardless of any extra trailing columns in the frame.
        country_rows = df.loc[df['Country Name'] == country, continuous_columns]
        population_data = country_rows.iloc[0].tolist()
        traces.append(go.Bar(x=continuous_columns, y=population_data, name=country))

    # The values in `df` were standardized upstream and are frequently
    # negative: a log axis cannot display them, and stacking mixed-sign
    # z-scores has no meaningful total. Use a linear axis with side-by-side
    # bars so each country can be read on its own.
    layout = go.Layout(title='Population Distribution Over Time',
                       xaxis=dict(title='Year'),
                       yaxis=dict(title='Population (standardized)'),
                       barmode='group')
    fig = go.Figure(data=traces, layout=layout)
    fig.show()
