# Lab: NumPy and Pandas

### 1. Pandas

* Read in file `life_expectancy_data.csv`

In [21]:
import numpy as np
import pandas as pd
import os

In [22]:
life_expectancy_data = pd.read_csv('/Users/Fede/Desktop/PhD - Courses/Python/PythonCamp2024/Day07/Lab/life_expectancy_data.csv')

* Rename columns 

1. Make them all lower case, remove all hyphens, whitespace, and slashes.
2. Make "Income composition of resources" --> "composition", and "BMI" --> "avg_bmi".
3. Change the name of "under-five deaths" to something else of your choosing.

In [23]:
# Colnames to lowercase
life_expectancy_data.columns = life_expectancy_data.columns.str.lower()

# Remove
life_expectancy_data.columns = life_expectancy_data.columns.str.replace(r'[-\s/]', '', regex=True)

# Rename columns
life_expectancy_data.rename(columns={
    'incomecompositionofresources': 'composition',
    'bmi': 'avg_bmi',
    'underfivedeaths': 'child_mortality'
}, inplace=True)

life_expectancy_data.head

<bound method NDFrame.head of           country  year      status  lifeexpectancy  adultmortality  \
0     Afghanistan  2015  Developing            65.0           263.0   
1     Afghanistan  2014  Developing            59.9           271.0   
2     Afghanistan  2013  Developing            59.9           268.0   
3     Afghanistan  2012  Developing            59.5           272.0   
4     Afghanistan  2011  Developing            59.2           275.0   
...           ...   ...         ...             ...             ...   
2933     Zimbabwe  2004  Developing            44.3           723.0   
2934     Zimbabwe  2003  Developing            44.5           715.0   
2935     Zimbabwe  2002  Developing            44.8            73.0   
2936     Zimbabwe  2001  Developing            45.3           686.0   
2937     Zimbabwe  2000  Developing            46.0           665.0   

      infantdeaths  alcohol  percentageexpenditure  hepatitisb  measles  ...  \
0               62     0.01          

* Feature engineering

1. Combine the two "thinness" variables somehow
2. Transform the GDP and Population variables somehow
3. Make country and status variables into dummies. Give them a prefix of 'dum' to keep track of them
4. Combine GDP and one of the expenditure variables into a new variable

In [24]:

# Combine (average)
life_expectancy_data['thinness_combined'] = life_expectancy_data[['thinness119years', 'thinness59years']].mean(axis=1)

# Log tansf 
life_expectancy_data['log_gdp'] = np.log(life_expectancy_data['gdp'])
life_expectancy_data['log_population'] = np.log(life_expectancy_data['population'])

# Making dummies
life_expectancy_data = pd.get_dummies(life_expectancy_data, columns=['country', 'status'], prefix='dum')

# GDP + total expenditures
life_expectancy_data['gdp_expenditure'] = life_expectancy_data['gdp'] * life_expectancy_data['totalexpenditure']


life_expectancy_data.head()


Unnamed: 0,year,lifeexpectancy,adultmortality,infantdeaths,alcohol,percentageexpenditure,hepatitisb,measles,avg_bmi,child_mortality,...,dum_Uzbekistan,dum_Vanuatu,dum_Venezuela (Bolivarian Republic of),dum_Viet Nam,dum_Yemen,dum_Zambia,dum_Zimbabwe,dum_Developed,dum_Developing,gdp_expenditure
0,2015,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,...,False,False,False,False,False,False,False,False,True,4767.555154
1,2014,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,86,...,False,False,False,False,False,False,False,False,True,5011.857485
2,2013,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,89,...,False,False,False,False,False,False,False,False,True,5136.086655
3,2012,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,93,...,False,False,False,False,False,False,False,False,True,5708.05068
4,2011,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,97,...,False,False,False,False,False,False,False,False,True,500.038008


In [25]:
life_expectancy_data[[col for col in life_expectancy_data if col.startswith('dum')]]

Unnamed: 0,dum_Afghanistan,dum_Albania,dum_Algeria,dum_Angola,dum_Antigua and Barbuda,dum_Argentina,dum_Armenia,dum_Australia,dum_Austria,dum_Azerbaijan,...,dum_Uruguay,dum_Uzbekistan,dum_Vanuatu,dum_Venezuela (Bolivarian Republic of),dum_Viet Nam,dum_Yemen,dum_Zambia,dum_Zimbabwe,dum_Developed,dum_Developing
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
2934,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
2935,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
2936,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True


* Summarize data

1. How many observations are there for developing countries?
2. Drop all observations with missing values. How many rows were there before and how many are there now?
3. How old is the oldest observation?

In [51]:

summary = life_expectancy_data.describe(include='all')
print("Summary of the data:\n", summary)

# How many observations are there for developing countries?
developing_count = life_expectancy_data['dum_Developing'].sum()
print("Number of observations for developing countries:", developing_count)

# Drop all observations with missing values. How many rows were there before and how many are there now?
rows_before = life_expectancy_data.shape[0]
life_expectancy_data_cleaned = life_expectancy_data.dropna()

# Count the number of rows after dropping
rows_after = life_expectancy_data_cleaned.shape[0]
print(f"Number of rows before dropping missing values: {rows_before}")
print(f"Number of rows after dropping missing values: {rows_after}")

# How old is the oldest observation?
oldest_observation_year = life_expectancy_data['year'].min()
print("The oldest observation is from the year:", oldest_observation_year)

Summary of the data:
                year  lifeexpectancy  adultmortality  infantdeaths  \
count   2938.000000     2928.000000     2928.000000   2938.000000   
unique          NaN             NaN             NaN           NaN   
top             NaN             NaN             NaN           NaN   
freq            NaN             NaN             NaN           NaN   
mean    2007.518720       69.224932      164.796448     30.303948   
std        4.613841        9.523867      124.292079    117.926501   
min     2000.000000       36.300000        1.000000      0.000000   
25%     2004.000000       63.100000       74.000000      0.000000   
50%     2008.000000       72.100000      144.000000      3.000000   
75%     2012.000000       75.700000      228.000000     22.000000   
max     2015.000000       89.000000      723.000000   1800.000000   

            alcohol  percentageexpenditure   hepatitisb        measles  \
count   2744.000000            2938.000000  2385.000000    2938.000000   
u

* Generate outputs

1. Export the new dataset as a CSV in your lab folder.
2. Keep the following columns in a new dataset called `le2`: all your engineered features plus Life Expectancy, Schooling, GDP, total expenditures, and BMI. Drop everything else.

In [33]:

# Export the new dataset as a CSV
life_expectancy_data.to_csv('/Users/Fede/Desktop/PhD - Courses/Python/PythonCamp2024/Day07/Lab/le_cleaned.csv', index=False)


# Keep only
columns_to_keep = [
    'thinness_combined', 'log_gdp', 'log_population', 'gdp_expenditure', 'schooling',
    'dum_Developing', 'dum_Developed', 'lifeexpectancy', 'gdp', 'totalexpenditure', 'avg_bmi'
]

le2 = life_expectancy_data[columns_to_keep]

# Export le2
le2.to_csv('/Users/Fede/Desktop/PhD - Courses/Python/PythonCamp2024/Day07/Lab/le2.csv', index=False)


### 2. NumPy

* Define the following NumPy arrays:
    1. `X`, containing the `schooling` and `gdp` variables in `le2`
    2. `y`, containing only the life expectancy variable

In [36]:
X = le2[['schooling', 'gdp']].to_numpy()
y = le2['lifeexpectancy'].to_numpy()
print(X)
print(y)

[[ 10.1       584.25921  ]
 [ 10.        612.696514 ]
 [  9.9       631.744976 ]
 ...
 [ 10.         57.34834  ]
 [  9.8       548.587312 ]
 [  9.8       547.3588785]]
[65.  59.9 59.9 ... 44.8 45.3 46. ]


* Regression by hand
    1. Using linear algebra operations, perform OLS to estimate $\beta = (X^TX)^{-1}X^TY$. What is the coefficient estimate for the effect of schooling on life expectancy?

In [43]:
print(np.isnan(X).sum())  # Should be 0
print(np.isnan(y).sum())  # Should be 0

valid_indices = ~np.isnan(X).any(axis=1) & ~np.isnan(y)
X = X[valid_indices]
y = y[valid_indices]

print(np.isnan(X).sum())  # Should be 0
print(np.isnan(y).sum())  # Should be 0


611
10
0
0


In [54]:

XtX = np.dot(X.T, X)
XtX_inv = np.linalg.inv(XtX)
Xty = np.dot(X.T, y)
beta = np.dot(XtX_inv, Xty)

schooling_coefficient = beta[0]
np.round(schooling_coefficient,3)



np.float64(nan)

### 3. Random Forest

* Train a random forest classifier to predict whether a country is "developed" in a given year based on all the other features, including life expectancy.

In [55]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


y = le2['dum_Developed']
X = le2.drop(columns=['dum_Developed', 'dum_Developing'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Output the results
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


Accuracy: 0.9693877551020408
Confusion Matrix:
 [[479  12]
 [  6  91]]
Classification Report:
               precision    recall  f1-score   support

       False       0.99      0.98      0.98       491
        True       0.88      0.94      0.91        97

    accuracy                           0.97       588
   macro avg       0.94      0.96      0.95       588
weighted avg       0.97      0.97      0.97       588



* **Bonus:** Can you figure out how to extract feature importance from the classifier?