In [1]:
# Andres Melendez
# This script loads the WHO dataset, retrieves information about it.

import pandas as pd 

file_path = '../data/Mod4/WHO_first9cols.csv' # Path to the WHO dataset

try:
    # Load the data
    who_data = pd.read_csv(file_path)
        
    # Number of rows in the dataset
    num_rows = who_data.shape[0]
        
    # Column headers
    column_headers = who_data.columns
        
    # Data types of each column
    data_types = who_data.dtypes
        
    # Index information
    index_info = who_data.index

    

except FileNotFoundError as e:
    # Catch any error that occurs and display it
    print(f"Error: {e}")
except Exception as e:
    # Catch any error that occurs and display it
    print(f"An unexpected error occurred: {e}")
    
        
# Print the first 5 rows of the dataset
print(f'WHO Data (First 5 rows): \n {who_data.head()}')



WHO Data (First 5 rows): 
        Country  CountryID  Continent  Adolescent fertility rate (%)  \
0  Afghanistan          1          1                          151.0   
1      Albania          2          2                           27.0   
2      Algeria          3          3                            6.0   
3      Andorra          4          2                            NaN   
4       Angola          5          3                          146.0   

   Adult literacy rate (%)  \
0                     28.0   
1                     98.7   
2                     69.9   
3                      NaN   
4                     67.4   

   Gross national income per capita (PPP international $)  \
0                                                NaN        
1                                             6000.0        
2                                             5940.0        
3                                                NaN        
4                                             3890.0        

In [2]:
# Display key information about the dataset
print(f"Number of rows: {num_rows}")
print(f"\nColumn headers: {', '.join(column_headers)}")
print(f"\nData types:\n{data_types}")
print(f"\nIndex information: {index_info}")


Number of rows: 202

Column headers: Country, CountryID, Continent, Adolescent fertility rate (%), Adult literacy rate (%), Gross national income per capita (PPP international $), Net primary school enrolment ratio female (%), Net primary school enrolment ratio male (%), Population (in thousands) total

Data types:
Country                                                    object
CountryID                                                   int64
Continent                                                   int64
Adolescent fertility rate (%)                             float64
Adult literacy rate (%)                                   float64
Gross national income per capita (PPP international $)    float64
Net primary school enrolment ratio female (%)             float64
Net primary school enrolment ratio male (%)               float64
Population (in thousands) total                           float64
dtype: object

Index information: RangeIndex(start=0, stop=202, step=1)


In [3]:
# This script processes the "Country" column from the WHO dataset and provides detailed information on the column.
# Select the "Country" column
country_series = who_data['Country']

# Retrieve information about the column
country_dtype = country_series.dtype
country_shape = country_series.shape
country_index = country_series.index
country_values = country_series.values
country_name = country_series.name

# Display the results using f-strings
print(f'Country Data Type: {country_dtype}')
print(f'\nShape: {country_shape}') 
print(f'\nIndex: {country_index}')
print(f'\nValues: {country_values}')
print(f'\nName: {country_name}')


Country Data Type: object

Shape: (202,)

Index: RangeIndex(start=0, stop=202, step=1)

Values: ['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei Darussalam'
 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada'
 'Cape Verde' 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia'
 'Comoros' 'Congo, Dem. Rep.' 'Congo, Rep.' 'Cook Islands' 'Costa Rica'
 "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic' 'Denmark'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji'
 'Finland' 'France' 'French Polynesia' 'Gabon' 'Gambia' 'Georgia'
 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau'
 'Guyana' 'Haiti' 'Honduras' 'H

In [4]:
# This script retrieves data from the Quandl API (QDL/OPEC dataset), processes it, and performs various queries.
import quandl

# Function to retrieve and process the data from Quandl
def retrieve_opec_data():
    """
    Retrieves OPEC crude oil data from the Quandl API and performs the following:
    - Print the first and last few rows
    - Query the last value using the last date
    - Query data by specific date (YYYYMMDD format)
    - Query rows where 'Value' is greater than the mean crude oil price
    """
    try:
        # Set the Quandl API key
        quandl.ApiConfig.api_key = 'AFTHA5X_kPNRX2-Y1n-t'

        # Pull the data from the QDL/OPEC dataset
        data = quandl.get_table('QDL/OPEC')

        # Print the first and last few rows
        print("Head of the data:")
        print(data.head())

        print("\nTail of the data:")
        print(data.tail())

        # Query the last value using the last date
        last_date = data.index.max()
        last_value = data.loc[last_date]
        print(f"\nLast value on {last_date}:")
        print(last_value)

        # Query using a date string in the YYYYMMDD format
        date_str = '20231231'  # Example future date
        formatted_date = pd.to_datetime(date_str, format='%Y%m%d')
        if formatted_date in data.index:
            value_on_date = data.loc[formatted_date]
            print(f"\nValue on {formatted_date}:")
            print(value_on_date)
        else:
            print(f"\nNo data available for {formatted_date}")

        # Query with a Boolean: where 'Value' (representing crude oil price) is greater than the mean
        mean_value = data['value'].mean()
        greater_than_mean_value = data[data['value'] > mean_value]
        print("\nRows where crude oil price is greater than the mean:")
        print(greater_than_mean_value)

    except Exception as e:
        # Catch any error that occurs and display it
        print(f"An error occurred: {e}")



In [5]:
# Call the function to execute the code
retrieve_opec_data()




Head of the data:
           date  value
None                  
0    2024-01-25  81.98
1    2024-01-24  81.05
2    2024-01-23  81.30
3    2024-01-22  79.70
4    2024-01-19  80.27

Tail of the data:
           date  value
None                  
5430 2003-01-08  28.86
5431 2003-01-07  29.72
5432 2003-01-06  30.71
5433 2003-01-03  30.83
5434 2003-01-02  30.05

Last value on 5434:
date     2003-01-02 00:00:00
value                  30.05
Name: 5434, dtype: object

No data available for 2023-12-31 00:00:00

Rows where crude oil price is greater than the mean:
           date  value
None                  
0    2024-01-25  81.98
1    2024-01-24  81.05
2    2024-01-23  81.30
3    2024-01-22  79.70
4    2024-01-19  80.27
...         ...    ...
4510 2006-08-01  69.97
4520 2006-07-18  70.29
4521 2006-07-17  71.40
4522 2006-07-14  71.71
4523 2006-07-13  70.27

[2513 rows x 2 columns]


In [6]:
# This script processes OPEC crude oil data from Quandl, removing NaN values and calculating various statistics.
# Function to retrieve and process OPEC crude oil data from Quandl
def retrieve_and_analyze_opec_data():
    """
    Retrieves OPEC crude oil data from the Quandl API, filters out NaN values, and prints the following statistics:
    - Describe function
    - Count of observations
    - Mean absolute deviation (mad)
    - Mean
    - Median
    - Maximum value
    - Minimum value
    - Mode
    - Standard deviation
    - Variance
    - Skewness
    """
    try:
        # Set the Quandl API key
        quandl.ApiConfig.api_key = 'AFTHA5X_kPNRX2-Y1n-t'

        # Pull the data from the QDL/OPEC dataset
        data = quandl.get_table('QDL/OPEC')

        # Drop rows with NaN values
        data_clean = data.dropna()

        # Filter only numeric columns
        numeric_data = data_clean.select_dtypes(include='number')

        # Print the result of the describe function
        print("Describe function output:")
        print(numeric_data.describe())

        # Print the count of observations
        print("\nCount of observations:")
        print(numeric_data.count())

        # Calculate and print the Mean Absolute Deviation (MAD)
        print("\nMean Absolute Deviation (MAD):")
        mad = numeric_data.sub(numeric_data.mean()).abs().mean()
        print(mad)

        # Print the mean
        print("\nMean:")
        print(numeric_data.mean())

        # Print the median
        print("\nMedian:")
        print(numeric_data.median())

        # Print the maximum value
        print("\nMax:")
        print(numeric_data.max())

        # Print the minimum value
        print("\nMin:")
        print(numeric_data.min())

        # Print the mode
        print("\nMode:")
        print(numeric_data.mode().iloc[0])  # Taking the first mode value for each column

        # Print the standard deviation
        print("\nStandard Deviation:")
        print(numeric_data.std())

        # Print the variance
        print("\nVariance:")
        print(numeric_data.var())

        # Print the skewness
        print("\nSkewness:")
        print(numeric_data.skew())

    except Exception as e:
        print(f"An error occurred: {e}")

# Call the function to execute the code
retrieve_and_analyze_opec_data()


Describe function output:
             value
count  5435.000000
mean     69.908598
std      26.418165
min      12.220000
25%      49.490000
50%      67.000000
75%      90.910000
max     140.730000

Count of observations:
value    5435
dtype: int64

Mean Absolute Deviation (MAD):
value    22.050051
dtype: float64

Mean:
value    69.908598
dtype: float64

Median:
value    67.0
dtype: float64

Max:
value    140.73
dtype: float64

Min:
value    12.22
dtype: float64

Mode:
value    108.08
Name: 0, dtype: float64

Standard Deviation:
value    26.418165
dtype: float64

Variance:
value    697.919428
dtype: float64

Skewness:
value    0.244386
dtype: float64


In [7]:
# This script generates random data using numpy and processes it by grouping on columns for further analysis.
import numpy as np

# Function to generate random data for Weather, Food, Price, and Number columns
def generate_random_dataframe():
    """
    Generates a DataFrame with random data for the columns: Weather, Food, Price, and Number.
    Groups the data by the 'Weather' column and iterates through the groups (hot and cold).
    """
    try:
        # Generate random data
        np.random.seed(42)  # For reproducibility
        weather = np.random.choice(['hot', 'cold'], size=7)
        food = np.random.choice(['pizza', 'burger', 'icecream'], size=7)
        price = np.random.uniform(5, 7, size=7)
        number = np.random.randint(1, 7, size=7)

        # Create the DataFrame
        df = pd.DataFrame({
            'Weather': weather,
            'Food': food,
            'Price': price,
            'Number': number
        })
        
        return df
        
    except Exception as e:
        print(f"An error occurred: {e}")


In [8]:
# Groups the generated DataFrame by the 'Weather' column and prints each group.
try:
        df = generate_random_dataframe()

        if df is not None:
                # Generate the random DataFrame and group by 'Weather'
                weather_group = df.groupby('Weather')

                # Iterate through the groups and print each group's data
                for weather_type, group in weather_group:
                        print(f"\nGroup: {weather_type}")
                        print(group)

except Exception as e:
        print(f"An error occurred while processing the groups: {e}")



Group: cold
  Weather      Food     Price  Number
1    cold  icecream  6.202230       5
5    cold  icecream  6.664885       6

Group: hot
  Weather      Food     Price  Number
0     hot     pizza  6.732352       4
2     hot    burger  6.416145       1
3     hot  icecream  5.041169       4
4     hot  icecream  6.939820       2
6     hot  icecream  5.424678       5


In [9]:
# Groups the data by the 'Weather' column and performs aggregation on the groups.
# Print the first row, last row, and mean for each group
print("First row of each Weather group:")
print(weather_group.first())

print("\nLast row of each Weather group:")
print(weather_group.last())

print("\nMean values of each Weather group:")
print(weather_group.mean(numeric_only=True))



First row of each Weather group:
             Food     Price  Number
Weather                            
cold     icecream  6.202230       5
hot         pizza  6.732352       4

Last row of each Weather group:
             Food     Price  Number
Weather                            
cold     icecream  6.664885       6
hot      icecream  5.424678       5

Mean values of each Weather group:
            Price  Number
Weather                  
cold     6.433558     5.5
hot      6.110833     3.2


In [10]:
# Group the DataFrame by 'Weather' and 'Food'
try:
    # Generate the random DataFrame
    df = generate_random_dataframe()
    if df is not None:
        # Group the data by 'Weather' and 'Food'
        weather_food_group = df.groupby(['Weather', 'Food'])

        # Print the first row of each group
        print("First row of each Weather and Food group:")
        print(weather_food_group.first())

        # Print the last row of each group
        print("\nLast row of each Weather and Food group:")
        print(weather_food_group.last())

        # Print the mean values of each group for 'Price' and 'Number'
        print("\nMean values of each Weather and Food group:")
        print(weather_food_group.mean(numeric_only=True))
    
    else:
        print("Data generation failed.")

except Exception as e:
    # Handle any exceptions that may occur during the grouping or analysis
    print(f"An error occurred during grouping or analysis: {e}")


First row of each Weather and Food group:
                     Price  Number
Weather Food                      
cold    icecream  6.202230       5
hot     burger    6.416145       1
        icecream  5.041169       4
        pizza     6.732352       4

Last row of each Weather and Food group:
                     Price  Number
Weather Food                      
cold    icecream  6.664885       6
hot     burger    6.416145       1
        icecream  5.424678       5
        pizza     6.732352       4

Mean values of each Weather and Food group:
                     Price    Number
Weather Food                        
cold    icecream  6.433558  5.500000
hot     burger    6.416145  1.000000
        icecream  5.801889  3.666667
        pizza     6.732352  4.000000


In [11]:
# Aggregate to find mean and median for 'Number' and 'Price'
agg_results = weather_food_group.agg({
    'Number': ['mean', 'median'],
    'Price': ['mean', 'median']
})

# Displaying the aggregated results
print("Aggregation (mean and median) for 'Number' and 'Price' based on 'Weather' and 'Food':")
print(agg_results)


Aggregation (mean and median) for 'Number' and 'Price' based on 'Weather' and 'Food':
                    Number            Price          
                      mean median      mean    median
Weather Food                                         
cold    icecream  5.500000    5.5  6.433558  6.433558
hot     burger    1.000000    1.0  6.416145  6.416145
        icecream  3.666667    4.0  5.801889  5.424678
        pizza     4.000000    4.0  6.732352  6.732352


In [12]:
# Selects the first three rows of a DataFrame and concatenates them back with the original DataFrame.
try:
    # Generate the DataFrame
    df = generate_random_dataframe()

    # Select the first 3 rows
    selected_rows = df.iloc[:3]

    # Concatenate the selected rows back with the original DataFrame
    df_concat = pd.concat([df, selected_rows])

    # Display the concatenated DataFrame
    print("Original DataFrame with the first 3 rows concatenated back:")        
    print(df_concat)

except Exception as e:
    print(f"An error occurred while concatenating rows: {e}")
    raise


Original DataFrame with the first 3 rows concatenated back:
  Weather      Food     Price  Number
0     hot     pizza  6.732352       4
1    cold  icecream  6.202230       5
2     hot    burger  6.416145       1
3     hot  icecream  5.041169       4
4     hot  icecream  6.939820       2
5    cold  icecream  6.664885       6
6     hot  icecream  5.424678       5
0     hot     pizza  6.732352       4
1    cold  icecream  6.202230       5
2     hot    burger  6.416145       1


In [13]:
# This script selects the first 3 rows and the last 2 rows, then concatenates them.
# Select the first 3 rows of the DataFrame
selected_first_three = df.iloc[:3]

# Select the last 2 rows of the DataFrame
selected_last_two = df.iloc[-2:]

# Concatenate the selected rows together
concatenated_df = pd.concat([selected_first_three, selected_last_two], ignore_index=True)

# Print the result of the concatenation
print("Appended DataFrame with the first 3 rows and last 2 rows:")
print(concatenated_df)

Appended DataFrame with the first 3 rows and last 2 rows:
  Weather      Food     Price  Number
0     hot     pizza  6.732352       4
1    cold  icecream  6.202230       5
2     hot    burger  6.416145       1
3    cold  icecream  6.664885       6
4     hot  icecream  5.424678       5


In [14]:
# This function loads two datasets ('dest.csv' and 'tips.csv'), merges them on the 'EmpNr' column, and prints the resulting merged DataFrame.
try:
    # Load the two CSV files into DataFrames using the built-in pandas read_csv function
    dest_df = pd.read_csv('../data/Mod4/dest.csv')  # Loads destination dataset
    tips_df = pd.read_csv('../data/Mod4/tips.csv')  # Loads tips dataset

    # Merge the two datasets on the 'EmpNr' column
    merged_df = pd.merge(dest_df, tips_df, on='EmpNr')  # Performs the merge operation based on the common 'EmpNr' column

    # Print the result of the merge operation
    print("Merged DataFrame on 'EmpNr':")
    print(merged_df)

except Exception as e:
    # Handles any exception that may occur during the execution of the code
    print(f"An error occurred: {e}")  # Built-in function 'print' used to display error messages


Merged DataFrame on 'EmpNr':
   EmpNr       Dest  Amount
0      5  The Hague    10.0
1      9  Rotterdam     5.0


In [15]:
# Set 'EmpNr' as the index for both DataFrames before joining
dest_df.set_index('EmpNr', inplace=True)
tips_df.set_index('EmpNr', inplace=True)

# Set 'EmpNr' as the index for both DataFrames before joining
joined_df = dest_df.join(tips_df, how='inner')  # Use 'inner' for inner join

# Print the result of the join
print("Joined DataFrame on 'EmpNr':")
print(joined_df)


Joined DataFrame on 'EmpNr':
            Dest  Amount
EmpNr                   
5      The Hague    10.0
9      Rotterdam     5.0


In [16]:
# This script selects specific columns, checks for missing values, and replaces missing values in the WHO_first9cols.csv file.
# Select the first 3 rows for 'Country' and 'Net primary school enrolment ratio male (%)'
selected_data = who_data[['Country', 'Net primary school enrolment ratio male (%)']].head(3)

# Check for missing values (NaN) in the selected columns
missing_values = selected_data.isnull()

# Count the number of missing (NaN) values in each column
nan_count = selected_data.isnull().sum()

# Drop rows with missing values and store the non-missing data
non_missing_values = selected_data.dropna()

# Replace missing values with a scalar value (for example, -1)
replaced_data = selected_data.fillna(-1)

# Display the results using f-strings
print(f"Selected Data (First 3 Rows):\n{selected_data}")
print(f"\nMissing Values:\n{missing_values}")
print(f"\nNumber of NaN values:\n{nan_count}")
print(f"\nNon-missing values:\n{non_missing_values}")
print(f"\nData with missing values replaced:\n{replaced_data}")


Selected Data (First 3 Rows):
       Country  Net primary school enrolment ratio male (%)
0  Afghanistan                                          NaN
1      Albania                                         94.0
2      Algeria                                         96.0

Missing Values:
   Country  Net primary school enrolment ratio male (%)
0    False                                         True
1    False                                        False
2    False                                        False

Number of NaN values:
Country                                        0
Net primary school enrolment ratio male (%)    1
dtype: int64

Non-missing values:
   Country  Net primary school enrolment ratio male (%)
1  Albania                                         94.0
2  Algeria                                         96.0

Data with missing values replaced:
       Country  Net primary school enrolment ratio male (%)
0  Afghanistan                                         -1.0
1      Alb