In [1]:
"""
Author: Andres Melendez
Date: 2024-10-01
Description: This script filters earthquake data for events in Japan with a magnitude of 4.9 or greater using 
the 'mb' magnitude type, ensuring that the DataFrame includes the necessary columns.
"""

import pandas as pd

def filter_earthquakes_by_magnitude(df, location, min_magnitude, magnitude_type):
    """
    Filters earthquakes by a specific location keyword, magnitude type, and minimum magnitude.

    The 'mag' column is coerced to numeric (unparseable values become NaN and
    therefore never satisfy the magnitude test). The coercion is done on a copy,
    so the DataFrame passed in by the caller is left untouched.

    Args:
        df (pd.DataFrame): DataFrame containing earthquake data. Must provide the
            'parsed_place', 'mag' and 'magType' columns.
        location (str): The location to match exactly against 'parsed_place' (e.g., 'Japan').
        min_magnitude (float): The minimum magnitude (inclusive).
        magnitude_type (str): The magnitude type to filter by (e.g., 'mb').

    Returns:
        pd.DataFrame | None: Rows that meet all three criteria, with 'mag' as a
        numeric column; None if a required column is missing (an error message
        is printed in that case).
    """
    try:
        # Ensure the DataFrame contains necessary columns
        required_columns = ['parsed_place', 'mag', 'magType']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError("Required columns are missing from the DataFrame")

        # Coerce 'mag' on a copy: assign() returns a new frame, so the caller's
        # data is not modified as a side effect of filtering.
        numeric_df = df.assign(mag=pd.to_numeric(df['mag'], errors='coerce'))

        # Filter based on location, magnitude type, and magnitude value
        matches = (
            (numeric_df['parsed_place'] == location)
            & (numeric_df['magType'] == magnitude_type)
            & (numeric_df['mag'] >= min_magnitude)
        )
        return numeric_df[matches]

    except ValueError as e:
        # Capture and print errors related to missing columns
        print(f"Error: {e}")
        return None

    finally:
        # Runs on both the success and the error path
        print("Filtering process completed.")



In [2]:
# Read the raw earthquake catalogue from disk
file_path = 'data/Mod6/earthquakes.csv'
earthquake_data = pd.read_csv(file_path)

# Keep only the Japan events reported on the 'mb' scale with magnitude >= 4.9
japan_earthquakes = filter_earthquakes_by_magnitude(earthquake_data, 'Japan', 4.9, 'mb')

# A missing (None) or empty result gets a placeholder message;
# otherwise every matching row is printed in full
if japan_earthquakes is None or japan_earthquakes.empty:
    print("No data to display.")
else:
    print(japan_earthquakes.to_string())


Filtering process completed.
      mag magType           time                         place  tsunami parsed_place
1563  4.9      mb  1538977532250  293km ESE of Iwo Jima, Japan        0        Japan
2576  5.4      mb  1538697528010    37km E of Tomakomai, Japan        0        Japan
3072  4.9      mb  1538579732490     15km ENE of Hasaki, Japan        0        Japan
3632  4.9      mb  1538450871260    53km ESE of Hitachi, Japan        0        Japan


In [3]:
# Boolean mask for the rows whose magnitude was measured on the 'ml' scale
is_ml = earthquake_data['magType'] == 'ml'

# Coerce the magnitudes of the 'ml' rows to numbers (unparseable values become NaN);
# writing through .loc avoids SettingWithCopyWarning
earthquake_data.loc[is_ml, 'mag'] = pd.to_numeric(earthquake_data['mag'], errors='coerce')

# Width-1 intervals starting at 0 and ending one past the integer part of the
# largest 'ml' magnitude
largest_ml_mag = earthquake_data.loc[is_ml, 'mag'].max()
bins = pd.interval_range(start=0, end=int(largest_ml_mag) + 1, freq=1)

# Store each 'ml' row's interval in a new 'binned_mag' column of earthquake_data
earthquake_data.loc[is_ml, 'binned_mag'] = pd.cut(earthquake_data['mag'], bins=bins)

# Number of 'ml' earthquakes per interval, ordered by interval
ml_bins = earthquake_data.loc[is_ml, 'binned_mag']
binned_counts = ml_bins.value_counts().sort_index()

# Display the binned counts
print(binned_counts)


binned_mag
(0, 1]    2207
(1, 2]    3105
(2, 3]     862
(3, 4]     122
(4, 5]       2
(5, 6]       1
Name: count, dtype: int64


In [4]:
# Description: This script attempts to load the faang.csv file and displays the first few rows for data inspection.

def load_faang_data(file_path):
    """
    Read the FAANG CSV file and print its first rows for a quick inspection.

    Any failure while reading is reported with a printed message instead of
    being raised.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        pd.DataFrame | None: The loaded data, or None when the file could not
        be read (missing, empty, unparseable, or any other error).
    """
    try:
        loaded = pd.read_csv(file_path)
        print(loaded.head())
        return loaded
    except FileNotFoundError:
        print("Error: The file was not found at the specified path.")
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
    except pd.errors.ParserError:
        print("Error: There was an error parsing the file.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    # Only reached after one of the handlers above has reported a failure
    return None

# Sample usage: load the FAANG prices; load_faang_data prints the first rows
# and returns None (after printing an error) if the file cannot be read
file_path = 'data/Mod6/faang.csv'
faang_data = load_faang_data(file_path)



  ticker        date        high         low        open       close  \
0     FB  2018-01-02  181.580002  177.550003  177.679993  181.419998   
1     FB  2018-01-03  184.779999  181.330002  181.880005  184.669998   
2     FB  2018-01-04  186.210007  184.100006  184.899994  184.330002   
3     FB  2018-01-05  186.899994  184.929993  185.589996  186.850006   
4     FB  2018-01-08  188.899994  186.330002  187.199997  188.279999   

       volume  
0  18151900.0  
1  16886600.0  
2  13880900.0  
3  13574500.0  
4  17994700.0  


In [5]:
# Parse 'date' into datetimes and promote it to the index so the data can be resampled
faang_data = faang_data.assign(date=pd.to_datetime(faang_data['date'])).set_index('date')

# How each column is summarised within a month
monthly_rules = {
    'open': 'mean',   # Mean of the opening price
    'high': 'max',    # Maximum of the high price
    'low': 'min',     # Minimum of the low price
    'close': 'mean',  # Mean of the closing price
    'volume': 'sum',  # Sum of the volume traded
}

# Per ticker, resample to month-end frequency and aggregate; reset_index turns
# 'ticker' and 'date' back into ordinary columns
faang_monthly_agg = (
    faang_data.groupby('ticker')
    .resample('ME')
    .agg(monthly_rules)
    .reset_index()
)

# Display the first few rows of the aggregated result
print(faang_monthly_agg.head())


  ticker       date       open       high        low      close        volume
0   AAPL 2018-01-31  43.505357  45.025002  41.174999  43.501309  2.638718e+09
1   AAPL 2018-02-28  41.819079  45.154999  37.560001  41.909737  3.711577e+09
2   AAPL 2018-03-31  43.761786  45.875000  41.235001  43.624048  2.854911e+09
3   AAPL 2018-04-30  42.441310  44.735001  40.157501  42.458572  2.664617e+09
4   AAPL 2018-05-31  46.239091  47.592499  41.317501  46.384205  2.483905e+09


In [6]:
# Make sure magnitudes are numeric; anything that cannot be parsed becomes NaN
earthquake_data['mag'] = pd.to_numeric(earthquake_data['mag'], errors='coerce')

# Rows are the tsunami flag, columns the magnitude type, and each cell holds the
# largest magnitude observed for that combination
crosstab_result = pd.crosstab(
    earthquake_data['tsunami'],
    earthquake_data['magType'],
    values=earthquake_data['mag'],
    aggfunc='max',
)

# Print the crosstab result
print(crosstab_result)

magType   mb  mb_lg    md   mh   ml  ms_20    mw  mwb  mwr  mww
tsunami                                                        
0        5.6    3.5  4.11  1.1  4.2    NaN  3.83  5.8  4.8  6.0
1        6.1    NaN   NaN  NaN  5.1    5.7  4.41  NaN  NaN  7.5


In [7]:
# Number of consecutive rows in each rolling window
window_size = 60

# How each column is summarised within a window
rolling_rules = {
    'open': 'mean',    # Mean of the opening price
    'high': 'max',     # Maximum of the high price
    'low': 'min',      # Minimum of the low price
    'close': 'mean',   # Mean of the closing price
    'volume': 'sum',   # Sum of the volume traded
}

# Rolling windows are built separately for every ticker; reset_index turns the
# group key and the original index back into ordinary columns
per_ticker_windows = faang_data.groupby('ticker').rolling(window_size)
rolling_agg = per_ticker_windows.agg(rolling_rules).reset_index()

# Display the result
print(rolling_agg)

     ticker       date        open        high         low       close  \
0      AAPL 2018-01-02         NaN         NaN         NaN         NaN   
1      AAPL 2018-01-03         NaN         NaN         NaN         NaN   
2      AAPL 2018-01-04         NaN         NaN         NaN         NaN   
3      AAPL 2018-01-05         NaN         NaN         NaN         NaN   
4      AAPL 2018-01-08         NaN         NaN         NaN         NaN   
...     ...        ...         ...         ...         ...         ...   
1250   NFLX 2018-12-24  306.018001  386.799988  233.679993  303.239834   
1251   NFLX 2018-12-26  303.596001  386.799988  231.229996  301.232167   
1252   NFLX 2018-12-27  301.500334  386.799988  231.229996  299.134501   
1253   NFLX 2018-12-28  299.393001  380.929993  231.229996  297.116833   
1254   NFLX 2018-12-31  297.420168  380.000000  231.229996  295.293667   

           volume  
0             NaN  
1             NaN  
2             NaN  
3             NaN  
4          

In [8]:
# Columns to average: the OHLC prices and the traded volume
ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']

# One row per ticker, holding the mean of each of those columns
pivot_table = faang_data.pivot_table(index='ticker', values=ohlcv_columns, aggfunc='mean')

# Display the pivot table
print(pivot_table)

              close         high          low         open        volume
ticker                                                                  
AAPL      47.263357    47.748526    46.795877    47.277859  1.360803e+08
AMZN    1641.726176  1662.839839  1619.840519  1644.072709  5.648994e+06
FB       171.510956   173.613347   169.303148   171.472948  2.765860e+07
GOOG    1113.225134  1125.777606  1101.001658  1113.554101  1.741965e+06
NFLX     319.290319   325.219322   313.187330   319.620558  1.146962e+07


In [10]:
# Reload the FAANG prices from the CSV file
faang_file_path = 'data/Mod6/faang.csv'
faang_data = pd.read_csv(faang_file_path)

# Parse the 'date' column into datetimes so it can be compared against date bounds
faang_data['date'] = pd.to_datetime(faang_data['date'])

# Amazon (AMZN) rows dated within Q4 2018; both end dates are included
is_amzn = faang_data['ticker'] == 'AMZN'
in_q4_2018 = faang_data['date'].between('2018-10-01', '2018-12-31')
amzn_q4_2018 = faang_data[is_amzn & in_q4_2018]

# Z-score of every OHLC/volume column: distance from the column mean measured
# in units of the column's standard deviation
amzn_ohlcv = amzn_q4_2018[['open', 'high', 'low', 'close', 'volume']]
z_scores = amzn_ohlcv.apply(lambda col: col.sub(col.mean()).div(col.std()))

# Display the Z-scores
print(z_scores)

         open      high       low     close    volume
690  2.337813  2.368006  2.502113  2.385848 -1.630411
691  2.190795  2.227302  2.247433  2.155037 -0.861879
692  2.068570  2.058955  2.139987  2.025489 -0.920345
693  1.850048  1.819474  1.781561  1.722816 -0.126582
694  1.642819  1.628173  1.554416  1.584748 -0.298771
..        ...       ...       ...       ...       ...
748 -2.179582 -2.159820 -2.187566 -2.226185 -0.141238
749 -2.026617 -1.611714 -1.810493 -1.339674  1.123063
750 -1.456521 -1.641276 -1.626703 -1.404343  0.849827
751 -1.328549 -1.325261 -1.231588 -1.289951  0.496102
752 -1.078283 -1.273456 -0.975763 -1.122691 -0.246405

[63 rows x 5 columns]


In [11]:
# Build the FB event table, parse its dates, and key it by (date, ticker)
events_data = (
    pd.DataFrame({
        'ticker': ['FB', 'FB', 'FB'],
        'date': ['2018-07-25', '2018-03-19', '2018-03-20'],
        'event': ['Disappointing user growth announced after close.',
                  'Cambridge Analytica story',
                  'FTC investigation']
    })
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index(['date', 'ticker'])
)

# Key the FAANG prices the same way so both frames can be joined on their index
faang_data = faang_data.set_index(['date', 'ticker'])

# Outer join on the shared (date, ticker) index, then move the index levels back
# into ordinary columns so the result is easier to view
merged_data = faang_data.merge(
    events_data, how='outer', left_index=True, right_index=True
).reset_index()

# Display the merged data
print(merged_data)

           date ticker         high          low         open        close  \
0    2018-01-02   AAPL    43.075001    42.314999    42.540001    43.064999   
1    2018-01-02   AMZN  1190.000000  1170.510010  1172.000000  1189.010010   
2    2018-01-02     FB   181.580002   177.550003   177.679993   181.419998   
3    2018-01-02   GOOG  1066.939941  1045.229980  1048.339966  1065.000000   
4    2018-01-02   NFLX   201.649994   195.419998   196.100006   201.070007   
...         ...    ...          ...          ...          ...          ...   
1250 2018-12-31   AAPL    39.840000    39.119999    39.632500    39.435001   
1251 2018-12-31   AMZN  1520.760010  1487.000000  1510.800049  1501.969971   
1252 2018-12-31     FB   134.639999   129.949997   134.449997   131.089996   
1253 2018-12-31   GOOG  1052.699951  1023.590027  1050.959961  1035.609985   
1254 2018-12-31   NFLX   270.100006   260.000000   260.160004   267.660004   

           volume event  
0     102223600.0   NaN  
1       269

In [12]:
# Load the FAANG data, parse the dates, and order the rows by ticker and then date
faang_file_path = 'data/Mod6/faang.csv'  # Replace with your local file path
faang_data = (
    pd.read_csv(faang_file_path)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['ticker', 'date'])
)

# Only the OHLC and volume columns are rescaled
ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']

# Within each ticker, divide every value by that ticker's first row (its earliest
# date after the sort above), then attach date and ticker again for reference
faang_transformed = (
    faang_data.groupby('ticker')[ohlcv_columns]
    .transform(lambda series: series / series.iloc[0])
    .assign(date=faang_data['date'], ticker=faang_data['ticker'])
)

# Display the transformed data
print(faang_transformed)


          open      high       low     close    volume       date ticker
251   1.000000  1.000000  1.000000  1.000000  1.000000 2018-01-02   AAPL
252   1.013928  1.013059  1.015952  0.999826  1.155033 2018-01-03   AAPL
253   1.013987  1.006790  1.016661  1.004470  0.877864 2018-01-04   AAPL
254   1.019276  1.017818  1.022392  1.015906  0.925814 2018-01-05   AAPL
255   1.024624  1.019211  1.027591  1.012133  0.804816 2018-01-08   AAPL
...        ...       ...       ...       ...       ...        ...    ...
999   1.234064  1.242995  1.195783  1.163177  0.870583 2018-12-24   NFLX
1000  1.192861  1.262088  1.183246  1.261600  1.313288 2018-12-26   NFLX
1001  1.275421  1.267493  1.228636  1.271050  1.115648 2018-12-27   NFLX
1002  1.315349  1.298835  1.278272  1.273586  1.002362 2018-12-28   NFLX
1003  1.326670  1.339450  1.330468  1.331178  1.231788 2018-12-31   NFLX

[1255 rows x 7 columns]
